Python regex 模块,sub() 实例源码

我们从Python开源项目中,提取了以下49个代码示例,用于说明如何使用regex.sub()

项目:DL2W    作者:gauravmm    | 项目源码 | 文件源码
def extract_video_titles(input_file, output_file):
    r"""Load a {video_id: title} JSON file, strip Unicode punctuation and
    symbols from each title, split titles into word lists, and dump the
    result to *output_file*.

    NOTE(review): \p{P}/\p{S} are Unicode property classes supported only by
    the third-party `regex` module; stdlib `re` raises "bad escape \p".
    This assumes a module-level `import regex as re` — confirm.
    """
    with open(input_file, encoding='utf-8') as file:
        titles = json.load(file)

    count = 0

    for video_id, title in titles.items():
        # Remove all punctuation and symbols in unicode
        title = re.sub(r'[\p{P}\p{S}]+', '', title)
        titles[video_id] = title.split(' ')

        count += 1
        # end='\r' keeps the progress counter on a single terminal line
        print('{}: {}'.format(count, video_id), end='\r', flush=True)

    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(titles, file)
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_bug_1140(self):
        """regex.sub() must preserve the type of the *target* string: a str
        target yields str, a unicode target yields unicode, regardless of
        pattern/replacement types.

        NOTE(review): Python 2 only — relies on the `unicode` builtin.
        """
        # regex.sub(x, y, u'') should return u'', not '', and
        # regex.sub(x, y, '') should return '', not u''.
        # Also:
        # regex.sub(x, y, unicode(x)) should return unicode(y), and
        # regex.sub(x, y, str(x)) should return
        #     str(y) if isinstance(y, str) else unicode(y).
        for x in 'x', u'x':
            for y in 'y', u'y':
                z = regex.sub(x, y, u'')
                self.assertEqual((type(z), z), (unicode, u''))
                z = regex.sub(x, y, '')
                self.assertEqual((type(z), z), (str, ''))
                z = regex.sub(x, y, unicode(x))
                self.assertEqual((type(z), z), (unicode, unicode(y)))
                z = regex.sub(x, y, str(x))
                self.assertEqual((type(z), z), (type(y), y))
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_symbolic_refs(self):
        r"""Malformed \g<...> group references in a replacement template raise
        regex.error (IndexError for an unknown group name); valid-but-unmatched
        groups substitute the empty string, Perl-style.
        """
        self.assertRaisesRegex(regex.error, self.MISSING_GT, lambda:
          regex.sub('(?P<a>x)', r'\g<a', 'xx'))
        self.assertRaisesRegex(regex.error, self.MISSING_GROUP_NAME, lambda:
          regex.sub('(?P<a>x)', r'\g<', 'xx'))
        self.assertRaisesRegex(regex.error, self.MISSING_LT, lambda:
          regex.sub('(?P<a>x)', r'\g', 'xx'))
        self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda:
          regex.sub('(?P<a>x)', r'\g<a a>', 'xx'))
        self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda:
          regex.sub('(?P<a>x)', r'\g<1a1>', 'xx'))
        self.assertRaisesRegex(IndexError, self.UNKNOWN_GROUP_I, lambda:
          regex.sub('(?P<a>x)', r'\g<ab>', 'xx'))

        # The new behaviour of unmatched but valid groups is to treat them like
        # empty matches in the replacement template, like in Perl.
        self.assertEqual(regex.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
        self.assertEqual(regex.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')

        # The old behaviour was to raise it as an IndexError.
        self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda:
          regex.sub('(?P<a>x)', r'\g<-1>', 'xx'))
项目:DrQA    作者:facebookresearch    | 项目源码 | 文件源码
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    # Same pipeline as lower -> strip punctuation -> drop articles -> squeeze
    # whitespace, expressed as a flat sequence instead of nested helpers.
    punct = set(string.punctuation)
    text = ''.join(ch for ch in s.lower() if ch not in punct)
    # Articles become spaces so neighbouring words do not fuse together.
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())
项目:DrQA_cn    作者:AmoseKang    | 项目源码 | 文件源码
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def drop_articles(t):
        # Replace with a space so neighbouring words do not fuse together.
        return re.sub(r'\b(a|an|the)\b', ' ', t)

    def squeeze_spaces(t):
        return ' '.join(t.split())

    def strip_punct(t):
        return t.translate(str.maketrans('', '', string.punctuation))

    return squeeze_spaces(drop_articles(strip_punct(s.lower())))
项目:toshi-services-lib    作者:toshiapp    | 项目源码 | 文件源码
def prepare(self):
        """Per-request hook: debug-log the request, then enforce HTTPS when
        the app config requests it ('reject' -> 404, anything else ->
        permanent redirect to the https URL).

        NOTE(review): in 'reject' mode finish() is called but control still
        falls through to super().prepare() below — confirm intentional.
        """

        # log the full request and headers if the log level is set to debug
        if log.level == 10:  # 10 == logging.DEBUG
            log.debug("Preparing request: {} {}".format(self.request.method, self.request.path))
            for k, v in self.request.headers.items():
                log.debug("{}: {}".format(k, v))

        # Behind a TLS-terminating proxy, trust the proxy-supplied scheme.
        if 'X-Forwarded-Proto' in self.request.headers:
            proto = self.request.headers['X-Forwarded-Proto']
        else:
            proto = self.request.protocol
        if proto != 'https' and 'enforce_https' in self.application.config['general']:
            mode = self.application.config['general']['enforce_https']
            if mode == 'reject':
                self.set_status(404)
                self.finish()
            else:
                # default to redirect
                # Swap the URL scheme (everything before the first ':') for https.
                self.redirect(regex.sub(r'^([^:]+)', 'https', self.request.full_url()), permanent=True)

        return super().prepare()
项目:bubblesub    作者:rr-    | 项目源码 | 文件源码
def eval_expr(expr):
    """Safely evaluate an arithmetic expression string into a Fraction.

    Supports + - * / ** (power), ^ (xor) and unary minus over numeric
    literals, via an AST whitelist — no eval() of arbitrary code.

    Raises TypeError (unsupported node) or KeyError (unsupported operator).
    """
    import ast
    import fractions  # previously relied on a module-level import
    import operator

    # Whitelist of AST operator nodes -> callables. The original rebound the
    # name `op` (the operator module) to this dict, which was confusing.
    ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.Pow: operator.pow,
        ast.BitXor: operator.xor,
        ast.USub: operator.neg,
    }

    def _eval(node):
        # ast.Constant replaced ast.Num (removed in Python 3.12); restrict to
        # actual numbers so strings/None literals are still rejected.
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return fractions.Fraction(node.value)
        if isinstance(node, ast.BinOp):
            return ops[type(node.op)](_eval(node.left), _eval(node.right))
        if isinstance(node, ast.UnaryOp):
            return ops[type(node.op)](_eval(node.operand))
        raise TypeError(node)

    return _eval(ast.parse(str(expr), mode='eval').body)
项目:joyodb    作者:leoboiko    | 项目源码 | 文件源码
def test_against_kanjidic(self):
        """Cross-check loaded joyo readings against the KANJIDIC file.

        Builds {kanji: kanjidic reading fields} for the kanji under test,
        then asserts every non-variation reading appears in kanjidic, minus
        the known gaps listed in KANJIDIC_MISSING_READINGS.
        """
        kanjidic_data = {}
        with open(kanjidic_file, 'rt') as f:
            for line in f:
                kanji, *fields = line.strip().split()

                if kanji in TestLoadedData.kanjis.keys():
                    # kanjidic marks bound affixes with '-', but we don't
                    fields = [re.sub('-$', '', f) for f in fields]
                    fields = [re.sub('^-', '', f) for f in fields]

                    kanjidic_data[kanji] = fields

        for kanji in joyodb.loaded_data.kanjis:
            for reading in kanji.readings:
                if reading.variation_of:
                    continue # variations are not in kanjidic
                if (kanji.kanji, reading.reading) not in KANJIDIC_MISSING_READINGS:
                    self.assertIn(reading.reading, kanjidic_data[kanji.kanji])
项目:transformer    作者:Kyubyong    | 项目源码 | 文件源码
def make_vocab(fpath, fname):
    '''Constructs vocabulary.

    Args:
      fpath: A string. Input file path.
      fname: A string. Output file name.

    Writes vocabulary line by line to `preprocessed/fname`:
    one "word<TAB>count" pair per line, most frequent first, preceded by the
    special tokens <PAD>, <UNK>, <S>, </S> with huge counts so they stay at
    the top of any frequency-sorted view.
    '''
    text = codecs.open(fpath, 'r', 'utf-8').read()
    # Fix: raw string. The original non-raw "[^\s\p{Latin}']" relied on the
    # invalid string escapes \s and \p passing through unchanged, which is a
    # SyntaxWarning on Python 3.12+ (and a future error). \p{Latin} is a
    # regex-module Unicode property class (keep only Latin letters,
    # whitespace and apostrophes).
    text = regex.sub(r"[^\s\p{Latin}']", "", text)
    words = text.split()
    word2cnt = Counter(words)
    if not os.path.exists('preprocessed'): os.mkdir('preprocessed')
    with codecs.open('preprocessed/{}'.format(fname), 'w', 'utf-8') as fout:
        fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("<PAD>", "<UNK>", "<S>", "</S>"))
        for word, cnt in word2cnt.most_common(len(word2cnt)):
            fout.write(u"{}\t{}\n".format(word, cnt))
项目:ShadowSocksShare-OpenShift    作者:the0demiurge    | 项目源码 | 文件源码
def request_url(url, headers=None, name=''):
    r"""Download a subscription page and extract ss:// / ssr:// share links.

    Returns (servers, info): a list of parsed server entries and page
    metadata. On page-level failure returns ([], {...}) with the error in
    'message'.

    NOTE(review): verify=False disables TLS certificate validation.
    NOTE(review): '\s' is a non-raw string here — works today but is an
    invalid string escape (SyntaxWarning on Python 3.12+); should be r'\s'.
    """
    print('req', url)

    data = set()
    servers = list()
    try:
        response = requests.get(url, headers=headers, verify=False).text
        # Collect unique share links, stripping embedded whitespace from each.
        data.update(map(lambda x: re.sub('\s', '', x), re.findall('ssr?://[a-zA-Z0-9=]+', response)))
        soup = BeautifulSoup(response, 'html.parser')
        title = soup.find('title').text

        info = {'message': '', 'url': url, 'name': str(title)}
        for i, server in enumerate(data):
            try:
                servers.append(parse(server, ' '.join([title, name, str(i)])))
            except Exception as e:
                # Best-effort: one bad server entry shouldn't abort the page.
                logging.exception(e, stack_info=False)
                print('URL:', url, 'SERVER', server)
    except Exception as e:
        print(url)
        logging.exception(e, stack_info=False)
        return [], {'message': str(e), 'url': '', 'name': ''}
    return servers, info
项目:siamese_sentiment    作者:jcavalieri8619    | 项目源码 | 文件源码
def generate_char_list(string, strip_html=True):
    """Return the characters of *string* (lowercased, optionally HTML-stripped,
    whitespace-normalized) that are word characters or selected punctuation."""
    text = string.lower()
    if strip_html:
        text = strip_html_tags(text)
    # Any run of whitespace becomes a single space before extraction.
    squeezed = regex.sub(r'\s+', r' ', text)
    return regex.findall(r"\w|[?!'#@$:\"&*=,]", squeezed)
项目:siamese_sentiment    作者:jcavalieri8619    | 项目源码 | 文件源码
def generate_word_list(string, strip_html=True):
    """Tokenize *string* into lowercase words, keeping '!!!', '??', '....'
    style punctuation runs as separate tokens (they carry sentiment beyond
    their grammatical meaning)."""
    text = string.lower()
    if strip_html:
        text = strip_html_tags(text)

    # Any run of whitespace becomes a single space before tokenizing.
    squeezed = regex.sub(r"\s+", r' ', text)

    # words (optionally with an internal apostrophe), runs of ! or ?, 3+ dots
    return regex.findall(r"\b\w+[']?\w*\b|\!+|\?+|\.{3,}", squeezed)
项目:siamese_sentiment    作者:jcavalieri8619    | 项目源码 | 文件源码
def strip_html_tags(string, verbose=False):
    """Replace every <...> tag (non-greedy match) in *string* with a space."""
    return regex.sub(r'<.*?>', ' ', string)
项目:whaaaaat    作者:finklabs    | 项目源码 | 文件源码
def remove_ansi_escape_sequences(text):
    """Strip ANSI/CSI escape sequences and trailing-space+CR line artifacts."""
    # http://stackoverflow.com/questions/14693701/how-can-i-remove-the-ansi-escape-sequences-from-a-string-in-python
    # also clean up the line endings
    ansi_or_cr = r'(\x9b|\x1b\[)[0-?]*[ -\/]*[@-~]|\ *\r'
    return regex.sub(ansi_or_cr, '', text)


# helper for running sut as subprocess within pty
# does two things
# * test app running in pty in subprocess
# * get test coverage from subprocess

# docu:
# http://blog.fizyk.net.pl/blog/gathering-tests-coverage-for-subprocesses-in-python.html
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_basic_regex_sub(self):
        r"""Smoke-test regex.sub(): count limit, callable replacements,
        \g<name>/\g<number> references, and escape handling in both raw and
        non-raw pattern/replacement strings.

        NOTE(review): Python 2 only — the ur"..." literals below are a
        SyntaxError on Python 3.
        """
        self.assertEqual(regex.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
        self.assertEqual(regex.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
          '9.3 -3 24x100y')
        # Third positional argument limits the number of substitutions.
        self.assertEqual(regex.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
          '9.3 -3 23x99y')

        # A callable's return value is literal; a string template is parsed.
        self.assertEqual(regex.sub('.', lambda m: r"\n", 'x'), "\\n")
        self.assertEqual(regex.sub('.', r"\n", 'x'), "\n")

        self.assertEqual(regex.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
        self.assertEqual(regex.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
        self.assertEqual(regex.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'),
          'xxxx')
        self.assertEqual(regex.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

        # Known escapes expand; unknown ones stay literal backslash+char.
        self.assertEqual(regex.sub('a', r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D',
          'a'), "\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D")
        self.assertEqual(regex.sub('a', '\t\n\v\r\f\a', 'a'), "\t\n\v\r\f\a")
        self.assertEqual(regex.sub('a', '\t\n\v\r\f\a', 'a'), chr(9) + chr(10)
          + chr(11) + chr(13) + chr(12) + chr(7))

        self.assertEqual(regex.sub(r'^\s*', 'X', 'test'), 'Xtest')

        # Unicode escapes expand in unicode replacement templates...
        self.assertEqual(regex.sub(ur"x", ur"\x0A", u"x"), u"\n")
        self.assertEqual(regex.sub(ur"x", ur"\u000A", u"x"), u"\n")
        self.assertEqual(regex.sub(ur"x", ur"\U0000000A", u"x"), u"\n")
        self.assertEqual(regex.sub(ur"x", ur"\N{LATIN CAPITAL LETTER A}",
          u"x"), u"A")

        # ...but stay literal in byte-string templates (except \xNN).
        self.assertEqual(regex.sub(r"x", r"\x0A", "x"), "\n")
        self.assertEqual(regex.sub(r"x", r"\u000A", "x"), "\\u000A")
        self.assertEqual(regex.sub(r"x", r"\U0000000A", "x"),
          "\\U0000000A")
        self.assertEqual(regex.sub(r"x", r"\N{LATIN CAPITAL LETTER A}",
          "x"), "\\N{LATIN CAPITAL LETTER A}")
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_bug_449964(self):
        r"""A group reference followed by another escape (\b) must expand
        both, at every substitution site."""
        # Fails for group followed by other escape.
        self.assertEqual(regex.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'),
          "xx\bxx\b")
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_bug_449000(self):
        r"""sub() yields the same result whether pattern/replacement escapes
        are raw (r'\r\n') or already-expanded ('\r\n') strings."""
        # Test for sub() on escaped characters.
        self.assertEqual(regex.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
          "abc\ndef\n")
        self.assertEqual(regex.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
          "abc\ndef\n")
        self.assertEqual(regex.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
          "abc\ndef\n")
        self.assertEqual(regex.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
          "abc\ndef\n")
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_bug_114660(self):
        r"""Backreference replacement collapses the whitespace run between two
        non-space tokens into a single space."""
        self.assertEqual(regex.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello  there'),
          'hello there')
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_bug_462270(self):
        """Empty-match substitution semantics: V0 substitutes once per
        position; V1 additionally substitutes around consumed 'x' runs."""
        # Test for empty sub() behaviour, see SF bug #462270
        self.assertEqual(regex.sub('(?V0)x*', '-', 'abxd'), '-a-b-d-')
        self.assertEqual(regex.sub('(?V1)x*', '-', 'abxd'), '-a-b--d-')
        self.assertEqual(regex.sub('x+', '-', 'abxd'), 'ab-d')
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_dollar_matches_twice(self):
        r"""'$' matches at end-of-string and just before a terminating
        newline; with MULTILINE it also matches before every newline."""
        # $ matches the end of string, and just before the terminating \n.
        pattern = regex.compile('$')
        self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
        self.assertEqual(pattern.sub('#', '\n'), '#\n#')

        pattern = regex.compile('$', regex.MULTILINE)
        self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#')
        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
        self.assertEqual(pattern.sub('#', '\n'), '#\n#')
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_unmatched_in_sub(self):
        r"""Unmatched optional groups expand to '' in the template; V1's
        empty-match behaviour adds an extra trailing substitution pass."""
        # Issue 1519638.
        self.assertEqual(regex.sub(r"(?V0)(x)?(y)?", r"\2-\1", "xy"), 'y-x')
        self.assertEqual(regex.sub(r"(?V1)(x)?(y)?", r"\2-\1", "xy"), 'y-x-')
        self.assertEqual(regex.sub(r"(?V0)(x)?(y)?", r"\2-\1", "x"), '-x')
        self.assertEqual(regex.sub(r"(?V1)(x)?(y)?", r"\2-\1", "x"), '-x-')
        self.assertEqual(regex.sub(r"(?V0)(x)?(y)?", r"\2-\1", "y"), 'y-')
        self.assertEqual(regex.sub(r"(?V1)(x)?(y)?", r"\2-\1", "y"), 'y--')
项目:turboparser-semafor    作者:ReutersMedia    | 项目源码 | 文件源码
def split_if_contraction(self, word):
        """Expand French preposition+determiner contractions (au -> "à le",
        du -> "de le", des -> "de les", ...); return *word* unchanged if it
        is not a contraction.

        NOTE(review): Python 2 only — ur'...' literals are a SyntaxError on
        Python 3. Also note "des" is expanded unconditionally even though it
        often is the plural indefinite article rather than de+les — confirm
        this is the intended tokenizer behaviour.
        """
        # Handle preposition+determiner contractions.
        word = regex.sub(ur'^(A|a)u$', ur'à le', word)
        word = regex.sub(ur'^(A|a)uquel$', ur'à lequel', word)
        word = regex.sub(ur'^(A|a)ux$', ur'à les', word)
        word = regex.sub(ur'^(A|a)uxquels$', ur'à lesquels', word)
        word = regex.sub(ur'^(A|a)uxquelles$', ur'à lesquelles', word)
        word = regex.sub(ur'^(D|d)u$', ur'de le', word)
        word = regex.sub(ur'^(D|d)uquel$', ur'de lequel', word)
        word = regex.sub(ur'^(D|d)es$', ur'de les', word)
        word = regex.sub(ur'^(D|d)esquels$', ur'de lesquels', word)
        word = regex.sub(ur'^(D|d)esquelles$', ur'de lesquelles', word)

        return word
项目:chicksexer    作者:kensk8er    | 项目源码 | 文件源码
def _clean_characters(self, characters):
        """Clean characters (e.g. convert \t to a space)."""
        text = characters.lower() if self._lower else characters
        # Tabs, whitespace runs and zero-width joiners all become one space.
        text = regex.sub(r'\t|\s+|\u200d', ' ', text)
        # Normalize backtick and en-dash to their ASCII equivalents.
        text = regex.sub(r'`', "'", text)
        return regex.sub(r'–', "-", text)
项目:backrefs    作者:facelessuser    | 项目源码 | 文件源码
def compile_replace(pattern, repl, flags=0):
        """Construct a method that can be used as a replace method for `sub`, `subn`, etc.

        Accepts a compiled regex `pattern` and a replacement that is either a
        string template, a precompiled Replace, or a ReplaceTemplate, and
        returns a Replace callable. Raises ValueError for flag/pattern
        mismatches and TypeError for unsupported input types.
        """

        call = None
        if pattern is not None and isinstance(pattern, REGEX_TYPE):
            if isinstance(repl, (compat.string_type, compat.binary_type)):
                # String template: parse it against the pattern now,
                # honouring the FORMAT flag.
                repl = ReplaceTemplate(pattern, repl, bool(flags & FORMAT))
                call = Replace(
                    functools.partial(_apply_replace_backrefs, repl=repl), repl.use_format, repl.pattern_hash
                )
            elif isinstance(repl, Replace):
                if flags:
                    raise ValueError("Cannot process flags argument with a compiled pattern!")
                # Guard against pairing a precompiled replace with a
                # different pattern than it was built from.
                if repl.pattern_hash != hash(pattern):
                    raise ValueError("Pattern hash doesn't match hash in compiled replace!")
                call = repl
            elif isinstance(repl, ReplaceTemplate):
                if flags:
                    raise ValueError("Cannot process flags argument with a ReplaceTemplate!")
                call = Replace(
                    functools.partial(_apply_replace_backrefs, repl=repl), repl.use_format, repl.pattern_hash
                )
            else:
                raise TypeError("Not a valid type!")
        else:
            raise TypeError("Pattern must be a compiled regular expression!")
        return call

    # Convenience methods like re has, but slower due to overhead on each call.
    # It is recommended to use compile_search and compile_replace
项目:backrefs    作者:facelessuser    | 项目源码 | 文件源码
def sub(pattern, repl, string, count=0, flags=0, pos=None, endpos=None, concurrent=None, **kwargs):
        """Wrapper for `sub`.

        Compiles `pattern` with backrefs support and pre-compiles a string or
        Replace-object `repl` before delegating to regex.sub; all remaining
        arguments are passed through unchanged.
        """

        is_replace = _is_replace(repl)
        is_string = isinstance(repl, (compat.string_type, compat.binary_type))
        # Format-style replaces belong to subf(), not sub().
        if is_replace and repl.use_format:
            raise ValueError("Compiled replace cannot be a format object!")

        pattern = compile_search(pattern, flags)
        return regex.sub(
            pattern, (compile_replace(pattern, repl) if is_replace or is_string else repl), string,
            count, flags, pos, endpos, concurrent, **kwargs
        )
项目:backrefs    作者:facelessuser    | 项目源码 | 文件源码
def subf(pattern, format, string, count=0, flags=0, pos=None, endpos=None, concurrent=None, **kwargs):  # noqa B002
        """Wrapper for `subf`.

        Like sub(), but the replacement is a format-style template; a
        precompiled Replace must have been built in format mode. All other
        arguments pass through to regex.sub unchanged.
        """

        is_replace = _is_replace(format)
        is_string = isinstance(format, (compat.string_type, compat.binary_type))
        if is_replace and not format.use_format:
            raise ValueError("Compiled replace is not a format object!")

        pattern = compile_search(pattern, flags)
        # String templates are compiled here with the FORMAT flag set.
        rflags = FORMAT if is_string else 0
        return regex.sub(
            pattern, (compile_replace(pattern, format, flags=rflags) if is_replace or is_string else format), string,
            count, flags, pos, endpos, concurrent, **kwargs
        )
项目:docximport-sigil-plugin    作者:dougmassay    | 项目源码 | 文件源码
def build_html(fragment, css=False):
    """Wrap an HTML fragment in the document template and serialize to XHTML.

    Args:
        fragment: HTML fragment string.
        css: when True, include the stylesheet link (LINK_TEXT) in the head.
    """
    # Keep empty paragraphs visible by injecting a non-breaking space.
    # Bug fix: the original pattern was <p([^>])*></p>, whose group captures
    # only the LAST attribute character, so <p class="x"></p> was rewritten
    # to the broken <p">...; capture the whole attribute run instead.
    fragment = regex.sub(r'<p([^>]*)></p>', r'<p\1>&#160;</p>', fragment)
    css_link = ''
    if css:
        css_link = LINK_TEXT
    new = HTML.format(css_link, fragment)
    soup = gumbo_bs4.parse(new)
    return soup.serialize_xhtml()
项目:concernCapture    作者:ctfu    | 项目源码 | 文件源码
def splitWord(str):
    """Strip non-letters, then split a camelCase identifier at case
    boundaries and return the list of word pieces.

    NOTE(review): flags=re.V1 exists only in the third-party `regex` module —
    this assumes `import regex as re`; stdlib re has no V1 attribute.
    NOTE(review): the parameter shadows the builtin `str`; the third
    alternative `(?<=[a-z]A)(?=[A-Z])` looks like a typo — confirm intent.
    """
    str = re.sub("[^A-Za-z]", "", str)
    words = re.split(r'(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|(?<=[a-z]A)(?=[A-Z])', str, flags=re.V1)
    return words
项目:Price-Comparator    作者:Thejas-1    | 项目源码 | 文件源码
def remove_punctuation(self, text):
        ''' Get rid of punctuation except apostrophes '''
        # [^\P{P}\'] = "any punctuation character except the apostrophe"
        # (double negation of the Unicode P property). \P{..} is supported
        # only by the third-party `regex` module — NOTE(review): confirm
        # `re` is `import regex as re`; stdlib re raises "bad escape \P".
        return re.sub(r"[^\P{P}\']+", "", text)
项目:tailchaser    作者:thanos    | 项目源码 | 文件源码
def slugify(value):
    """
    Parameters
    ----------
    value: str
        the value to slug
    Convert spaces to hyphens.
    Remove characters that aren't alphanumerics, underscores, or hyphens.
    Convert to lowercase. Also strip leading and trailing whitespace.
    """
    # Fix: raw strings. The originals relied on the invalid string escapes
    # \w and \s passing through non-raw literals — a SyntaxWarning on
    # Python 3.12+ and a future error. The resulting patterns are identical.
    value = regex.sub(r'[^\w\s-]', '', value).strip().lower()
    return regex.sub(r'[-\s]+', '-', value)
项目:neteasenews    作者:tricomm    | 项目源码 | 文件源码
def jsonFormat(year=2014, month=1, day=1, newsType=0):
    """Fetch the NetEase news JSON blob for a date/type and flatten it into
    rows of [date, time, site_url, category, link, title].

    Returns an empty list when the payload is missing or unparseable.
    """
    text = getJson(year, month, day, newsType)
    returnValue = list()
    # Payload is JSONP-ish: "var data={...};" — unwrap it first.
    if text.startswith('var data='):
        # Bug fix: lstrip('var data=') strips a character *set*, not the
        # prefix; slice the prefix off instead. Then drop the trailing ';',
        # newlines, empty arrays and duplicated commas.
        tmp = re.sub(',*,', ',', text[len('var data='):].rstrip(';').replace('\n', '').replace(',[]', ''))
        # Bug fix: the original compared ints with `is`/`is not`, which only
        # works via CPython's small-int caching; use ==/!= throughout.
        if newsType != 0:
            # Quote bare object keys so the blob becomes valid JSON.
            tmp = re.sub(r'(,|\{)([a-z]*?)(:)', r'\1"\2"\3', tmp)
            tmp = re.sub(r'(\[),(\{)', r'\1\2', tmp.replace('\\', '/'))
        try:
            tmpValue = json.loads(tmp, strict=False)
        except ValueError:
            # Narrowed from a bare except: json.loads signals malformed
            # input with ValueError (JSONDecodeError).
            return list()
        childClassification = getChildClassification(tmpValue[u'category'])
        if newsType == 1:
            valuelist = tmpValue[u'ent']
        else:
            valuelist = tmpValue[u'news']
        for list0 in valuelist:
            for list1 in list0:
                if list1 is not None:
                    # Skip photo galleries and blog posts.
                    if list1[u'l'].find('photoview') == -1 and list1[u'l'].find('blog') == -1:
                        returnValue.append(
                            [list1[u'p'].split()[0], list1[u'p'].split()[1], getSiteURL(newsType)[0],
                             childClassification[list1[u'c']], list1[u'l'],
                             list1[u't']])
        # The original del/gc.collect() block had no effect on the result;
        # locals are reclaimed on return anyway.
    return returnValue
项目:neteasenews    作者:tricomm    | 项目源码 | 文件源码
def getnews(URL):
    """Fetch a NetEase article page and return the concatenated text of the
    <p> paragraphs inside <div id="endText">, one paragraph per line.
    """
    date = str()
    html = networkExceptionCatch(URL)
    soup = BeautifulSoup(html, 'html.parser')
    alls = soup.find_all('div', id="endText")
    for div in alls:
        # Bug fix: the original passed the bs4 Tag itself to re.sub(), which
        # raises TypeError (re expects a string); remove embedded <script>
        # elements with BeautifulSoup instead.
        for script in div.find_all('script'):
            script.decompose()
        p_in_div = div.find_all('p')
        # (The original `len(p_in_div) is 0` branch also compared ints with
        # `is` and called re.sub on a ResultSet — an unconditional TypeError —
        # so it has been dropped.)
        for p_tag in p_in_div:
            if p_tag.text is not None:
                date += p_tag.text + u'\n'
    return date
项目:stashpy    作者:afroisalreadyinu    | 项目源码 | 文件源码
def sub_pattern(self, match):
        """regex.sub callback: expand one %{NAME:output:type} grok reference.

        Looks the named pattern up in GROK_PATTERNS, optionally wraps it in a
        named capture group, records the requested output type, and
        recursively expands grok references nested inside the substituted
        pattern.

        NOTE(review): __builtins__ is a dict only in __main__; in an imported
        module it is a module object, so __builtins__[pattern_type] would
        raise TypeError there — confirm how this module is executed.
        """
        match_dict = match.groupdict()
        pattern = GROK_PATTERNS[match_dict['pattern_name']]
        pattern_output_raw = match_dict['pattern_output']
        pattern_type_raw = match_dict['pattern_type']
        if pattern_output_raw:
            pattern_output = pattern_output_raw.lstrip(':')
            new_pattern = GROK_NEW_PATTERN.format(name=pattern_output, pattern=pattern)
            if pattern_type_raw:
                pattern_type = pattern_type_raw.lstrip(':')
                self.pattern_types[pattern_output] = __builtins__[pattern_type]
        else:
            new_pattern = pattern
        # Recurse to expand grok references nested within the pattern itself.
        return regex.sub(GROK_REPLACE_PATTERN, self.sub_pattern, new_pattern)
项目:stashpy    作者:afroisalreadyinu    | 项目源码 | 文件源码
def grok_re_preprocess(re_pattern):
    """Expand all grok %{...} references in *re_pattern*.

    Returns (expanded_pattern, pattern_types) where pattern_types maps
    output names to the Python types collected during traversal."""
    traverser = PatternTraverser()
    expanded = regex.sub(GROK_REPLACE_PATTERN, traverser.sub_pattern, re_pattern)
    return expanded, traverser.pattern_types
项目:stashpy    作者:afroisalreadyinu    | 项目源码 | 文件源码
def test_collect_types(self):
        """Expanding a grok pattern with a ':int' suffix records the type
        mapping {'processid': int} on the traverser."""
        traverser = pattern_matching.PatternTraverser()
        # NOTE(review): the local name `re` shadows the stdlib re module.
        re = regex.sub(pattern_matching.GROK_REPLACE_PATTERN,
                       traverser.sub_pattern,
                       'This is process %{POSINT:processid:int} running in %{PATH:process_dir}')
        self.assertDictEqual(traverser.pattern_types, {'processid': int})
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def remove_punctuation(self, text):
        ''' Get rid of punctuation except apostrophes '''
        # [^\P{P}\'] = "any punctuation character except the apostrophe"
        # (double negation of the Unicode P property). \P{..} is supported
        # only by the third-party `regex` module — NOTE(review): confirm
        # `re` is `import regex as re`; stdlib re raises "bad escape \P".
        return re.sub(r"[^\P{P}\']+", "", text)
项目:wikt2pron    作者:abuccts    | 项目源码 | 文件源码
def pinyin_transform(text):
    """Normalize a pinyin string: NFD-decompose, validate, split fused
    syllables apart with spaces, and return the NFC-recomposed result
    (or "" for input judged malformed).

    NOTE(review): this snippet is mojibake-damaged — several non-ASCII
    characters were lost and now appear as '?' in patterns and literals;
    re.search("?", ...) is even an invalid pattern ("nothing to repeat").
    Restore the original characters from the upstream wikt2pron source
    before using this function.
    """
    if re.search("?", text):
        return ""
    # Re-compose ü and ê after NFD so the tone checks below see them as
    # single codepoints.
    text = re.sub(
        unicodedata.normalize("NFD", "ü"),
        "ü",
        re.sub(
            unicodedata.normalize("NFD", "ê"),
            "ê",
            unicodedata.normalize("NFD", text)
        )
    )
    # Reject input with two tone marks inside one syllable nucleus.
    if re.search(
            "[aeiouêü]" + tones + "[aeiou]?[aeiouêü]" + tones + "",
            text.lower()):
        return ""
    text = text.lower()
    if not re.search(tones, text) and re.match("[1-5]", text):
        # NOTE(review): "\1 \2" is NOT raw — it inserts control characters
        # \x01/\x02 rather than backreferences; almost certainly meant
        # r"\1 \2". Left as-is pending upstream comparison.
        return re.sub("(\d)(\p{Ll})", "\1 \2", text)
    if re.search("[??,.?]", text):
        text = re.sub(
            "([??])$",
            lambda x: " y?" if x.group() == "?" else " bù",
            text
        )
        # Pad the (garbled) special characters and punctuation with spaces,
        # then collapse and trim the resulting runs.
        text = re.sub("([??])", r" \1 ", text)
        text = re.sub("([,.?])", r" \1 ", text)
        text = re.sub(" +", " ", text)
        text = re.sub("^ ", "", text)
        text = re.sub(" $", "", text)
        text = re.sub("\. \. \.", "...", text)
    text = re.sub("['\-]", " ", text)
    # Split a syllable coda from a following initial consonant.
    text = re.sub(
        "([aeiouêü]" + tones + "?n?g?r?)([bpmfdtnlgkhjqxzcsywr]h?)",
        r"\1 \2",
        text
    )
    text = re.sub(" ([grn])$", r"\1", text)
    text = re.sub(" ([grn]) ", r"\1 ", text)

    return unicodedata.normalize("NFC", text)
项目:wikt2pron    作者:abuccts    | 项目源码 | 文件源码
def sub_repeatedly(pattern, repl, term):
    """Apply re.sub(pattern, repl, ...) until the string reaches a fixed
    point, then return it."""
    previous, current = None, term
    while current != previous:
        previous, current = current, re.sub(pattern, repl, current)
    return current
项目:wikt2pron    作者:abuccts    | 项目源码 | 文件源码
def decompose(text):
    """NFD-decompose *text*, then re-compose the specific base+diacritic
    pairs listed in the module-level `recomposer` table."""
    def repl(match):
        k = match.group()
        if k in recomposer.keys():
            return recomposer[k]
        return k
    text = unicodedata.normalize("NFD", text)
    # Any character followed by a breve/diaeresis/caron combining mark is a
    # candidate pair for re-composition.
    text = re.sub(".[" + BREVE + DIA + CARON + "]", repl, text)
    return text

# Remove grave accents; don't affect acute or composed diaeresis in ?? or
# uncomposed diaeresis in -??- (as in plural ?????? of ??????).
# NOTE: Translit must already be decomposed! See comment at top.
项目:wikt2pron    作者:abuccts    | 项目源码 | 文件源码
def remove_grave_accents(word):
    """Replace grave-accented Cyrillic characters using the module-level
    `grave_deaccenter` mapping.

    NOTE(review): the character class below was mojibake-damaged to
    '[?????]' — as written it only matches a literal '?'. The original
    Cyrillic characters must be restored from the upstream source.
    """
    def repl(match):
        k = match.group()
        if k in grave_deaccenter.keys():
            return grave_deaccenter[k]
        return k
    ru_removed = re.sub("[?????]", repl, word)
    return ru_removed
项目:wikt2pron    作者:abuccts    | 项目源码 | 文件源码
def tr_sub(text, include_monosyllabic_jo_accent="", noadj="", noshto="", sub="", forceadj=""):
    """Apply optional comma-separated "pattern/replacement" regex
    substitutions to *text*, then delegate to tr() for transliteration."""
    if sub:
        for pair in sub.split(","):
            parts = pair.split("/")
            # Only the first two '/'-separated fields are used.
            text = re.sub(parts[0], parts[1], text)

    return tr(text, None, None, include_monosyllabic_jo_accent, noadj, noshto, forceadj)

#for adjectives, pronouns
项目:wikt2pron    作者:abuccts    | 项目源码 | 文件源码
def sub_repeatedly(pattern, repl, term):
    """apply sub() repeatedly until no change"""
    result = re.sub(pattern, repl, term)
    while result != term:
        term, result = result, re.sub(pattern, repl, result)
    return result

# If enabled, compare this module with new version of module in
# Module:User:Benwing2/ru-pron to make sure all pronunciations are the same.
# To check for differences, go to Template:tracking/ru-pron/different-pron
# and look at what links to the page.
# test_new_ru_pron_module = False

# If enabled, do new code for final -?; else, the old way
项目:wikt2pron    作者:abuccts    | 项目源码 | 文件源码
def phon_respelling(text, remove_grave):
    """Strip respelling-only diacritic marks (circumflex, double grave, dot
    above/below) from *text*; optionally strip grave accents as well.

    NOTE(review): the trailing '?' in the character class is mojibake for a
    lost character — restore it from the upstream wikt2pron source.
    """
    text = re.sub("[" + CFLEX + DUBGR + DOTABOVE + DOTBELOW + "?]", "", text)
    # Remove grave accents from annotations but maybe not from phonetic respelling
    if remove_grave:
        text = com.remove_grave_accents(text)
    return text


# Return the actual IPA corresponding to Cyrillic text. ADJ, GEN, BRACKET
# and POS are as in [[Template:ru-IPA]]. If IS_TRANFORMED is true, the text
# has already been passed through m_ru_translit.apply_tr_fixes(); otherwise,
# this will be done.
项目:wikt2pron    作者:abuccts    | 项目源码 | 文件源码
def IPA_to_CMUBET(text):
    """Convert IPA to CMUBET for US English.

    Use `IPA`_ and symbol set used in Wiktionary and
    `CMUBET`_ symbol set used in CMUDict.

    .. _IPA: https://en.wiktionary.org/wiki/Module:IPA/data/symbols
    .. _CMUBET: https://cmusphinx.github.io/wiki/cmubet/

    Parameters
    ----------
    text : string
        String of IPA text parsed from Wiktionary.

    Returns
    -------
    string
        Converted CMUBET text.

    .. note:: (review) the first re.sub pattern below was mojibake-damaged
       to "??" — an invalid regex ("nothing to repeat"); it presumably
       normalized an IPA length mark to ':'. Restore from upstream source.
    """
    text = re.sub("??", ":", text)
    # Strip phonemic/phonetic delimiters and stress/boundary marks.
    text = text.lstrip("/[").rstrip("]/")
    text = text.strip("'-!$")
    text += " "
    CMUBET_lst = []
    i = 0
    # Greedy scan: try a 2-character IPA symbol first, then fall back to one.
    while i < len(text) - 1:
        if text[i:i+2] in i2c_lookup.keys():
            CMUBET_lst.append(i2c_lookup[text[i:i+2]])
            i += 1
        elif text[i] in i2c_lookup.keys():
            CMUBET_lst.append(i2c_lookup[text[i]])
        i += 1
    return " ".join(CMUBET_lst)
项目:wikt2pron    作者:abuccts    | 项目源码 | 文件源码
def syllabify(text):
    """Insert '.' syllable separators into *text* using the module-level
    syllabify_pattern.

    The substitution runs twice because overlapping matches are not
    re-examined within a single re.sub pass.
    """
    def repl(match):
        # Four groups from syllabify_pattern (defined elsewhere) —
        # NOTE(review): confirm their exact roles against that pattern.
        a, b, c, d = \
            match.group(1), match.group(2), match.group(3), match.group(4)
        # Keep weak-h / aspirate+h clusters together as a single onset.
        if re.match(weak_h, b + c) or re.match(aspirate + "h", b + " " + c):
            b, c = "", b + c
        # A lone consonant goes after the syllable break, not before it.
        if c == "" and b != "":
            c, b = b, ""
        return a + b + "." + c + d

    for _ in range(2):
        text = re.sub(syllabify_pattern, repl, text)
    return text
项目:neuralmonkey    作者:ufal    | 项目源码 | 文件源码
def normalize_quotes(token):
    """Strip a trailing hyphen and map TeX-style ``/'' quoting to Unicode
    curly double quotes."""
    replacements = ((r"-$", ''), (r"``", '\u201c'), (r"''", '\u201d'))
    for pat, repl in replacements:
        token = re.sub(pat, repl, token)
    return token
项目:neighborhood_mood_aws    作者:jarrellmark    | 项目源码 | 文件源码
def remove_punctuation(self, text):
        ''' Get rid of punctuation except apostrophes '''
        # [^\P{P}\'] = "any punctuation character except the apostrophe"
        # (double negation of the Unicode P property). \P{..} is supported
        # only by the third-party `regex` module — NOTE(review): confirm
        # `re` is `import regex as re`; stdlib re raises "bad escape \P".
        return re.sub(r"[^\P{P}\']+", "", text)
项目:hatespeech    作者:lukovnikov    | 项目源码 | 文件源码
def remove_elongation(text):
    """Collapse any character repeated four or more times in a row down to
    two occurrences (e.g. 'cooool' -> 'cool')."""
    elongated = r'(.)\1{3,}'  # a char followed by 3+ copies of itself
    return regex.sub(elongated, r'\1\1', text, flags=regex.UNICODE)
项目:hatespeech    作者:lukovnikov    | 项目源码 | 文件源码
def clean(text):
    """Collapse every run of whitespace (including newlines) into a single
    space."""
    # todo : add more cleaning methods
    squeezed = regex.sub(r'[\s\n]+', ' ', text, flags=regex.UNICODE)
    return squeezed