Python regex module: UNICODE usage examples

The following 19 code examples, extracted from open-source Python projects, illustrate how to use regex.UNICODE.
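
Before the project examples, a minimal self-contained sketch of the flag itself (ours, not from the projects below). On Python 2 byte-string patterns, ``regex.UNICODE`` widens classes such as ``\w`` and ``\s`` to full Unicode; on Python 3 text patterns this behavior is already the default.

import regex

# With regex.UNICODE, \w matches letters beyond ASCII (on Python 2,
# a pattern without the flag would match only [A-Za-z0-9_]).
pattern = regex.compile(r'\w+', regex.UNICODE)
print(pattern.findall(u'caf\xe9 na\xefve'))  # [u'café', u'naïve']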

Project: filters | Author: eflglobal
def __init__(self, pattern):
        # type: (Union[Text, regex._pattern_type, re._pattern_type]) -> None
        """
        :param pattern:
            String pattern, or pre-compiled regex.

            IMPORTANT:  If you specify your own compiled regex, be sure to
            add the ``UNICODE`` flag for Unicode support!
        """
        super(Regex, self).__init__()

        self.regex = (
            pattern
                if isinstance(pattern, (regex._pattern_type, re._pattern_type))
                else regex.compile(pattern, regex.UNICODE)
        )
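
A hedged usage sketch of the constructor above (the surrounding class is the filters library's ``Regex`` filter, per the ``super()`` call; the variable names are ours):

import regex

# Either form is accepted by the __init__ above; with a pre-compiled
# pattern, adding the UNICODE flag is the caller's responsibility.
f1 = Regex(r'\w+')                                # string: compiled with regex.UNICODE for you
f2 = Regex(regex.compile(r'\w+', regex.UNICODE))  # pre-compiled: add the flag yourself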
Project: filters | Author: eflglobal
def __init__(self, pattern, keys=None):
        # type: (Union[Text, regex._pattern_type, re._pattern_type], Optional[Sequence[Text]]) -> None
        """
        :param pattern:
            Regex used to split incoming string values.

            IMPORTANT:  If you specify your own compiled regex, be sure
            to add the ``UNICODE`` flag for Unicode support!

        :param keys:
            If set, the resulting list will be converted into an
            OrderedDict, using the specified keys.

            IMPORTANT:  If ``keys`` is set, the split value's length
            must be less than or equal to ``len(keys)``.
        """
        super(Split, self).__init__()

        self.regex = (
            pattern
                if isinstance(pattern, (regex._pattern_type, re._pattern_type))
                else regex.compile(pattern, regex.UNICODE)
        )

        self.keys = keys
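
A self-contained sketch (ours, not the filters library itself) of the behavior the docstring describes: split on a pattern, then map the pieces onto ``keys`` as an OrderedDict:

import regex
from collections import OrderedDict

splitter = regex.compile(r'\s*,\s*', regex.UNICODE)
parts = splitter.split(u'a, b, c')
keys = ('first', 'second', 'third')
assert len(parts) <= len(keys)  # the constraint the docstring states
print(OrderedDict(zip(keys, parts)))
# OrderedDict([('first', u'a'), ('second', u'b'), ('third', u'c')])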
Project: DrQA | Author: facebookresearch
def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes).
            substitutions: if true, normalizes some token types (e.g. quotes).
        """
        self._regexp = regex.compile(
            '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
            '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
            '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
            '(?P<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' %
            (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
             self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
             self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
             self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
             self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        )
        if len(kwargs.get('annotators', {})) > 0:
            logger.warning('%s only tokenizes! Skipping annotators: %s' %
                           (type(self).__name__, kwargs.get('annotators')))
        self.annotators = set()
        self.substitutions = kwargs.get('substitutions', True)
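
The pattern above is one large alternation of named groups; here is a minimal sketch (ours, not DrQA's actual tokenize()) of how such a pattern classifies each token via ``Match.lastgroup``:

import regex

pat = regex.compile(
    r'(?P<digit>\d+)|(?P<alphanum>[\p{L}\p{N}]+)|(?P<nonws>[^\s])',
    flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE,
)
for m in pat.finditer(u'Ab1 caf\xe9 2!'):
    print(m.lastgroup, m.group())
# alphanum Ab1, alphanum café, digit 2, nonws !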
Project: DrQA_cn | Author: AmoseKang
def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes).
            substitutions: if true, normalizes some token types (e.g. quotes).
        """
        self._regexp = regex.compile(
            '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
            '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
            '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
            '(?P<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' %
            (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
             self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
             self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
             self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
             self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        )
        if len(kwargs.get('annotators', {})) > 0:
            logger.warning('%s only tokenizes! Skipping annotators: %s' %
                           (type(self).__name__, kwargs.get('annotators')))
        self.annotators = set()
        self.substitutions = kwargs.get('substitutions', True)
Project: filters | Author: eflglobal
def __init__(self, leading=r'[\p{C}\s]+', trailing=r'[\p{C}\s]+'):
        # type: (Text, Text) -> None
        """
        :param leading:
            Regex to match at the start of the string.

        :param trailing:
            Regex to match at the end of the string.
        """
        super(Strip, self).__init__()

        if leading:
            self.leading = regex.compile(
                r'^{pattern}'.format(pattern=leading),
                regex.UNICODE,
            )
        else:
            self.leading = None

        if trailing:
            self.trailing = regex.compile(
                r'{pattern}$'.format(pattern=trailing),
                regex.UNICODE,
            )
        else:
            self.trailing = None
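
A sketch (ours) of the stripping behavior this sets up: the leading pattern is anchored at ^, the trailing pattern at $, and both are removed with sub(). Note that \p{C} also catches invisible format characters such as zero-width spaces:

import regex

leading = regex.compile(r'^[\p{C}\s]+', regex.UNICODE)
trailing = regex.compile(r'[\p{C}\s]+$', regex.UNICODE)
value = u'\u200b  hello world \t'  # starts with a zero-width space (category Cf)
print(repr(trailing.sub(u'', leading.sub(u'', value))))  # u'hello world'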
Project: filters | Author: eflglobal
def __init__(self, encoding='utf-8', normalize=True):
        # type: (Text, bool) -> None
        """
        :param encoding:
            Used to decode non-unicode values.

        :param normalize:
            Whether to normalize the resulting value:
                - Convert to NFC form.
                - Remove non-printable characters.
                - Convert all line endings to unix-style ('\\n').
        """
        super(Unicode, self).__init__()

        self.encoding   = encoding
        self.normalize  = normalize

        if self.normalize:
            #
            # Compile the regex that we will use to remove non-
            # printables from the resulting unicode.
            # http://www.regular-expressions.info/unicode.html#category
            #
            # Note: using a double negative so that we can exclude
            # newlines, which are technically considered control chars.
            # http://stackoverflow.com/a/3469155
            #
            self.npr = regex.compile(r'[^\P{C}\s]+', regex.UNICODE)
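
A quick demonstration (ours) of that double-negative class: control characters are removed, while whitespace, including newlines, survives:

import regex

npr = regex.compile(r'[^\P{C}\s]+', regex.UNICODE)
print(repr(npr.sub(u'', u'line1\nline2\x00\x1f')))  # u'line1\nline2'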
Project: oa_qian | Author: sunqb (note: this is a Python 2 bundle of the regex test suite; the ur"..." string literals below are valid only in Python 2)
def test_special_escapes(self):
        self.assertEqual(regex.search(r"\b(b.)\b", "abcd abc bcd bx")[1], 'bx')
        self.assertEqual(regex.search(r"\B(b.)\B", "abc bcd bc abxd")[1], 'bx')
        self.assertEqual(regex.search(r"\b(b.)\b", "abcd abc bcd bx",
          regex.LOCALE)[1], 'bx')
        self.assertEqual(regex.search(r"\B(b.)\B", "abc bcd bc abxd",
          regex.LOCALE)[1], 'bx')
        self.assertEqual(regex.search(ur"\b(b.)\b", u"abcd abc bcd bx",
          regex.UNICODE)[1], u'bx')
        self.assertEqual(regex.search(ur"\B(b.)\B", u"abc bcd bc abxd",
          regex.UNICODE)[1], u'bx')

        self.assertEqual(regex.search(r"^abc$", "\nabc\n", regex.M)[0], 'abc')
        self.assertEqual(regex.search(r"^\Aabc\Z$", "abc", regex.M)[0], 'abc')
        self.assertEqual(regex.search(r"^\Aabc\Z$", "\nabc\n", regex.M), None)

        self.assertEqual(regex.search(ur"\b(b.)\b", u"abcd abc bcd bx")[1],
          u'bx')
        self.assertEqual(regex.search(ur"\B(b.)\B", u"abc bcd bc abxd")[1],
          u'bx')
        self.assertEqual(regex.search(ur"^abc$", u"\nabc\n", regex.M)[0],
          u'abc')
        self.assertEqual(regex.search(ur"^\Aabc\Z$", u"abc", regex.M)[0],
          u'abc')
        self.assertEqual(regex.search(ur"^\Aabc\Z$", u"\nabc\n", regex.M),
          None)

        self.assertEqual(regex.search(r"\d\D\w\W\s\S", "1aa! a")[0], '1aa! a')
        self.assertEqual(regex.search(r"\d\D\w\W\s\S", "1aa! a",
          regex.LOCALE)[0], '1aa! a')
        self.assertEqual(regex.search(ur"\d\D\w\W\s\S", u"1aa! a",
          regex.UNICODE)[0], u'1aa! a')
Project: oa_qian | Author: sunqb
def test_bigcharset(self):
        self.assertEqual(regex.match(ur"(?u)([\u2222\u2223])", u"\u2222")[1],
          u'\u2222')
        self.assertEqual(regex.match(ur"(?u)([\u2222\u2223])", u"\u2222",
          regex.UNICODE)[1], u'\u2222')
        self.assertEqual(u"".join(regex.findall(u".",
          u"e\xe8\xe9\xea\xeb\u0113\u011b\u0117", flags=regex.UNICODE)),
          u'e\xe8\xe9\xea\xeb\u0113\u011b\u0117')
        self.assertEqual(u"".join(regex.findall(ur"[e\xe8\xe9\xea\xeb\u0113\u011b\u0117]",
          u"e\xe8\xe9\xea\xeb\u0113\u011b\u0117", flags=regex.UNICODE)),
          u'e\xe8\xe9\xea\xeb\u0113\u011b\u0117')
        self.assertEqual(u"".join(regex.findall(ur"e|\xe8|\xe9|\xea|\xeb|\u0113|\u011b|\u0117",
          u"e\xe8\xe9\xea\xeb\u0113\u011b\u0117", flags=regex.UNICODE)),
          u'e\xe8\xe9\xea\xeb\u0113\u011b\u0117')
Project: oa_qian | Author: sunqb
def test_ascii_and_unicode_flag(self):
        # Unicode patterns.
        for flags in (0, regex.UNICODE):
            pat = regex.compile(u'\xc0', flags | regex.IGNORECASE)
            self.assertEqual(bool(pat.match(u'\xe0')), True)
            pat = regex.compile(u'\w', flags)
            self.assertEqual(bool(pat.match(u'\xe0')), True)

        pat = regex.compile(u'\xc0', regex.ASCII | regex.IGNORECASE)
        self.assertEqual(pat.match(u'\xe0'), None)
        pat = regex.compile(u'(?a)\xc0', regex.IGNORECASE)
        self.assertEqual(pat.match(u'\xe0'), None)
        pat = regex.compile(u'\w', regex.ASCII)
        self.assertEqual(pat.match(u'\xe0'), None)
        pat = regex.compile(u'(?a)\w')
        self.assertEqual(pat.match(u'\xe0'), None)

        # String patterns.
        for flags in (0, regex.ASCII):
            pat = regex.compile('\xc0', flags | regex.IGNORECASE)
            self.assertEqual(pat.match('\xe0'), None)
            pat = regex.compile('\w')
            self.assertEqual(pat.match('\xe0'), None)

        self.assertRaisesRegex(ValueError, self.MIXED_FLAGS, lambda:
          regex.compile('(?au)\w'))
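
For reference, the same ASCII/UNICODE distinction in Python 3 syntax (our sketch; the test above uses Python 2 literals):

import regex

# Python 3 text patterns match Unicode by default; regex.ASCII (or the
# inline (?a) flag) narrows \w back to [A-Za-z0-9_].
print(bool(regex.match(r'\w', '\xe9')))               # True
print(bool(regex.match(r'\w', '\xe9', regex.ASCII)))  # False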
Project: DrQA | Author: facebookresearch
def regex_match(text, pattern):
    """Test if a regex pattern is contained within a text."""
    try:
        pattern = re.compile(
            pattern,
            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE,
        )
    except BaseException:
        return False
    return pattern.search(text) is not None
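
Assuming the function above is in scope, a quick usage check (it uses search(), so the pattern may match anywhere in the text):

print(regex_match('The answer is 42.', r'\b42\b'))  # True
print(regex_match('no digits here', r'\d+'))        # False
print(regex_match('anything', '('))                 # False: pattern fails to compile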
Project: DrQA | Author: facebookresearch
def regex_match_score(prediction, pattern):
    """Check if the prediction matches the given regular expression."""
    try:
        compiled = re.compile(
            pattern,
            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE
        )
    except BaseException:
        logger.warning('Regular expression failed to compile: %s' % pattern)
        return False
    return compiled.match(prediction) is not None
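
Unlike regex_match above, this uses match(), so the prediction must match from its first character. A quick check (assuming the function and its logger are in scope):

print(regex_match_score('42 is the answer', r'\d+'))  # True
print(regex_match_score('answer: 42', r'\d+'))        # False: not anchored at the start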
Project: DrQA | Author: facebookresearch
def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes).
        """
        self._regexp = regex.compile(
            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        )
        if len(kwargs.get('annotators', {})) > 0:
            logger.warning('%s only tokenizes! Skipping annotators: %s' %
                           (type(self).__name__, kwargs.get('annotators')))
        self.annotators = set()
Project: DrQA_cn | Author: AmoseKang
def regex_match(text, pattern):
    """Test if a regex pattern is contained within a text."""
    try:
        pattern = re.compile(
            pattern,
            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE,
        )
    except BaseException:
        return False
    return pattern.search(text) is not None
Project: DrQA_cn | Author: AmoseKang
def regex_match_score(prediction, pattern):
    """Check if the prediction matches the given regular expression."""
    try:
        compiled = re.compile(
            pattern,
            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE
        )
    except BaseException:
        logger.warning('Regular expression failed to compile: %s' % pattern)
        return False
    return compiled.match(prediction) is not None
Project: DrQA_cn | Author: AmoseKang
def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes).
        """
        self._regexp = regex.compile(
            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        )
        if len(kwargs.get('annotators', {})) > 0:
            logger.warning('%s only tokenizes! Skipping annotators: %s' %
                           (type(self).__name__, kwargs.get('annotators')))
        self.annotators = set()
Project: hatespeech | Author: lukovnikov
def remove_elongation(text):

        return regex.sub(r'(.)\1{3,}', r'\1\1', text, flags=regex.UNICODE)
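
A quick check (assuming the function is reachable as a plain function or staticmethod): runs of four or more identical characters collapse to two, while shorter runs are untouched:

print(remove_elongation(u'soooooo coool!!!!!'))  # u'soo coool!!'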
Project: hatespeech | Author: lukovnikov
def clean(text):

        # collapse runs of whitespace (including newlines) into one space
        text = regex.sub(r'[\s\n]+', ' ', text, flags=regex.UNICODE)

        # TODO: add more cleaning methods

        return text
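
Usage, under the same assumption: any run of whitespace, including newlines, becomes a single space:

print(clean(u'a  b\n\nc'))  # u'a b c'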
Project: memex-dossier-open | Author: dossier
def make_xpath_ranges(html, phrase):
    '''Given a HTML string and a `phrase`, build a regex to find offsets
    for the phrase, and then build a list of `XPathRange` objects for
    it.  If this fails, return empty list.

    '''
    if not html:
        return []
    if not isinstance(phrase, unicode):
        try:
            phrase = phrase.decode('utf8')
        except Exception:
            logger.info('failed %r.decode("utf8")', phrase, exc_info=True)
            return []
    # NOTE: ``overlapped=`` below exists only in the third-party ``regex``
    # module, so ``re`` here must be ``import regex as re``; the stdlib
    # ``re`` does not accept that argument.
    phrase_re = re.compile(
        phrase, flags=re.UNICODE | re.IGNORECASE | re.MULTILINE)
    spans = []
    for match in phrase_re.finditer(html, overlapped=False):
        spans.append(match.span())  # a list of tuple(start, end) char indexes

    # now run fancy aligner magic to get xpath info and format them as
    # XPathRange per above
    try:
        xpath_ranges = list(char_offsets_to_xpaths(html, spans))
    except Exception:
        logger.info('failed to get xpaths', exc_info=True)
        return []
    ranges = []
    for xpath_range in filter(None, xpath_ranges):
        ranges.append(dict(
            start=dict(node=xpath_range.start_xpath,
                       idx=xpath_range.start_offset + 1),
            end=dict(node=xpath_range.end_xpath,
                     idx=xpath_range.end_offset)))

    return ranges
Project: grako | Author: apalala
def eval_escapes(s):
    """
    Given a string, evaluate escape sequences starting with backslashes as
    they would be evaluated in Python source code. For a list of these
    sequences, see: https://docs.python.org/3/reference/lexical_analysis.html

    This is not the same as decoding the whole string with the 'unicode-escape'
    codec, because that provides no way to handle non-ASCII characters that are
    literally present in the string.
    """
    # by Rob Speer

    escape_sequence_re = re.compile(
        r'''
        ( \\U........      # 8-digit Unicode escapes
        | \\u....          # 4-digit Unicode escapes
        | \\x..            # 2-digit hex escapes
        | \\[0-7]{1,3}     # Octal character escapes
        | \\N\{[^}]+\}     # Unicode characters by name
        | \\[\\'"abfnrtv]  # Single-character escapes
        )''',
        re.UNICODE | re.VERBOSE
    )

    def decode_match(match):
        return codecs.decode(match.group(0), 'unicode-escape')

    return escape_sequence_re.sub(decode_match, s)
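
A usage sketch (ours) of the distinction the docstring draws: escape sequences are decoded, while non-ASCII characters literally present in the string pass through untouched (a whole-string 'unicode-escape' decode would mangle them):

print(eval_escapes(u'literal caf\xe9 vs escaped caf\\xe9 and \\N{BULLET}'))
# literal café vs escaped café and •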