Python regex 模块 compile() 实例源码

我们从 Python 开源项目中提取了以下 50 个代码示例,用于说明如何使用 regex.compile()。

项目:dprr-django    作者:kingsdigitallab    | 项目源码 | 文件源码
def parse_primary_source(text):
    """Given a primary source text reference, return the abbreviation
    of the primary source.

    :param text: free-text reference, e.g. ``"Cic. (1)"``.
    :return: the text captured by the ``psource`` group, or ``None``
        when the reference cannot be parsed.
    """
    ref_regex = regex.compile(r"""
                (?P<psource>(\w+\.?\s?)+)
                (\(?\d\)\s?)+
                """, regex.VERBOSE)

    res = regex.search(ref_regex, text)

    # BUG FIX: the original called ``res.group(psource)`` with the local
    # variable ``psource`` (an empty string), which raises an error on
    # every successful match; the named group must be requested by name.
    if res and res.group('psource'):
        return res.group('psource')
    return None
项目:whaaaaat    作者:finklabs    | 项目源码 | 文件源码
def expect_regex(self, pattern):
        """Read from ``self.fd`` until the buffered output matches
        ``pattern`` or ``self.timeout`` seconds elapse.

        :param pattern: regex source string; matched against everything
            read so far with ``match`` (anchored at the start).
        :return: ``True`` once the pattern matches.
        :raises AssertionError: on EOF or timeout without a match.
        """
        # inspired by pexpect/pty_spawn and  pexpect/expect.py expect_loop
        end_time = time.time() + self.timeout
        buf = ''
        prog = regex.compile(pattern)
        while (end_time - time.time()) > 0.0:
            # switch to nonblocking read
            reads, _, _ = select.select([self.fd], [], [], end_time - time.time())
            if len(reads) > 0:
                try:
                    buf += self.read()
                except EOFError:
                    # Stream closed: the output collected so far must
                    # already match, otherwise fail with a diagnostic.
                    assert prog.match(buf) is not None, \
                        'output was:\n%s\nexpect regex pattern:\n%s' % (buf, pattern)
                if prog.match(buf):
                    return True
            else:
                # do not eat up CPU when waiting for the timeout to expire
                time.sleep(self.timeout/10)
        # Timed out: require a match against whatever was collected.
        assert prog.match(buf) is not None, \
            'output was:\n%s\nexpect regex pattern:\n%s' % (buf, pattern)
项目:filters    作者:eflglobal    | 项目源码 | 文件源码
def test_pass_precompiled_regex(self):
        """
        A precompiled regex object may be handed to the Filter in place
        of a string pattern.
        """
        # Compiling the pattern ourselves lets us attach ``IGNORECASE``.
        # Caveat: the ``UNICODE`` flag is the caller's responsibility
        # when supplying a precompiled regex.
        # noinspection SpellCheckingInspection
        compiled = re.compile(r'\btest\b', re.IGNORECASE | re.UNICODE)

        runner = self._filter('test march of the TEST penguins', pattern=compiled)
        self.assertFilterPasses(runner, ['test', 'TEST'])
项目:filters    作者:eflglobal    | 项目源码 | 文件源码
def test_pass_regex_library_support(self):
        """
        Precompiled patterns from the third-party ``regex`` library are
        accepted by the Regex Filter as well.
        """
        # Roughly, "Hi there!" in Burmese.
        word = '\u101f\u102d\u102f\u1004\u103a\u1038'

        # ``regex.compile`` adds the ``UNICODE`` flag on its own when
        # given a unicode pattern.
        compiled = regex.compile(r'\w+')

        runner = self._filter(word, pattern=compiled)
        self.assertFilterPasses(runner, [word])
项目:filters    作者:eflglobal    | 项目源码 | 文件源码
def test_pass_precompiled_regex(self):
        """
        A precompiled regex object may be handed to the Filter in place
        of a string pattern.
        """
        # Compiling the pattern ourselves lets us attach ``IGNORECASE``.
        # Caveat: the ``UNICODE`` flag is the caller's responsibility
        # when supplying a precompiled regex.
        # noinspection SpellCheckingInspection
        compiled = re.compile(r'\btest\b', re.IGNORECASE | re.UNICODE)

        runner = self._filter('test march of the TEST penguins', pattern=compiled)
        self.assertFilterPasses(runner, ['', ' march of the ', ' penguins'])
项目:filters    作者:eflglobal    | 项目源码 | 文件源码
def test_pass_regex_library_support(self):
        """
        Precompiled patterns from the third-party ``regex`` library are
        accepted by the Split Filter as well.
        """
        # Roughly, "Hi there!" in Burmese.
        word = '\u101f\u102d\u102f\u1004\u103a\u1038!'

        # ``regex.compile`` adds the ``UNICODE`` flag on its own when
        # given a unicode pattern.
        compiled = regex.compile(r'\w+')

        runner = self._filter(word, pattern=compiled)
        self.assertFilterPasses(runner, ['', '!'])
项目:filters    作者:eflglobal    | 项目源码 | 文件源码
def __init__(self, pattern, keys=None):
        # type: (Union[Text, regex._pattern_type, re._pattern_type], Optional[Sequence[Text]]) -> None
        """
        :param pattern:
            Regex used to split incoming string values.

            IMPORTANT:  If you specify your own compiled regex, be sure
            to add the ``UNICODE`` flag for Unicode support!

        :param keys:
            If set, the resulting list will be converted into an
            OrderedDict, using the specified keys.

            IMPORTANT:  If ``keys`` is set, the split value's length
            must be less than or equal to ``len(keys)``.
        """
        super(Split, self).__init__()

        # Accept either an already-compiled pattern (from ``re`` or the
        # ``regex`` library) or a pattern string, compiled here with
        # Unicode support.
        if isinstance(pattern, (regex._pattern_type, re._pattern_type)):
            self.regex = pattern
        else:
            self.regex = regex.compile(pattern, regex.UNICODE)

        self.keys = keys
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def __exit__(self, exc_type, exc_value, tb):
        # No exception was raised inside the ``with`` block: the test
        # expected one, so fail with the expected exception's name.
        if exc_type is None:
            try:
                exc_name = self.expected.__name__
            except AttributeError:
                # ``expected`` may be a tuple of exception classes.
                exc_name = str(self.expected)
            raise self.failureException(
                "%s not raised" % exc_name)
        if not issubclass(exc_type, self.expected):
            # let unexpected exceptions pass through
            return False
        self.exception = exc_value # store for later retrieval
        if self.expected_regexp is None:
            # Type matched and no message pattern requested: swallow it.
            return True

        # Otherwise the exception's string form must match the pattern.
        expected_regexp = self.expected_regexp
        if isinstance(expected_regexp, basestring):
            expected_regexp = re.compile(expected_regexp)
        if not expected_regexp.search(str(exc_value)):
            raise self.failureException('"%s" does not match "%s"' %
                     (expected_regexp.pattern, str(exc_value)))
        return True
项目:turboparser-semafor    作者:ReutersMedia    | 项目源码 | 文件源码
def __init__(self):
        # List of contractions adapted from Robert MacIntyre's tokenizer.
        # These were in turn collected from the TreebankWordTokenizer in NLTK.
        self.CONTRACTIONS = [
            regex.compile(r"([^' ])('[sS]|'[mM]|'[dD]|')\b"),
            regex.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T)\b"),
        ]
        # Two-part contractions, e.g. "cannot" -> "can" + "not".
        self.CONTRACTIONS2 = [
            regex.compile(src) for src in (
                r"(?i)\b(can)(not)\b",
                r"(?i)\b(d)('ye)\b",
                r"(?i)\b(gim)(me)\b",
                r"(?i)\b(gon)(na)\b",
                r"(?i)\b(got)(ta)\b",
                r"(?i)\b(lem)(me)\b",
                r"(?i)\b(mor)('n)\b",
                r"(?i)\b(wan)(na) ",
            )
        ]
        # "'tis"/"'twas" -> "'t" + "is"/"was".
        self.CONTRACTIONS3 = [
            regex.compile(src) for src in (r"(?i) ('t)(is)\b", r"(?i) ('t)(was)\b")
        ]
        # Three-part contractions, e.g. "whaddya", "whatcha".
        self.CONTRACTIONS4 = [
            regex.compile(src) for src in (
                r"(?i)\b(whad)(dd)(ya)\b",
                r"(?i)\b(wha)(t)(cha)\b",
            )
        ]
项目:DrQA    作者:facebookresearch    | 项目源码 | 文件源码
def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes).
            substitutions: if true, normalizes some token types (e.g. quotes).
        """
        # One big alternation with a named group per token type; the
        # first alternative that matches decides the token's type.  The
        # class-level constants (DIGIT, TITLE, ...) supply the actual
        # sub-patterns.  NOTE: ``(?<ellipses>...)`` is the ``regex``
        # module's alternate named-group spelling (same as ``?P<>``).
        self._regexp = regex.compile(
            '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
            '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
            '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
            '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' %
            (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
             self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
             self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
             self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
             self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        )
        # This tokenizer ignores any requested annotators.
        if len(kwargs.get('annotators', {})) > 0:
            logger.warning('%s only tokenizes! Skipping annotators: %s' %
                           (type(self).__name__, kwargs.get('annotators')))
        self.annotators = set()
        self.substitutions = kwargs.get('substitutions', True)
项目:backrefs    作者:facelessuser    | 项目源码 | 文件源码
def test_format_replace_unicode_name(self):
        """Unicode names (``\\N{...}``) are resolved in format replaces."""

        pat = regex.compile(r"(some)(.*?)(pattern)(!)")
        # Mixes case-folding escapes, a \L...\E span, and a doubly-escaped
        # \\N{{...}} that must survive literally.
        expand_format = bregex.compile_replace(
            pat,
            r'{1} \N{Black club suit}\l\N{Greek Capital Letter omega} and '
            r'\LSPAN \N{Greek Capital Letter omega}\E and Escaped \\N{{Greek Capital Letter omega}}\E {3}',
            bregex.FORMAT
        )
        output = expand_format(pat.match('some test pattern!'))

        self.assertEqual(
            'some \u2663\u03c9 and span \u03c9 and Escaped \\N{Greek Capital Letter omega} pattern',
            output
        )
项目:backrefs    作者:facelessuser    | 项目源码 | 文件源码
def test_expand_wrong_values(self):
        """Test `expand` with wrong values."""

        pat = regex.compile('test')
        fmt_replace = bregex.compile_replace(pat, 'whatever', bregex.FORMAT)
        match = pat.match('test')

        # A format-mode replace cannot be fed to ``expand``.
        with pytest.raises(ValueError) as excinfo:
            bregex.expand(match, fmt_replace)
        assert "Replace should not be compiled as a format replace!" in str(excinfo.value)

        # Nor can a non-string, non-replace object.
        with pytest.raises(TypeError) as excinfo:
            bregex.expand(match, 0)
        assert "Expected string, buffer, or compiled replace!" in str(excinfo.value)
项目:backrefs    作者:facelessuser    | 项目源码 | 文件源码
def test_expandf_wrong_values(self):
        """Test `expand` with wrong values."""

        pat = regex.compile('test')
        plain_replace = bregex.compile_replace(pat, 'whatever')
        match = pat.match('test')

        # ``expandf`` requires a format-mode replace.
        with pytest.raises(ValueError) as excinfo:
            bregex.expandf(match, plain_replace)
        assert "Replace not compiled as a format replace" in str(excinfo.value)

        # And it rejects non-string, non-replace objects.
        with pytest.raises(TypeError) as excinfo:
            bregex.expandf(match, 0)
        assert "Expected string, buffer, or compiled replace!" in str(excinfo.value)
项目:backrefs    作者:facelessuser    | 项目源码 | 文件源码
def test_octal_fail(self):
        """Test that octal fails properly."""

        pat = regex.compile(b'Test')

        # ``\666`` is out of range no matter where it appears in the
        # replace template (bare, inside \C...\E, or after \c).
        for template in (br'\666', br'\C\666\E', br'\c\666'):
            with pytest.raises(ValueError) as excinfo:
                bregex.compile_replace(pat, template)
            assert "octal escape value outside of range 0-0o377!" in str(excinfo.value)
项目:llvm-git-migration    作者:jyknight    | 项目源码 | 文件源码
def __init__(self, manager, file_changes, prefix_sensitive=True):
    """Build anchored path transforms from (path_regex, action) pairs."""
    self.manager = manager

    # Matching is prefix-sensitive as soon as any path regex does not
    # begin with a '.*' wildcard; transform sensitivity is caller-chosen.
    self._matchers_prefix_sensitive = any(
        not path_re.startswith('.*') for (path_re, action) in file_changes)
    self._transforms_prefix_sensitive = prefix_sensitive

    # Anchor each path regex at the end of the path.
    self._transforms = [(regex.compile(path_re + '$'), action)
                        for (path_re, action) in file_changes]

    # Counters for cache/tree/transform statistics.
    self._stat_tree_cache_hits = 0
    self._stat_wrote_trees = 0
    self._stat_got_trees = 0
    self._stat_transforms = 0
项目:tailchaser    作者:thanos    | 项目源码 | 文件源码
def __init__(self,
                 only_backfill=ONLY_BACKFILL,
                 dont_backfill=DONT_BACKFILL,
                 read_period=READ_PERIOD,
                 clear_checkpoint=CLEAR_CHECKPOINT,
                 read_pause=READ_PAUSE,
                 temp_dir=TMP_DIR,
                 start_of_record_re=None,
                 filter_re=None,
                 windows=None):
        """Collect tailer configuration onto ``self.config``.

        :param start_of_record_re: optional regex marking the first line
            of a record; when given, reading switches to the regex-based
            record reader.
        :param filter_re: optional regex compiled for record filtering.
        :param windows: force Windows behaviour; auto-detected if None.
        """
        # NOTE(review): ``collections.namedtuple(...)`` returns a *class*,
        # and the attributes below are set on that class rather than on an
        # instance -- presumably used as a plain namespace; confirm before
        # changing.
        self.config = collections.namedtuple('Args', self.ARGS)
        self.config.dont_backfill = dont_backfill
        self.config.only_backfill = only_backfill
        self.config.clear_checkpoint = clear_checkpoint
        self.config.read_period = read_period
        self.config.read_pause = read_pause
        self.config.temp_dir = temp_dir if temp_dir else tempfile.mkdtemp()
        self.config.windows = windows if windows is not None else self.is_windows()
        self.config.filter_re = regex.compile(filter_re) if filter_re else None
        if start_of_record_re:
            # Multi-line records: switch to the regex-aware reader.
            self.config.start_of_record_re = regex.compile(start_of_record_re)
            self.read_record = self.read_record_with_regex
        self.state = self.STARTING
        self.stats = collections.Counter()
项目:stashpy    作者:afroisalreadyinu    | 项目源码 | 文件源码
def test_nginx_log(self):
        """End-to-end grok check: a syslog-wrapped nginx line is expanded
        to a plain regex and parsed into the expected field dict."""
        # Grok template mixing %{NAME:field} captures with an inline
        # named group for the HTTP method.
        regexp = '<%{INT}>%{SYSLOGTIMESTAMP:syslog_timestamp} %{SYSLOGHOST:host} %{IPORHOST:remote_addr} - %{USERNAME:remote_user}?- \[%{HTTPDATE:time_local}\] \"(?P<method>(GET|PUT|PATCH|POST|DELETE|HEAD|OPTIONS)) %{URIPATH:path}%{URIPARAM:params} HTTP/%{NUMBER:httpversion}\" %{INT:status} %{INT:body_bytes_sent}\"-\" %{QS:http_user_agent}'
        other_log = '<13>Mar 25 12:26:57 myserver.io 62.73.84.230 - - [25/Mar/2016:12:26:57 +0000] "GET /orders?order_identifier=AB075081&consumer_name=&consumer_first_name= HTTP/1.1" 200 1499"-" "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"'
        # Expand the grok macros into a plain regex, then match the line.
        grokked_re, types = pattern_matching.grok_re_preprocess(regexp)
        compiled = regex.compile(grokked_re)
        self.assertDictEqual(
            compiled.match(other_log).groupdict(),
            {"syslog_timestamp": "Mar 25 12:26:57",
             "host": 'myserver.io',
             "remote_addr": '62.73.84.230',
             "remote_user": None,
             "time_local": "25/Mar/2016:12:26:57 +0000",
             "method": "GET",
             "path": "/orders",
             "params": "?order_identifier=AB075081&consumer_name=&consumer_first_name=",
             "httpversion": "1.1",
             "status": "200",
             "body_bytes_sent": "1499",
             "http_user_agent": '"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"'
            })
项目:BarcSeek    作者:NCBI-Hackathons    | 项目源码 | 文件源码
def barcode_to_regex(barcode: str, error_rate: Optional[int]=None):
    """Convert a barcode string to a regex pattern
    barcode [str]           The barcode string to turn into a regex
    error_rate [int]=None   The error rate"""
    # Fuzzy-match suffix appended to every group when an error rate is set.
    fuzzy = '{e<=' + str(error_rate) + '}' if error_rate else ''  # type: str
    # Runs of Ns mark UMI positions; remember each run's length.
    umi_lengths = tuple(len(run) for run in
                        regex.findall(r'(N+)', barcode, regex.IGNORECASE))
    parts = []  # type: List[str]
    for index, chunk in enumerate(filter(None, barcode.upper().split('N'))):
        parts.append('(' + chunk + ')' + fuzzy)
        if index >= len(umi_lengths):
            # No UMI run follows the final barcode chunk.
            break
        # UMI bases are matched as any nucleotide run of the right length.
        parts.append('(' + '[ACGT]' * umi_lengths[index] + ')' + fuzzy)
    return regex.compile(r'%s' % ''.join(parts), regex.ENHANCEMATCH)
项目:wikt2pron    作者:abuccts    | 项目源码 | 文件源码
def __init__(self, lang=None, XSAMPA=False):
        """Wiktionary pronunciation extractor.

        :param lang: optional language code used to filter entries.
        :param XSAMPA: if True, convert IPA output to X-SAMPA.
        """
        self.lang = lang
        self.XSAMPA = XSAMPA
        # MediaWiki API endpoint used to expand pronunciation templates.
        self.api = "https://en.wiktionary.org/w/api.php"
        self.param = {
            "action": "expandtemplates",
            "text": None,
            "prop": "wikitext",
            "format": "json"
        }
        # NOTE(review): these patterns use ``regex``-module-only features
        # (recursive ``(?&brackets)``, ``\p{L}``), so ``re`` here must be
        # the third-party ``regex`` module imported as ``re`` -- confirm.
        self.regex = {
            "lang": re.compile("\|lang=([^\|]+)"),
            # Balanced ``{{...}}`` template, matched recursively.
            "node": re.compile("(?<brackets>{{(?:[^{}]+|(?&brackets))*}})"),
            "IPA-node": re.compile("^(([\w]+\-)?(IPA|pron))(?=\||\n|\Z)"),
            # Wikitext section headers at levels 2-4.
            "h2": re.compile("(?:\A|\n)={2}([\p{L}0-9 -]+)={2}\n"),
            "h3": re.compile("\n={3}([\p{L}0-9 -]+)={3}\n"),
            "h4": re.compile("\n={4}([\p{L}0-9 -]+)={4}\n"),
            "IPA": re.compile("<span[^>]*>([^<]+)<\/span>")
        }
项目:DrQA_cn    作者:AmoseKang    | 项目源码 | 文件源码
def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes).
            substitutions: if true, normalizes some token types (e.g. quotes).
        """
        # One big alternation with a named group per token type; the
        # first alternative that matches decides the token's type.  The
        # class-level constants (DIGIT, TITLE, ...) supply the actual
        # sub-patterns.  NOTE: ``(?<ellipses>...)`` is the ``regex``
        # module's alternate named-group spelling (same as ``?P<>``).
        self._regexp = regex.compile(
            '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
            '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
            '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
            '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' %
            (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
             self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
             self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
             self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
             self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        )
        # This tokenizer ignores any requested annotators.
        if len(kwargs.get('annotators', {})) > 0:
            logger.warning('%s only tokenizes! Skipping annotators: %s' %
                           (type(self).__name__, kwargs.get('annotators')))
        self.annotators = set()
        self.substitutions = kwargs.get('substitutions', True)
项目:epitran    作者:dmort27    | 项目源码 | 文件源码
def __init__(self, arpabet='arpabet', ligatures=False, cedict_file=None):
        """Construct a Flite "wrapper"

        Args:
            arpabet (str): file containing ARPAbet to IPA mapping
            ligatures (bool): if True, use non-standard ligatures instead of
                              standard IPA
            cedict_filename (str): path to CC-CEDict dictionary (included for
                                   compatibility)
        """
        # Resolve the ARPAbet CSV shipped inside the package's data dir.
        arpabet = pkg_resources.resource_filename(__name__, os.path.join('data', arpabet + '.csv'))
        self.arpa_map = self._read_arpabet(arpabet)
        # Split text into alternating letter / non-letter chunks.
        # NOTE(review): ``\p{L}`` requires the ``regex`` module; ``re``
        # here is presumably ``regex`` imported as ``re`` -- confirm.
        self.chunk_re = re.compile(r'(\p{L}+|[^\p{L}]+)', re.U)
        self.puncnorm = PuncNorm()
        self.ligatures = ligatures
        self.ft = panphon.FeatureTable()
项目:epitran    作者:dmort27    | 项目源码 | 文件源码
def _read_cedict(self, cedict_file, traditional=False):
        comment_re = re.compile('\s*#')
        lemma_re = re.compile('(?P<hanzi>[^]]+) \[(?P<pinyin>[^]]+)\] /(?P<english>.+)/')
        cedict = {}
        with codecs.open(cedict_file, 'r', 'utf-8') as f:
            for line in f:
                if comment_re.match(line):
                    pass
                elif lemma_re.match(line):
                    match = lemma_re.match(line)
                    hanzi = match.group('hanzi').split(' ')
                    pinyin = match.group('pinyin').split(' ')
                    english = match.group('english').split('/')
                    if traditional:
                        cedict[hanzi[0]] = (pinyin, english)  # traditional characters only
                    else:
                        cedict[hanzi[1]] = (pinyin, english)  # simplified characters only.
        return cedict
项目:panphon    作者:dmort27    | 项目源码 | 文件源码
def compile_regex_from_str(self, pat):
        """Compile a regex matching segment sequences described by a
        string of bracketed feature masks.

        Args:
            pat (str): feature masks, each enclosed in square brackets,
            with the features delimited by any standard delimiter.

        Returns:
           Pattern: regular expression pattern equivalent to `pat`
        """
        sign_to_num = {'-': -1, '0': 0, '+': 1}
        alternations = []
        for bracketed in re.findall(r'\[[^]]+\]+', pat):
            # Parse "+feat"/"-feat" pairs into a numeric feature mask.
            mask = {name: sign_to_num[sign]
                    for (sign, name) in re.findall(r'([+-])(\w+)', bracketed)}
            segments = self.all_segs_matching_fts(mask)
            alternations.append('({})'.format('|'.join(segments)))
        return re.compile(''.join(alternations))
项目:panphon    作者:dmort27    | 项目源码 | 文件源码
def compile_regex_from_str(self, ft_str):
        """Compile a regex matching segment sequences described by a
        string of bracketed feature masks.

        Args:
            ft_str (str): feature masks, each enclosed in square
            brackets, with the features delimited by any standard
            delimiter.

        Returns:
           Pattern: regular expression pattern equivalent to `ft_str`
        """
        alternations = []
        for bracketed in re.finditer(r'\[([^]]+)\]', ft_str):
            mask = fts(bracketed.group(1))
            segments = self.all_segs_matching_fts(mask)
            alternations.append('({})'.format('|'.join(segments)))
        # (Avoids shadowing the ``regex`` module name with a local.)
        return re.compile(''.join(alternations))
项目:docker-image-py    作者:realityone    | 项目源码 | 文件源码
def match(regexp):
    """Compile *regexp* with the ``regex`` engine and return the pattern."""
    return regex.compile(regexp)
项目:siamese_sentiment    作者:jcavalieri8619    | 项目源码 | 文件源码
def strip_html_tags(string, verbose=False):
    """Replace every ``<...>`` HTML tag in *string* with a single space.

    ``verbose`` is accepted for interface compatibility but unused.
    """
    return regex.sub(r'<.*?>', ' ', string)
项目:perceptronix    作者:kylebgorman    | 项目源码 | 文件源码
def __init__(self, candidate_regex, max_context, *args, **kwargs):
    """Sentence tokenizer model.

    Args:
      candidate_regex: regex source locating candidate boundaries.
      max_context: maximum context window size.
      Remaining arguments go to SparseBinomialClassifier.
    """
    self._classifier = perceptronix.SparseBinomialClassifier(*args, **kwargs)
    self._candidate_regex = regex.compile(candidate_regex)
    self._max_context = max_context
项目:perceptronix    作者:kylebgorman    | 项目源码 | 文件源码
def read(cls, filename, candidate_regex, max_context):
    """Reads sentence tokenizer model from serialized model file."""
    # Bypass __init__ and rebuild the instance around the deserialized
    # classifier.
    model = cls.__new__(cls)
    model._candidate_regex = regex.compile(candidate_regex)
    model._max_context = max_context
    model._classifier = perceptronix.SparseBinomialClassifier.read(filename)
    return model
项目:filters    作者:eflglobal    | 项目源码 | 文件源码
def __init__(self):
        """Precompile the byte-level patterns used while decoding."""
        super(Base64Decode, self).__init__()

        # Whitespace stripped from the encoded payload before decoding.
        self.whitespace_re = regex.compile(b'[ \t\r\n]+', regex.ASCII)
        # Accepts both the standard and the URL-safe base64 alphabets.
        self.base64_re = regex.compile(b'^[-+_/A-Za-z0-9=]+$', regex.ASCII)
项目:filters    作者:eflglobal    | 项目源码 | 文件源码
def __init__(self, leading=r'[\p{C}\s]+', trailing=r'[\p{C}\s]+'):
        # type: (Text, Text) -> None
        """
        :param leading:
            Regex to match at the start of the string.

        :param trailing:
            Regex to match at the end of the string.
        """
        super(Strip, self).__init__()

        # Anchor each pattern to its end of the string; a falsy pattern
        # disables stripping on that side.
        self.leading = (
            regex.compile(r'^{pattern}'.format(pattern=leading), regex.UNICODE)
            if leading
            else None
        )

        self.trailing = (
            regex.compile(r'{pattern}$'.format(pattern=trailing), regex.UNICODE)
            if trailing
            else None
        )
项目:filters    作者:eflglobal    | 项目源码 | 文件源码
def __init__(self, encoding='utf-8', normalize=True):
        # type: (Text, bool) -> None
        """
        :param encoding:
            Used to decode non-unicode values.

        :param normalize:
            Whether to normalize the resulting value:
                - Convert to NFC form.
                - Remove non-printable characters.
                - Convert all line endings to unix-style ('\n').
        """
        super(Unicode, self).__init__()

        self.encoding   = encoding
        self.normalize  = normalize

        if self.normalize:
            #
            # Compile the regex that we will use to remove non-
            # printables from the resulting unicode.
            # http://www.regular-expressions.info/unicode.html#category
            #
            # Note: using a double negative so that we can exclude
            # newlines, which are technically considered control chars.
            # http://stackoverflow.com/a/3469155
            #
            # NOTE(review): when ``normalize`` is falsy, ``self.npr`` is
            # never set -- later code must only use it when normalizing.
            self.npr = regex.compile(r'[^\P{C}\s]+', regex.UNICODE)
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_weakref(self):
        # A weak proxy to a compiled pattern must behave like the pattern.
        compiled = regex.compile('ab+c')
        via_proxy = proxy(compiled)
        if compiled.findall('QabbbcR') != via_proxy.findall('QabbbcR'):
            self.fail()
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_bug_1661(self):
        # Verify that flags do not get silently ignored with compiled patterns
        pattern = regex.compile('.')
        for op in (regex.match, regex.search, regex.findall):
            self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
              lambda op=op: op(pattern, 'A', regex.I))
        self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
          lambda: regex.compile(pattern, regex.I))
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_bug_3629(self):
        # A regex that triggered a bug in the sre-code validator
        compiled = regex.compile("(?P<quote>)(?(quote))")
        self.assertEqual(repr(type(compiled)), self.PATTERN_CLASS)
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_re_match(self):
        """Match subscription, group(), and named-group access."""
        # Slicing a match yields (whole match, group1, group2, ...).
        self.assertEqual(regex.match('a', 'a')[:], ('a',))
        self.assertEqual(regex.match('(a)', 'a')[:], ('a', 'a'))
        self.assertEqual(regex.match(r'(a)', 'a')[0], 'a')
        self.assertEqual(regex.match(r'(a)', 'a')[1], 'a')
        self.assertEqual(regex.match(r'(a)', 'a').group(1, 1), ('a', 'a'))

        # Unmatched alternatives/optionals yield None.
        pat = regex.compile('((a)|(b))(c)?')
        self.assertEqual(pat.match('a')[:], ('a', 'a', 'a', None, None))
        self.assertEqual(pat.match('b')[:], ('b', 'b', None, 'b', None))
        self.assertEqual(pat.match('ac')[:], ('ac', 'a', 'a', None, 'c'))
        self.assertEqual(pat.match('bc')[:], ('bc', 'b', None, 'b', 'c'))
        self.assertEqual(pat.match('bc')[:], ('bc', 'b', None, 'b', 'c'))

        # A single group.
        m = regex.match('(a)', 'a')
        self.assertEqual(m.group(), 'a')
        self.assertEqual(m.group(0), 'a')
        self.assertEqual(m.group(1), 'a')
        self.assertEqual(m.group(1, 1), ('a', 'a'))

        # Named groups are addressable by index or by name, mixed freely.
        pat = regex.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
        self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
        self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'), (None, 'b',
          None))
        self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_getattr(self):
        """Attributes exposed by compiled patterns and match objects."""
        self.assertEqual(regex.compile("(?i)(a)(b)").pattern, '(?i)(a)(b)')
        # A str pattern carries the ASCII flag; a unicode one UNICODE.
        self.assertEqual(regex.compile("(?i)(a)(b)").flags, regex.A | regex.I |
          regex.DEFAULT_VERSION)
        self.assertEqual(regex.compile(u"(?i)(a)(b)").flags, regex.I | regex.U
          | regex.DEFAULT_VERSION)
        self.assertEqual(regex.compile("(?i)(a)(b)").groups, 2)
        self.assertEqual(regex.compile("(?i)(a)(b)").groupindex, {})

        self.assertEqual(regex.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
          {'first': 1, 'other': 2})

        self.assertEqual(regex.match("(a)", "a").pos, 0)
        self.assertEqual(regex.match("(a)", "a").endpos, 1)

        self.assertEqual(regex.search("b(c)", "abcdef").pos, 0)
        self.assertEqual(regex.search("b(c)", "abcdef").endpos, 6)
        self.assertEqual(regex.search("b(c)", "abcdef").span(), (1, 3))
        self.assertEqual(regex.search("b(c)", "abcdef").span(1), (2, 3))

        self.assertEqual(regex.match("(a)", "a").string, 'a')
        self.assertEqual(regex.match("(a)", "a").regs, ((0, 1), (0, 1)))
        self.assertEqual(repr(type(regex.match("(a)", "a").re)),
          self.PATTERN_CLASS)

        # Issue 14260: groupindex must be a copy, immune to mutation.
        p = regex.compile(r'abc(?P<n>def)')
        p.groupindex["n"] = 0
        self.assertEqual(p.groupindex["n"], 1)
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_re_escape(self):
        # Escaping the empty string is a no-op.
        accumulated = ""
        self.assertEqual(regex.escape(accumulated), accumulated)
        # Every byte value, escaped, must match exactly itself.
        for code in range(0, 256):
            ch = chr(code)
            accumulated += ch
            escaped = regex.escape(ch)
            self.assertEqual(bool(regex.match(escaped, ch)), True)
            self.assertEqual(regex.match(escaped, ch).span(), (0, 1))

        # And the escaped concatenation matches the whole string.
        compiled = regex.compile(regex.escape(accumulated))
        self.assertEqual(compiled.match(accumulated).span(), (0, 256))
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_flags(self):
        # Each standard flag must still produce a compiled pattern object.
        for flag in (regex.I, regex.M, regex.X, regex.S, regex.L):
            compiled = regex.compile('^pattern$', flag)
            self.assertEqual(repr(type(compiled)), self.PATTERN_CLASS)
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_bug_545855(self):
        # Bug 545855 -- This pattern failed to cause a compile error as it
        # should, instead provoking a TypeError.
        compile_bad_set = lambda: regex.compile('foo[a-')
        self.assertRaisesRegex(regex.error, self.BAD_SET, compile_bad_set)
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_bug_612074(self):
        # A class containing an escaped non-ASCII char must compile.
        pat = u"[" + regex.escape(u"\u2039") + u"]"
        compiled = regex.compile(pat)
        self.assertEqual(compiled and 1, 1)
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_bug_926075(self):
        # str and unicode patterns must compile to distinct objects.
        str_pat = regex.compile('bug_926075')
        uni_pat = regex.compile(u'bug_926075')
        if str_pat is uni_pat:
            self.fail()
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_bug_931848(self):
        # Splitting on a class of Unicode full-stop characters.
        pattern = u"[\u002E\u3002\uFF0E\uFF61]"
        compiled = regex.compile(pattern)
        self.assertEqual(compiled.split("a.b.c"), ['a', 'b', 'c'])
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_bug_581080(self):
        # finditer must yield once and then raise StopIteration (py2 API).
        matches = regex.finditer(r"\s", "a b")
        self.assertEqual(matches.next().span(), (1, 2))
        self.assertRaises(StopIteration, lambda: matches.next())

        # The (undocumented) scanner interface behaves the same way.
        sc = regex.compile(r"\s").scanner("a b")
        self.assertEqual(sc.search().span(), (1, 2))
        self.assertEqual(sc.search(), None)
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_empty_array(self):
        # SF buf 1647541.
        import array
        for typecode in 'cbBuhHiIlLfd':
            empty = array.array(typecode)
            # A non-empty pattern cannot match an empty buffer...
            self.assertEqual(regex.compile("bla").match(empty), None)
            # ...but the empty pattern matches, with no captured groups.
            self.assertEqual(regex.compile("").match(empty)[1 : ], ())
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_inline_flags(self):
        # Bug #1700.
        upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Below
        lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Below

        # Case-insensitive matching must work whether the flags come from
        # the compile call, an inline (?i), or an inline (?iu).
        cases = [
            (upper_char, regex.I | regex.U, lower_char),
            (lower_char, regex.I | regex.U, upper_char),
            ('(?i)' + upper_char, regex.U, lower_char),
            ('(?i)' + lower_char, regex.U, upper_char),
            ('(?iu)' + upper_char, 0, lower_char),
            ('(?iu)' + lower_char, 0, upper_char),
        ]
        for source, flags, target in cases:
            p = regex.compile(source, flags)
            self.assertEqual(bool(p.match(target)), True)

        # Positional rules: V0 tolerates a trailing inline flag, V1 rejects it.
        self.assertEqual(bool(regex.match(r"(?i)a", "A")), True)
        self.assertEqual(bool(regex.match(r"a(?i)", "A")), True)
        self.assertEqual(bool(regex.match(r"(?iV1)a", "A")), True)
        self.assertEqual(regex.match(r"a(?iV1)", "A"), None)
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_ascii_and_unicode_flag(self):
        # Unicode patterns: case-folding and \w are Unicode-aware both by
        # default and with an explicit UNICODE flag.
        for flags in (0, regex.UNICODE):
            pat = regex.compile(u'\xc0', flags | regex.IGNORECASE)
            self.assertEqual(bool(pat.match(u'\xe0')), True)
            pat = regex.compile(u'\w', flags)
            self.assertEqual(bool(pat.match(u'\xe0')), True)

        # The ASCII flag (or inline (?a)) defeats Unicode matching.
        for source, flags in ((u'\xc0', regex.ASCII | regex.IGNORECASE),
                              (u'(?a)\xc0', regex.IGNORECASE),
                              (u'\w', regex.ASCII),
                              (u'(?a)\w', 0)):
            self.assertEqual(regex.compile(source, flags).match(u'\xe0'), None)

        # String (bytes) patterns are ASCII regardless of flags.
        for flags in (0, regex.ASCII):
            pat = regex.compile('\xc0', flags | regex.IGNORECASE)
            self.assertEqual(pat.match('\xe0'), None)
            pat = regex.compile('\w')
            self.assertEqual(pat.match('\xe0'), None)

        # ASCII and UNICODE together are contradictory.
        self.assertRaisesRegex(ValueError, self.MIXED_FLAGS, lambda:
          regex.compile('(?au)\w'))
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_bug_10328(self):
        # Issue 10328: under V0 only the first alternative fires on
        # 'foobar '; under V1 both named groups produce a substitution.
        body = r'(?P<trailing_ws>[ \t]+\r*$)|(?P<no_final_newline>(?<=[^\n])\Z)'
        expectations = [
            ('V0', ('foobar<trailing_ws>', 1)),
            ('V1', ('foobar<trailing_ws><no_final_newline>', 2)),
        ]
        for version, expected in expectations:
            pat = regex.compile('(?m' + version + ')' + body)
            self.assertEqual(pat.subn(lambda m: '<' + m.lastgroup + '>',
              'foobar '), expected)
            self.assertEqual([m.group() for m in pat.finditer('foobar ')], [' ',
              ''])
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_common_prefix(self):
        # Very long common prefix (also avoids shadowing builtin ``all``).
        alphabet = string.ascii_lowercase + string.digits + string.ascii_uppercase
        side = alphabet * 4
        compiled = regex.compile('(' + side + '|' + side + ')')
        self.assertEqual(repr(type(compiled)), self.PATTERN_CLASS)
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def pretranslate_dict_to_function(self, convert_dict):
        """Build a substitution function from a char->string mapping.

        Returns a callable replacing every key of ``convert_dict`` found
        in its input text with the mapped translation.
        """
        # add uppercase letters
        # (iterate a snapshot, since the dict is mutated in the loop)
        for letter, translation in list(convert_dict.items()):
            letter_upper = letter.upper()
            if letter_upper != letter and letter_upper not in convert_dict:
                convert_dict[letter_upper] = translation.capitalize()

        self.convert_dict = convert_dict
        # ``\L<options>`` is a ``regex``-module named list matching any
        # key of convert_dict.  NOTE(review): ``re`` must therefore be
        # the third-party ``regex`` module imported as ``re`` -- confirm.
        PRETRANSLATE = re.compile(u'(\L<options>)', options=convert_dict)

        # translate some letters before translating
        return lambda text: PRETRANSLATE.sub(lambda m: convert_dict[m.group(1)], text)
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def calc_unwanted_chars_re(self):
        """Precompile the patterns that strip unwanted chars (and words)."""
        # Anything that is neither alphanumeric nor explicitly safe.
        # NOTE(review): ``\p{AlNum}`` and ``\L<stop_words>`` are
        # ``regex``-module features, so ``re`` here must be the
        # third-party ``regex`` module -- confirm.
        unwanted_chars_re = u'[^\p{{AlNum}}{safe_chars}]+'.format(safe_chars=re.escape(self._safe_chars or ''))
        self.unwanted_chars_re = re.compile(unwanted_chars_re, re.IGNORECASE)

        if self._stop_words:
            # Additionally drop whole stop words that are not embedded
            # inside alphanumeric runs.
            unwanted_chars_and_words_re = unwanted_chars_re + u'|(?<!\p{AlNum})(?:\L<stop_words>)(?!\p{AlNum})'
            self.unwanted_chars_and_words_re = re.compile(unwanted_chars_and_words_re, re.IGNORECASE, stop_words=self._stop_words)
        else:
            self.unwanted_chars_and_words_re = None