Python regex module: search() example source code

The following 42 code examples, extracted from open-source Python projects, illustrate how to use regex.search().

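As a quick orientation before the project excerpts: regex.search() scans a string for the first position where the pattern matches and returns a match object, or None when nothing matches. A minimal, self-contained sketch (pattern and input are illustrative only):

import regex

# search() finds the first match anywhere in the string, or returns None.
m = regex.search(r'(?P<word>\w+)-(?P<num>\d+)', 'items: foo-42, bar-7')
if m:
    print(m.group('word'), m.group('num'))  # foo 42
    print(m.span())                         # (7, 13)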
Project: dprr-django    Author: kingsdigitallab
def parse_primary_source(text):
    """Given a primary source text reference, return the
    abbreviation of the primary source.
    """

    ref_regex = regex.compile(r"""
                (?P<psource>(\w+\.?\s?)+)
                (\(?\d\)\s?)+
                """, regex.VERBOSE)

    res = regex.search(ref_regex, text)

    if res and res.group('psource'):
        return res.group('psource')

    return None
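A hypothetical call, assuming the function above; 'Liv.' stands in for a primary-source abbreviation:

print(parse_primary_source('Liv. (3)'))  # -> 'Liv. ' (the trailing space comes from the pattern)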
Project: oa_qian    Author: sunqb
def __exit__(self, exc_type, exc_value, tb):
        if exc_type is None:
            try:
                exc_name = self.expected.__name__
            except AttributeError:
                exc_name = str(self.expected)
            raise self.failureException(
                "%s not raised" % exc_name)
        if not issubclass(exc_type, self.expected):
            # let unexpected exceptions pass through
            return False
        self.exception = exc_value # store for later retrieval
        if self.expected_regexp is None:
            return True

        expected_regexp = self.expected_regexp
        if isinstance(expected_regexp, basestring):
            expected_regexp = re.compile(expected_regexp)
        if not expected_regexp.search(str(exc_value)):
            raise self.failureException('"%s" does not match "%s"' %
                     (expected_regexp.pattern, str(exc_value)))
        return True
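This __exit__ is the context-manager side of a Python 2 assertRaisesRegexp helper; a minimal sketch of the test-side usage it supports (the test class and message are illustrative):

import unittest

class ExampleTest(unittest.TestCase):
    def test_error_message(self):
        # The context manager verifies both the exception type and that
        # str(exception) matches the expected regular expression.
        with self.assertRaisesRegexp(ValueError, r"bad .* value"):
            raise ValueError("bad input value")

if __name__ == '__main__':
    unittest.main()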
Project: oa_qian    Author: sunqb
def test_scoped_and_inline_flags(self):
        # Issues 433028, 433024, 433027.
        self.assertEqual(regex.search(r"(?i)Ab", "ab").span(), (0, 2))
        self.assertEqual(regex.search(r"(?i:A)b", "ab").span(), (0, 2))
        self.assertEqual(regex.search(r"A(?i)b", "ab").span(), (0, 2))
        self.assertEqual(regex.search(r"A(?iV1)b", "ab"), None)

        self.assertRaisesRegex(regex.error, self.CANT_TURN_OFF, lambda:
          regex.search(r"(?V0-i)Ab", "ab", flags=regex.I))

        self.assertEqual(regex.search(r"(?V0)Ab", "ab"), None)
        self.assertEqual(regex.search(r"(?V1)Ab", "ab"), None)
        self.assertEqual(regex.search(r"(?V1-i)Ab", "ab", flags=regex.I), None)
        self.assertEqual(regex.search(r"(?-i:A)b", "ab", flags=regex.I), None)
        self.assertEqual(regex.search(r"A(?V1-i)b", "ab",
          flags=regex.I).span(), (0, 2))
Project: oa_qian    Author: sunqb
def test_captures(self):
        self.assertEqual(regex.search(r"(\w)+", "abc").captures(1), ['a', 'b',
          'c'])
        self.assertEqual(regex.search(r"(\w{3})+", "abcdef").captures(0, 1),
          (['abcdef'], ['abc', 'def']))
        self.assertEqual(regex.search(r"^(\d{1,3})(?:\.(\d{1,3})){3}$",
          "192.168.0.1").captures(1, 2), (['192', ], ['168', '0', '1']))
        self.assertEqual(regex.match(r"^([0-9A-F]{2}){4} ([a-z]\d){5}$",
          "3FB52A0C a2c4g3k9d3").captures(1, 2), (['3F', 'B5', '2A', '0C'],
          ['a2', 'c4', 'g3', 'k9', 'd3']))
        self.assertEqual(regex.match("([a-z]W)([a-z]X)+([a-z]Y)",
          "aWbXcXdXeXfY").captures(1, 2, 3), (['aW'], ['bX', 'cX', 'dX', 'eX'],
          ['fY']))

        self.assertEqual(regex.search(r".*?(?=(.)+)b", "ab").captures(1),
          ['b'])
        self.assertEqual(regex.search(r".*?(?>(.){0,2})d", "abcd").captures(1),
          ['b', 'c'])
        self.assertEqual(regex.search(r"(.)+", "a").captures(1), ['a'])
Project: chicksexer    Author: kensk8er
def encode(self, names):
        """
        Encode list of names into list of list of character IDs using the character encoder.

        :param names: list of names
        :return: list (each name) of list (each word) of character IDs
        """
        name_id2word_id2char_ids = list()
        for name in names:
            name = self._clean_characters(name)
            word_id2char_ids = list()

            for word in name.split(self._separator):
                word = '{}{}{}'.format(self._start_char, word, self._end_char)
                try:
                    word_id2char_ids.append(self._label_encoder.transform(list(word)).tolist())
                except ValueError as exception:
                    unseen_chars = regex.search(
                        r'y contains new labels: (.*)$', exception.args[0]).groups()[0]
                    raise UnseenCharacterException('Unseen characters: {}'.format(unseen_chars))

            name_id2word_id2char_ids.append(word_id2char_ids)

        return name_id2word_id2char_ids
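The ValueError parsed in the except branch is the one scikit-learn's LabelEncoder raises for unseen labels; a sketch of just that parse (the message format follows older sklearn releases, which is an assumption):

import regex

message = "y contains new labels: ['q' 'z']"
unseen_chars = regex.search(
    r'y contains new labels: (.*)$', message).groups()[0]
print(unseen_chars)  # ['q' 'z']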
Project: chicksexer    Author: kensk8er
def compute_gender_probas(dir_path, start_year):
    year_prefix = 'yob'
    name2gender2count = defaultdict(lambda: defaultdict(int))
    for file_path in glob(os.path.join(dir_path, '*.txt')):
        year = int(regex.search(r'/{}(\d\d\d\d)'.format(year_prefix), file_path).groups()[0])
        if year < start_year:
            continue

        with open(file_path, encoding='utf8') as file_:
            csv_reader = csv.reader(file_)
            for name, gender, count in csv_reader:
                name2gender2count[name][_CLASS_MAP[gender]] += int(count)

    name2proba = dict()
    for name, gender2count in name2gender2count.items():
        name2proba[name] = float(gender2count[POSITIVE_CLASS]) / (gender2count[POSITIVE_CLASS] +
                                                                  gender2count[NEGATIVE_CLASS])
    return name2proba
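The yobNNNN.txt files read here follow the US SSA baby-names layout, one name,gender,count row per line. A sketch of the probability computed for one name; the class constants are assumptions standing in for the project's real ones:

gender2count = {'M': 1000, 'F': 3000}  # hypothetical aggregated counts

# Assuming POSITIVE_CLASS == 'M' and NEGATIVE_CLASS == 'F':
proba = float(gender2count['M']) / (gender2count['M'] + gender2count['F'])
print(proba)  # 0.25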
Project: backrefs    Author: facelessuser
def apply(self):
            """Apply search template."""

            i = RegexSearchTokens(self.search, self.verbose)
            iter(i)

            for t in i:
                if len(t) > 1:
                    # handle our stuff

                    c = t[1:]

                    if c[0:1] in self._verbose_tokens:
                        self.extended.append(t)
                    elif c == self._quote:
                        self.extended.extend(self.quoted(i))
                    elif c != self._end:
                        self.extended.append(t)
                elif self.verbose and t == self._hashtag and not self.in_group(i.index - 1):
                    self.extended.append(t)
                    self.extended.extend(self.comments(i))
                else:
                    self.extended.append(t)

            return self._empty.join(self.extended)
Project: backrefs    Author: facelessuser
def _apply_search_backrefs(pattern, flags=0):
        """Apply the search backrefs to the search pattern."""

        if isinstance(pattern, (compat.string_type, compat.binary_type)):
            re_verbose = VERBOSE & flags
            if flags & V0:
                re_version = V0
            elif flags & V1:
                re_version = V1
            else:
                re_version = 0
            pattern = RegexSearchTemplate(pattern, re_verbose, re_version).apply()
        elif isinstance(pattern, REGEX_TYPE):
            if flags:
                raise ValueError("Cannot process flags argument with a compiled pattern!")
        else:
            raise TypeError("Not a string or compiled pattern!")
        return pattern
Project: epitran    Author: dmort27
def _read_rule(self, i, line):
        line = line.strip()
        if line:
            line = unicodedata.normalize('NFC', unicodedata.normalize('NFD', line))
            s = re.match(r'(?P<symbol>::\w+::)\s*=\s*(?P<value>.+)', line)
            if s:
                self.symbols[s.group('symbol')] = s.group('value')
            else:
                line = self._sub_symbols(line)
                r = re.match(r'(\S+)\s*->\s*(\S+)\s*/\s*(\S*)\s*[_]\s*(\S*)', line)
                try:
                    a, b, X, Y = r.groups()
                except AttributeError:
                    raise DatafileError('Line {}: "{}" cannot be parsed.'.format(i + 1, line))
                X, Y = X.replace('#', '^'), Y.replace('#', '$')
                a, b = a.replace('0', ''), b.replace('0', '')
                try:
                    if re.search(r'[?]P[<]sw1[>].+[?]P[<]sw2[>]', a):
                        return self._fields_to_function_metathesis(a, X, Y)
                    else:
                        return self._fields_to_function(a, b, X, Y)
                except Exception as e:
                    raise DatafileError('Line {}: "{}" cannot be compiled as regex: ?{}'.format(i + 1, line, e))
Project: joyodb    Author: leoboiko
def is_ichidan_verb(kanji, canonical_reading):
    """

    >>> is_ichidan_verb('?', '???')
    True

    >>> is_ichidan_verb('?', '??')
    False

    >>> is_ichidan_verb('?', '??')
    True

    >>> is_ichidan_verb('?', '??')
    False
    """

    if kanji in ICHIDAN_EXCEPTIONS:
        return False
    elif re.search(ICHIDAN_BASE_ENDING + 'る$', canonical_reading):
        return True
    else:
        return False
Project: pyq    Author: caioariede
def parse(cls, string):
        selectors = []

        combinator = None
        prev_selector = None

        while True:
            match = regex.search(cls.RE.comma, string)
            if match:
                # skip comma
                _, pos = match.span()
                string = string[pos:]
                continue

            match = regex.search(cls.RE.combinator, string)
            if match:
                _, pos = match.span()
                combinator = string[:pos].strip()
                string = string[pos:]
            else:
                combinator = None

            match = regex.search(cls.RE.selector, string)
            if match:
                _, pos = match.span()
                seltext = string[:pos]
                string = string[pos:]
                selector = cls(seltext, combinator=combinator)
                if combinator is not None and prev_selector:
                    prev_selector.next_selector = prev_selector = selector
                else:
                    prev_selector = selector
                    selectors.append(selector)
                continue

            break

        return selectors
Project: oa_qian    Author: sunqb
def test_search_star_plus(self):
        self.assertEqual(regex.search('a*', 'xxx').span(0), (0, 0))
        self.assertEqual(regex.search('x*', 'axx').span(), (0, 0))
        self.assertEqual(regex.search('x+', 'axx').span(0), (1, 3))
        self.assertEqual(regex.search('x+', 'axx').span(), (1, 3))
        self.assertEqual(regex.search('x', 'aaa'), None)
        self.assertEqual(regex.match('a*', 'xxx').span(0), (0, 0))
        self.assertEqual(regex.match('a*', 'xxx').span(), (0, 0))
        self.assertEqual(regex.match('x*', 'xxxa').span(0), (0, 3))
        self.assertEqual(regex.match('x*', 'xxxa').span(), (0, 3))
        self.assertEqual(regex.match('a+', 'xxx'), None)
Project: oa_qian    Author: sunqb
def test_bug_1661(self):
        # Verify that flags do not get silently ignored with compiled patterns
        pattern = regex.compile('.')
        self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
          lambda: regex.match(pattern, 'A', regex.I))
        self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
          lambda: regex.search(pattern, 'A', regex.I))
        self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
          lambda: regex.findall(pattern, 'A', regex.I))
        self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
          lambda: regex.compile(pattern, regex.I))
Project: oa_qian    Author: sunqb
def test_bug_14462(self):
        # chr(255) is not a valid identifier in Python 2.
        group_name = u'\xFF'
        self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda:
          regex.search(ur'(?P<' + group_name + '>a)', u'a'))
Project: oa_qian    Author: sunqb
def test_getattr(self):
        self.assertEqual(regex.compile("(?i)(a)(b)").pattern, '(?i)(a)(b)')
        self.assertEqual(regex.compile("(?i)(a)(b)").flags, regex.A | regex.I |
          regex.DEFAULT_VERSION)
        self.assertEqual(regex.compile(u"(?i)(a)(b)").flags, regex.I | regex.U
          | regex.DEFAULT_VERSION)
        self.assertEqual(regex.compile("(?i)(a)(b)").groups, 2)
        self.assertEqual(regex.compile("(?i)(a)(b)").groupindex, {})

        self.assertEqual(regex.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
          {'first': 1, 'other': 2})

        self.assertEqual(regex.match("(a)", "a").pos, 0)
        self.assertEqual(regex.match("(a)", "a").endpos, 1)

        self.assertEqual(regex.search("b(c)", "abcdef").pos, 0)
        self.assertEqual(regex.search("b(c)", "abcdef").endpos, 6)
        self.assertEqual(regex.search("b(c)", "abcdef").span(), (1, 3))
        self.assertEqual(regex.search("b(c)", "abcdef").span(1), (2, 3))

        self.assertEqual(regex.match("(a)", "a").string, 'a')
        self.assertEqual(regex.match("(a)", "a").regs, ((0, 1), (0, 1)))
        self.assertEqual(repr(type(regex.match("(a)", "a").re)),
          self.PATTERN_CLASS)

        # Issue 14260.
        p = regex.compile(r'abc(?P<n>def)')
        p.groupindex["n"] = 0
        self.assertEqual(p.groupindex["n"], 1)
Project: oa_qian    Author: sunqb
def test_not_literal(self):
        self.assertEqual(regex.search(r"\s([^a])", " b")[1], 'b')
        self.assertEqual(regex.search(r"\s([^a]*)", " bb")[1], 'bb')
Project: oa_qian    Author: sunqb
def test_search_coverage(self):
        self.assertEqual(regex.search(r"\s(b)", " b")[1], 'b')
        self.assertEqual(regex.search(r"a\s", "a ")[0], 'a ')
Project: oa_qian    Author: sunqb
def test_bug_418626(self):
        # Bugs 418626 et al. -- Testing Greg Chapman's addition of op code
        # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
        # pattern '*?' on a long string.
        self.assertEqual(regex.match('.*?c', 10000 * 'ab' + 'cd').end(0),
          20001)
        self.assertEqual(regex.match('.*?cd', 5000 * 'ab' + 'c' + 5000 * 'ab' +
          'cde').end(0), 20003)
        self.assertEqual(regex.match('.*?cd', 20000 * 'abc' + 'de').end(0),
          60001)
        # Non-simple '*?' still used to hit the recursion limit, before the
        # non-recursive scheme was implemented.
        self.assertEqual(regex.search('(a|b)*?c', 10000 * 'ab' + 'cd').end(0),
          20001)
Project: oa_qian    Author: sunqb
def test_bug_581080(self):
        it = regex.finditer(r"\s", "a b")
        self.assertEqual(it.next().span(), (1, 2))
        self.assertRaises(StopIteration, lambda: it.next())

        scanner = regex.compile(r"\s").scanner("a b")
        self.assertEqual(scanner.search().span(), (1, 2))
        self.assertEqual(scanner.search(), None)
Project: oa_qian    Author: sunqb
def test_atomic(self):
        # Issue 433030.
        self.assertEqual(regex.search(r"(?>a*)a", "aa"), None)
Project: oa_qian    Author: sunqb
def test_repeated_repeats(self):
        # Issue 2537.
        self.assertEqual(regex.search(r"(?:a+)+", "aaa").span(), (0, 3))
        self.assertEqual(regex.search(r"(?:(?:ab)+c)+", "abcabc").span(), (0,
          6))
Project: oa_qian    Author: sunqb
def test_line_boundary(self):
        self.assertEqual(regex.findall(r".+", "Line 1\nLine 2\n"), ["Line 1",
          "Line 2"])
        self.assertEqual(regex.findall(r".+", "Line 1\rLine 2\r"),
          ["Line 1\rLine 2\r"])
        self.assertEqual(regex.findall(r".+", "Line 1\r\nLine 2\r\n"),
          ["Line 1\r", "Line 2\r"])
        self.assertEqual(regex.findall(r"(?w).+", "Line 1\nLine 2\n"),
          ["Line 1", "Line 2"])
        self.assertEqual(regex.findall(r"(?w).+", "Line 1\rLine 2\r"),
          ["Line 1", "Line 2"])
        self.assertEqual(regex.findall(r"(?w).+", "Line 1\r\nLine 2\r\n"),
          ["Line 1", "Line 2"])

        self.assertEqual(regex.search(r"^abc", "abc").start(), 0)
        self.assertEqual(regex.search(r"^abc", "\nabc"), None)
        self.assertEqual(regex.search(r"^abc", "\rabc"), None)
        self.assertEqual(regex.search(r"(?w)^abc", "abc").start(), 0)
        self.assertEqual(regex.search(r"(?w)^abc", "\nabc"), None)
        self.assertEqual(regex.search(r"(?w)^abc", "\rabc"), None)

        self.assertEqual(regex.search(r"abc$", "abc").start(), 0)
        self.assertEqual(regex.search(r"abc$", "abc\n").start(), 0)
        self.assertEqual(regex.search(r"abc$", "abc\r"), None)
        self.assertEqual(regex.search(r"(?w)abc$", "abc").start(), 0)
        self.assertEqual(regex.search(r"(?w)abc$", "abc\n").start(), 0)
        self.assertEqual(regex.search(r"(?w)abc$", "abc\r").start(), 0)

        self.assertEqual(regex.search(r"(?m)^abc", "abc").start(), 0)
        self.assertEqual(regex.search(r"(?m)^abc", "\nabc").start(), 1)
        self.assertEqual(regex.search(r"(?m)^abc", "\rabc"), None)
        self.assertEqual(regex.search(r"(?mw)^abc", "abc").start(), 0)
        self.assertEqual(regex.search(r"(?mw)^abc", "\nabc").start(), 1)
        self.assertEqual(regex.search(r"(?mw)^abc", "\rabc").start(), 1)

        self.assertEqual(regex.search(r"(?m)abc$", "abc").start(), 0)
        self.assertEqual(regex.search(r"(?m)abc$", "abc\n").start(), 0)
        self.assertEqual(regex.search(r"(?m)abc$", "abc\r"), None)
        self.assertEqual(regex.search(r"(?mw)abc$", "abc").start(), 0)
        self.assertEqual(regex.search(r"(?mw)abc$", "abc\n").start(), 0)
        self.assertEqual(regex.search(r"(?mw)abc$", "abc\r").start(), 0)
Project: chicksexer    Author: kensk8er
def _filter(names, predictions, return_proba):
    """Filter bad results."""
    neutral_pred = {POSITIVE_CLASS: 0.5, NEGATIVE_CLASS: 0.5} if return_proba else NEUTRAL_CLASS

    for name_id, name in enumerate(names):
        if not regex.search(r'\w', name):
            predictions[name_id] = copy(neutral_pred)

    return predictions
Project: backrefs    Author: facelessuser
def compile_search(pattern, flags=0, **kwargs):
        """Compile with extended search references."""

        return regex.compile(_apply_search_backrefs(pattern, flags), flags, **kwargs)
Project: backrefs    Author: facelessuser
def search(pattern, string, flags=0, pos=None, endpos=None, partial=False, concurrent=None, **kwargs):
        """Wrapper for `search`."""

        return regex.search(
            _apply_search_backrefs(pattern, flags), string,
            flags, pos, endpos, partial, concurrent, **kwargs
        )
Project: BarcSeek    Author: NCBI-Hackathons
def expand_iupac(barcode):
    '''
    Expand IUPAC codes, i.e. turn 'AY' into ['AC', 'AT']; 'N's are removed.
    '''
    barcode = barcode.upper()
    if all((i in 'ACGTN' for i in set(barcode))):
        return barcode.replace('N','')
    else:
        pos = regex.search(r'[%s]' % ''.join(IUPAC_CODES.keys()), barcode).start()
        code = barcode[pos]
        return (expand_iupac(barcode.replace(code, i, 1)) for i in IUPAC_CODES[code])
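A sketch of how the generator expands ambiguity codes, assuming the function above (plus import regex) and a small stand-in for the module's IUPAC_CODES table:

IUPAC_CODES = {'R': 'AG', 'Y': 'CT', 'W': 'AT'}  # assumed subset

# 'AY' contains the ambiguity code Y (= C or T), so it expands two ways;
# plain A/C/G/T barcodes pass through with any 'N's stripped.
print(list(expand_iupac('AY')))  # ['AC', 'AT']
print(expand_iupac('ACGTN'))     # 'ACGT'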
Project: gcdt    Author: glomex
def test_list_cmd(awsclient, capsys):
    tooldata = get_tooldata(
        awsclient, 'kumo', 'list',
        config_base_name='gcdt_large',
        location=here('./resources/simple_cloudformation_stack/'))
    list_cmd(**tooldata)
    out, err = capsys.readouterr()
    # using regular expression search in captured output
    assert regex.search(r'listed \d+ stacks', out) is not None
Project: wikt2pron    Author: abuccts
def tone_determ(text):
    text = unicodedata.normalize("NFD", text)
    match = re.search(tones, text)
    if match and match.group() in pinyin_tone.keys():
        return pinyin_tone[match.group()]
    return "5"
Project: wikt2pron    Author: abuccts
def pinyin_transform(text):
    if re.search("?", text):
        return ""
    text = re.sub(
        unicodedata.normalize("NFD", "ü"),
        "ü",
        re.sub(
            unicodedata.normalize("NFD", "ê"),
            "ê",
            unicodedata.normalize("NFD", text)
        )
    )
    if re.search(
            "[aeiouêü]" + tones + "[aeiou]?[aeiouêü]" + tones + "",
            text.lower()):
        return ""
    text = text.lower()
    if not re.search(tones, text) and re.match("[1-5]", text):
        return re.sub(r"(\d)(\p{Ll})", r"\1 \2", text)
    if re.search("[一不,.?]", text):
        text = re.sub(
            "([一不])$",
            lambda x: " yī" if x.group() == "一" else " bù",
            text
        )
        text = re.sub("([一不])", r" \1 ", text)
        text = re.sub("([,.?])", r" \1 ", text)
        text = re.sub(" +", " ", text)
        text = re.sub("^ ", "", text)
        text = re.sub(" $", "", text)
        text = re.sub("\. \. \.", "...", text)
    text = re.sub("['\-]", " ", text)
    text = re.sub(
        "([aeiouêü]" + tones + "?n?g?r?)([bpmfdtnlgkhjqxzcsywr]h?)",
        r"\1 \2",
        text
    )
    text = re.sub(" ([grn])$", r"\1", text)
    text = re.sub(" ([grn]) ", r"\1 ", text)

    return unicodedata.normalize("NFC", text)
Project: wikt2pron    Author: abuccts
def is_monosyllabic(word):
    return not re.search("[" + vowels + "].*[" + vowels + "]", word)

# Apply transformations to the Cyrillic to more closely match pronunciation.
# Return two arguments: the "original" text (after decomposing composed
# grave characters), and the transformed text. If the two are different,
# {{ru-IPA}} should display a "phonetic respelling" notation. 
# NOADJ disables special-casing for adjectives in -го, while FORCEADJ forces
# special-casing for adjectives, including those in -аго (pre-reform spelling)
# and disables checking for exceptions (e.g. много, ого). NOSHTO disables
# special-casing for что and related words.
Project: epitran    Author: dmort27
def _sub_symbols(self, line):
        while re.search(r'::\w+::', line):
            s = re.search(r'::\w+::', line).group(0)
            if s in self.symbols:
                line = line.replace(s, self.symbols[s])
            else:
                raise RuleFileError('Undefined symbol: {}'.format(s))
        return line
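A self-contained re-run of the same substitution loop with a hypothetical symbol table, showing what _sub_symbols does to a rule line:

import re

symbols = {'::vowel::': '[aeiou]'}  # hypothetical symbol table
line = '::vowel:: -> 0 / _ #'
while re.search(r'::\w+::', line):
    s = re.search(r'::\w+::', line).group(0)
    line = line.replace(s, symbols[s])
print(line)  # [aeiou] -> 0 / _ #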
Project: memex-dossier-open    Author: dossier
def ids_and_clean_visible_from_streamcorpus_chunk_path(corpus_path):
    '''converts a streamcorpus.Chunk file into the structure that is
    passed by the search engine to find_soft_selectors

    '''
    ch = clean_html(clean_html.default_config)
    cv = clean_visible(clean_visible.default_config)
    ids_and_clean_visible = []
    for si in streamcorpus.Chunk(path=corpus_path):
        if not si.body.clean_visible:
            ## attempt to make clean_visible
            if not si.body.raw:
                logger.critical('no raw content, so skipping: %r', si.abs_url)
                continue
            abs_url = si.abs_url
            si = ch(si, {})
            if not si:
                logger.critical(
                    'failed to make clean_html, so skipping: %r', abs_url)
                continue
            si = cv(si, {})
            if not si or not si.body.clean_visible:
                logger.critical(
                    'failed to make clean_visible, so skipping: %r', abs_url)
                continue
        rec = (si.stream_id, si.body.clean_visible.decode('utf8'), {})
        ids_and_clean_visible.append(rec)
    return ids_and_clean_visible
Project: poirot    Author: emanuelfeld
def parse_arguments(args):
    query = ArgumentParser(prog="poirot", description="""Poirot: Mind Your Language""")
    query.add_argument("--url", "-u", dest="url", default="", action="store",
                       help="""The repository's git URL, e.g.
                               'https://github.com/dcgov/poirot.git'.""")
    query.add_argument("--dir", "-d", dest="dir", default=os.getcwd(),
                       help="""The path to the local directory where the
                               git repo is located or should be stored;
                               defaults to the current directory.""")
    query.add_argument("--term", "-t", dest="term", required=False, action="store",
                       help="""A single string or regular expression to search for.""")
    query.add_argument("--patterns", "-p", dest="patterns", action="store",
                       help="""The path to the local file(s) containing strings
                               or regular expressions to match against, each
                               on a new line. Accepts a comma-separated list
                               of file paths.""")
    query.add_argument("--output", "-o", dest="output", required=False,
                       help="""Output results as JSON to FILE.""")
    query.add_argument("--revlist", "-rl", dest="revlist", required=False, default="HEAD^!",
                       help="""A comma-delimited list of revision (commit)
                               ranges to search. Defaults to HEAD^!. Specify
                               'all' to search the entire revision history.""")
    query.add_argument("--before", "-b", dest="before", required=False,
                       help="""Search commits prior to a given date, e.g., Dec-12-2015""")
    query.add_argument("--after", "-a", dest="after", required=False,
                       help="""Search commits after a given date, e.g., Jan-01-2015""")
    query.add_argument("--author", "-au", dest="author", required=False,
                       help="""Restrict to commits made by an AUTHOR. An email
                               address is fine.""")
    query.add_argument("--staged", "-st", dest="staged", action="store_true",
                       help="""Flag to search staged modifications, instead of
                               already committed ones.""")
    query.add_argument("--verbose", "-v", dest="verbose", action="store_true",
                       help="""Flag to output colorful, verbose results.""")

    parsed_args = query.parse_args(args)
    formatted_args = format_arguments(parsed_args)

    return formatted_args
Project: poirot    Author: emanuelfeld
def parse_patterns(path):
    """
    Reads in patterns from pattern file at path
    """

    result = {}
    try:
        if regex.search(r"^http[s]://", path):
            response = requests.get(path)
            if response.status_code == 200:
                lines = response.text.split("\n")
            else:
                sys.exit(1)
        else:
            with open(path) as infile:
                lines = infile.readlines()
        label = None
        for line in lines:
            line = str(line).strip()
            if line.startswith("#"):
                label = line.lstrip("# ")
            elif not line:
                label = ""
            else:
                result[line] = label
    except:
        out = """Pattern file {file} does not exist.\n
                 Specify the correct path with --patterns""".format(file=path)
        print(style(out, "red"))
    return result
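From the loop above, a pattern file pairs each regex with the most recent # heading; a self-contained re-run on hypothetical file contents:

lines = ['# AWS keys', 'AKIA[0-9A-Z]{16}', '', '# Passwords', r'password\s*=']
result, label = {}, None
for line in lines:
    line = str(line).strip()
    if line.startswith('#'):
        label = line.lstrip('# ')
    elif not line:
        label = ''
    else:
        result[line] = label
print(result)  # {'AKIA[0-9A-Z]{16}': 'AWS keys', 'password\\s*=': 'Passwords'}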
Project: joyodb    Author: leoboiko
def test_okurigana_delimit(self):
        """Simple test to look for suspicious non-delimited readings."""

        for k in joyodb.loaded_data.kanjis:
            for r in filter(lambda r: r.kind == 'Kun', k.readings):
                examples = [e.example for e in r.examples]
                for e in examples:
                    match = re.search(k.kanji + r"(\p{Hiragana}+)", e)
                    if match and re.search(match[1] + '$', r.reading):
                        self.assertIn('.', r.reading)
Project: pyq    Author: caioariede
def __init__(self, name, combinator=None):
        self.name = name
        self.combinator = combinator
        self.next_selector = None

        selector_patterns = {
            'types': self.RE.type_selector,
            'ids': self.RE.id_selector,
            'classes': self.RE.class_selector,
            'pseudos': self.RE.pseudo_selector,
            'attrs': self.RE.attr_selector,
        }

        matches = {}

        while True:
            pattern_matched = False
            for key, pattern in selector_patterns.items():
                match = regex.search(r'^{}'.format(pattern), name)
                if match:
                    i, pos = match.span()
                    if key not in matches:
                        matches[key] = []
                    matches[key].append(match.groups())
                    name = name[pos:]
                    pattern_matched = True
            if not pattern_matched:
                break

        self.typ = None
        for types in matches.pop('types', []):
            self.typ = types[0]

        self.id_ = None
        for ids in matches.pop('ids', []):
            self.id_ = ids[0]

        self.classes = [a[0] for a in matches.pop('classes', [])]

        self.attrs = [
            Attr(l, o, r.strip())
            for l, o, r in matches.pop('attrs', [])
        ]
        self.pseudos = [
            Pseudo(*a[1:])
            for a in matches.pop('pseudos', [])
        ]
Project: oa_qian    Author: sunqb
def test_lookbehind(self):
        self.assertEqual(regex.search(r"123(?<=a\d+)", "a123").span(), (1, 4))
        self.assertEqual(regex.search(r"123(?<=a\d+)", "b123"), None)
        self.assertEqual(regex.search(r"123(?<!a\d+)", "a123"), None)
        self.assertEqual(regex.search(r"123(?<!a\d+)", "b123").span(), (1, 4))

        self.assertEqual(bool(regex.match("(a)b(?<=b)(c)", "abc")), True)
        self.assertEqual(regex.match("(a)b(?<=c)(c)", "abc"), None)
        self.assertEqual(bool(regex.match("(a)b(?=c)(c)", "abc")), True)
        self.assertEqual(regex.match("(a)b(?=b)(c)", "abc"), None)

        self.assertEqual(regex.match("(?:(a)|(x))b(?<=(?(2)x|c))c", "abc"),
          None)
        self.assertEqual(regex.match("(?:(a)|(x))b(?<=(?(2)b|x))c", "abc"),
          None)
        self.assertEqual(bool(regex.match("(?:(a)|(x))b(?<=(?(2)x|b))c",
          "abc")), True)
        self.assertEqual(regex.match("(?:(a)|(x))b(?<=(?(1)c|x))c", "abc"),
          None)
        self.assertEqual(bool(regex.match("(?:(a)|(x))b(?<=(?(1)b|x))c",
          "abc")), True)

        self.assertEqual(bool(regex.match("(?:(a)|(x))b(?=(?(2)x|c))c",
          "abc")), True)
        self.assertEqual(regex.match("(?:(a)|(x))b(?=(?(2)c|x))c", "abc"),
          None)
        self.assertEqual(bool(regex.match("(?:(a)|(x))b(?=(?(2)x|c))c",
          "abc")), True)
        self.assertEqual(regex.match("(?:(a)|(x))b(?=(?(1)b|x))c", "abc"),
          None)
        self.assertEqual(bool(regex.match("(?:(a)|(x))b(?=(?(1)c|x))c",
          "abc")), True)

        self.assertEqual(regex.match("(a)b(?<=(?(2)x|c))(c)", "abc"), None)
        self.assertEqual(regex.match("(a)b(?<=(?(2)b|x))(c)", "abc"), None)
        self.assertEqual(regex.match("(a)b(?<=(?(1)c|x))(c)", "abc"), None)
        self.assertEqual(bool(regex.match("(a)b(?<=(?(1)b|x))(c)", "abc")),
          True)

        self.assertEqual(bool(regex.match("(a)b(?=(?(2)x|c))(c)", "abc")),
          True)
        self.assertEqual(regex.match("(a)b(?=(?(2)b|x))(c)", "abc"), None)
        self.assertEqual(bool(regex.match("(a)b(?=(?(1)c|x))(c)", "abc")),
          True)

        self.assertEqual(repr(type(regex.compile(r"(a)\2(b)"))),
          self.PATTERN_CLASS)
Project: oa_qian    Author: sunqb
def test_named_lists(self):
        options = [u"one", u"two", u"three"]
        self.assertEqual(regex.match(ur"333\L<bar>444", u"333one444",
          bar=options).group(), u"333one444")
        self.assertEqual(regex.match(ur"(?i)333\L<bar>444", u"333TWO444",
          bar=options).group(), u"333TWO444")
        self.assertEqual(regex.match(ur"333\L<bar>444", u"333four444",
          bar=options), None)

        options = ["one", "two", "three"]
        self.assertEqual(regex.match(r"333\L<bar>444", "333one444",
          bar=options).group(), "333one444")
        self.assertEqual(regex.match(r"(?i)333\L<bar>444", "333TWO444",
          bar=options).group(), "333TWO444")
        self.assertEqual(regex.match(r"333\L<bar>444", "333four444",
          bar=options), None)

        self.assertEqual(repr(type(regex.compile(r"3\L<bar>4\L<bar>+5",
          bar=["one", "two", "three"]))), self.PATTERN_CLASS)

        self.assertEqual(regex.findall(r"^\L<options>", "solid QWERT",
          options=set(['good', 'brilliant', '+s\\ol[i}d'])), [])
        self.assertEqual(regex.findall(r"^\L<options>", "+solid QWERT",
          options=set(['good', 'brilliant', '+solid'])), ['+solid'])

        options = [u"STRASSE"]
        self.assertEqual(regex.match(ur"(?fiu)\L<words>",
          u"stra\N{LATIN SMALL LETTER SHARP S}e", words=options).span(), (0,
          6))

        options = [u"STRASSE", u"stress"]
        self.assertEqual(regex.match(ur"(?fiu)\L<words>",
          u"stra\N{LATIN SMALL LETTER SHARP S}e", words=options).span(), (0,
          6))

        options = [u"stra\N{LATIN SMALL LETTER SHARP S}e"]
        self.assertEqual(regex.match(ur"(?fiu)\L<words>", u"STRASSE",
          words=options).span(), (0, 7))

        options = ["kit"]
        self.assertEqual(regex.search(ur"(?iu)\L<words>", u"SKITS",
          words=options).span(), (1, 4))
        self.assertEqual(regex.search(ur"(?iu)\L<words>",
          u"SK\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}TS",
          words=options).span(), (1, 4))

        self.assertEqual(regex.search(ur"(?fiu)\b(\w+) +\1\b",
          u" stra\N{LATIN SMALL LETTER SHARP S}e STRASSE ").span(), (1, 15))
        self.assertEqual(regex.search(ur"(?fiu)\b(\w+) +\1\b",
          u" STRASSE stra\N{LATIN SMALL LETTER SHARP S}e ").span(), (1, 15))

        self.assertEqual(regex.search(r"^\L<options>$", "", options=[]).span(),
          (0, 0))
Project: turboparser-semafor    Author: ReutersMedia
def split_if_contraction(self, word):
        # Handle preposition+determiner contractions.
        word = regex.sub(ur'^(A|a)l$', ur'a el', word)
        word = regex.sub(ur'^(D|d)el$', ur'de el', word)

        # Before looking at clitic regexes, check if the word is in a blacklist.
        if word in self.non_contractions:
            return word

        # Before looking at clitic regexes, check if the word is in a whitelist.
        if word in self.contractions:
            return ' '.join(self.contractions[word])

        # Right now excludes capitalized words. Might fail if the word is in the
        # beginning of the sentences, but avoids catching a lot of proper nouns,
        # such as "Charles", "Bonaparte", etc.
        if regex.search(ur'^[^\p{IsLower}]', word) is not None:
            return word

        # Handle clitics.
        word = regex.sub( \
            ur'(ar|ir|ír)(me|te|se|nos|le|lo|la|les|los|las)$', \
            ur'\1 \2', word)
        word = regex.sub( \
            ur'(er)(se|le|lo|la|les|los|las)$', \
            ur'\1 \2', word)
        word = regex.sub( \
            ur'á(ndo)(me|te|se|nos|os|le|lo|la|les|los|las)$', \
            ur'a\1 \2', word)
        word = regex.sub( \
            ur'é(ndo)(me|te|se|nos|os|le|lo|la|les|los|las)$', \
                          ur'e\1 \2', word)
        word = regex.sub(ur'í(ndo)(me|te|se|nos|os|le|lo|la|les|los|las)$', \
                         ur'i\1 \2', word)
        word = regex.sub(ur'á(r|ndo)(se)(me|te|nos|os|le|lo|la|les|los|las)$', \
                         ur'a\1 \2 \3', word)
        word = regex.sub(ur'é(r|ndo)(se)(me|te|nos|os|le|lo|la|les|los|las)$', \
                         ur'e\1 \2 \3', word)
        word = regex.sub(ur'í(r|ndo)(se)(me|te|nos|os|le|lo|la|les|los|las)$', \
                         ur'i\1 \2 \3', word)
        word = regex.sub(ur'á(r)(os)(le|lo|la|les|los|las)$', \
                         ur'a\1 \2 \3', word)
        word = regex.sub(ur'é(r)(os)(le|lo|la|les|los|las)$', \
                         ur'e\1 \2 \3', word)
        word = regex.sub(ur'í(r)(os)(le|lo|la|les|los|las)$', \
                         ur'i\1 \2 \3', word)

        # In AnCora, all contractions have two words only.
        word = ' '.join(word.split(' ')[:2])
        return word
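A sketch re-running just the first clitic substitution on a hypothetical word; the method itself first consults the project's non_contractions blacklist and contractions whitelist:

# -*- coding: utf-8 -*-
import regex

word = u'comprarlo'
word = regex.sub(ur'(ar|ir|ír)(me|te|se|nos|le|lo|la|les|los|las)$',
                 ur'\1 \2', word)
print(word)  # comprar lo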
Project: SoMaJo    Author: tsproisl
def check_spaces(self, tokens, original_text):
        """Compare the tokens with the original text to see which tokens had
        trailing whitespace (to be able to annotate SpaceAfter=No) and
        which tokens contained internal whitespace (to be able to
        annotate OriginalSpelling="...").

        """
        extra_info = ["" for _ in tokens]
        normalized = self.spaces.sub(" ", original_text)
        normalized = self.junk_between_spaces.sub(" ", normalized)
        normalized = normalized.strip()
        for token_index, t in enumerate(tokens):
            original_spelling = None
            token = t.token
            token_length = len(token)
            if normalized.startswith(token):
                normalized = normalized[token_length:]
            else:
                orig = []
                for char in token:
                    first_char = None
                    while first_char != char:
                        try:
                            first_char = normalized[0]
                            normalized = normalized[1:]
                            orig.append(first_char)
                        except IndexError:
                            warnings.warn("IndexError in this paragraph: '%s'\nTokens: %s" % (original_text, tokens))
                original_spelling = "".join(orig)
            m = self.starts_with_junk.search(normalized)
            if m:
                if original_spelling is None:
                    original_spelling = token
                original_spelling += normalized[:m.end()]
                normalized = normalized[m.end():]
            if original_spelling is not None:
                extra_info[token_index] = 'OriginalSpelling="%s"' % original_spelling
            if len(normalized) > 0:
                if normalized.startswith(" "):
                    normalized = normalized[1:]
                else:
                    if len(extra_info[token_index]) > 0:
                        extra_info[token_index] = ", " + extra_info[token_index]
                    extra_info[token_index] = "SpaceAfter=No" + extra_info[token_index]
        try:
            assert len(normalized) == 0
        except AssertionError:
            warnings.warn("AssertionError in this paragraph: '%s'\nTokens: %s\nRemaining normalized text: '%s'" % (original_text, tokens, normalized))
        return extra_info
Project: backrefs    Author: facelessuser
def __init__(self, search, re_verbose=False, re_version=0):
            """Initialize."""

            if isinstance(search, compat.binary_type):
                self.binary = True
                tokens = btokens
                ctokens = ctok.btokens
            else:
                self.binary = False
                tokens = utokens
                ctokens = ctok.utokens

            self._verbose_flag = ctokens["verbose_flag"]
            self._empty = ctokens["empty"]
            self._b_slash = ctokens["b_slash"]
            self._ls_bracket = ctokens["ls_bracket"]
            self._rs_bracket = ctokens["rs_bracket"]
            self._esc_end = ctokens["esc_end"]
            self._end = ctokens["end"]
            self._quote = ctokens["quote"]
            self._negate = ctokens["negate"]
            self._regex_flags = tokens["regex_flags"]
            self._nl = ctokens["nl"]
            self._hashtag = ctokens["hashtag"]
            self._V0 = tokens["v0"]
            self._V1 = tokens["v1"]
            self.search = search
            if regex.DEFAULT_VERSION == V0:
                self.groups, quotes = self.find_char_groups_v0(search)
            else:  # pragma: no cover
                self.groups, quotes = self.find_char_groups_v1(search)
            self.verbose, self.version = self.find_flags(search, quotes, re_verbose, re_version)
            if self.version != regex.DEFAULT_VERSION:
                if self.version == V0:  # pragma: no cover
                    self.groups = self.find_char_groups_v0(search)[0]
                else:
                    self.groups = self.find_char_groups_v1(search)[0]
            if self.verbose:
                self._verbose_tokens = ctokens["verbose_tokens"]
            else:
                self._verbose_tokens = tuple()
            self.extended = []
Project: sceneTransitionNetMovieClassification    Author: daltonsi
def write_script(script, movie):
    script = script.split('\n')
    scenes = []
    characters = OrderedDict()
    in_scene = False
    first_scene = True
    scene_count = 1
    with open('./output/' + movie + '_results.txt', 'w') as f:
        final = []
        for line in script:
            result = re.search(r'((?:EXT|INT).+)', line)
            if result:
                if first_scene:
                    first_scene = False
                else:
                    if characters[scene]:
                        final.append(str(scene_count) + ':\t' + str(scene) + ':\t' + str(characters[scene]))
                        scene_count += 1
                    else:
                        final.append(str(scene_count) + ':\t' + str(scene) + ':\t' + 'None')
                        scene_count += 1
                in_scene = True
                scene = re.sub(r'\s+\d+', '', result.group(1))
                scene = re.sub(r'\r', '', scene)
                scenes.append(scene)
                characters[scene] = []

            elif in_scene:
                result2 = re.search(r'^\s+([A-Z]{2}.+)(?<![a-z]+)', line)
                if result2:
                    if '!' not in result2.group(1) and ',' not in result2.group(1) and ' ...' not in result2.group(1) \
                            and ' - ' not in result2.group(1) and ':' not in result2.group(1) and len(result2.group(1)) < 25 \
                            and 'FADE' not in result2.group(1) and 'THE END' not in result2.group(1):
                        character = re.sub(r'^\s+', '', result2.group(1))
                        character = re.sub(r'\r', '', character)
                        characters[scene].append(character)
        if characters[scene]:
            final.append(str(scene_count) + ':\t' + str(scene) + ':\t' + str(characters[scene]))
            scene_count += 1
        else:
            final.append(str(scene_count) + ':\t' + str(scene) + ':\t' + 'None')
            scene_count += 1
        for line in final:
            f.write(line + '\n')