Python regex 模块,split() 实例源码

我们从Python开源项目中,提取了以下21个代码示例,用于说明如何使用regex.split()

项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_word_boundary(self):
        # Python 2 code (u''/ur'' literals) exercising the third-party
        # `regex` module, not stdlib `re`.
        # (?V1) selects VERSION1 behaviour, which allows splitting on the
        # zero-width \b boundary; adding the `w` flag enables Unicode
        # "default word boundaries" (UTS #29), which treat "can't" and
        # "32.3" as single words and emit boundaries around punctuation.
        text = u'The quick ("brown") fox can\'t jump 32.3 feet, right?'
        self.assertEqual(regex.split(ur'(?V1)\b', text), [u'', u'The', u' ',
          u'quick', u' ("', u'brown', u'") ', u'fox', u' ', u'can', u"'", u't',
          u' ', u'jump', u' ', u'32', u'.', u'3', u' ', u'feet', u', ',
          u'right', u'?'])
        self.assertEqual(regex.split(ur'(?V1w)\b', text), [u'', u'The', u' ',
          u'quick', u' ', u'(', u'"', u'brown', u'"', u')', u' ', u'fox', u' ',
          u"can't", u' ', u'jump', u' ', u'32.3', u' ', u'feet', u',', u' ',
          u'right', u'?', u''])

        # With `w`, each space in a run of whitespace is its own segment.
        text = u"The  fox"
        self.assertEqual(regex.split(ur'(?V1)\b', text), [u'', u'The', u'  ',
          u'fox', u''])
        self.assertEqual(regex.split(ur'(?V1w)\b', text), [u'', u'The', u' ',
          u' ', u'fox', u''])

        # Apostrophes inside words: default \b splits around them, Unicode
        # word boundaries keep contractions such as "aujourd'hui" intact.
        text = u"can't aujourd'hui l'objectif"
        self.assertEqual(regex.split(ur'(?V1)\b', text), [u'', u'can', u"'",
          u't', u' ', u'aujourd', u"'", u'hui', u' ', u'l', u"'", u'objectif',
          u''])
        self.assertEqual(regex.split(ur'(?V1w)\b', text), [u'', u"can't", u' ',
          u"aujourd'hui", u' ', u"l'", u'objectif', u''])
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_turkic(self):
        # Turkish has dotted and dotless I/i.
        # Build the set of (ch1, ch2) pairs that SHOULD match under
        # case-insensitive comparison: every character matches itself and
        # its declared partner(s) from `pairs`. Python 2 u''/ur'' literals;
        # exercises the third-party `regex` module.
        pairs = u"I=i;I=\u0131;i=\u0130"

        all_chars = set()
        matching = set()
        for pair in pairs.split(";"):
            ch1, ch2 = pair.split("=")
            all_chars.update((ch1, ch2))
            matching.add((ch1, ch1))
            matching.add((ch1, ch2))
            matching.add((ch2, ch1))
            matching.add((ch2, ch2))

        # Check the full cross product: (?iu) = case-insensitive + Unicode;
        # \A...\Z anchors so exactly the single character must match.
        for ch1 in all_chars:
            for ch2 in all_chars:
                m = regex.match(ur"(?iu)\A" + ch1 + ur"\Z", ch2)
                if m:
                    if (ch1, ch2) not in matching:
                        self.fail("%s matching %s" % (repr(ch1), repr(ch2)))
                else:
                    if (ch1, ch2) in matching:
                        self.fail("%s not matching %s" % (repr(ch1),
                          repr(ch2)))
项目:DrQA    作者:facebookresearch    | 项目源码 | 文件源码
def search_docs(inputs, max_ex=5, opts=None):
    """Given a set of document ids (returned by ranking for a question), search
    for top N best matching (by heuristic) paragraphs that contain the answer.

    :param inputs: (doc_ids, q_tokens, answer) triple for one question
    :param max_ex: maximum number of examples to keep
    :param opts: options dict forwarded to find_answer; required
    :return: list of at most `max_ex` matched examples
    """
    if not opts:
        raise RuntimeError('Options dict must be supplied.')

    doc_ids, q_tokens, answer = inputs
    # Bounded min-heap of (score, example): once full, the lowest-scoring
    # entry is evicted on every new push.
    heap = []
    for doc_rank, doc_id in enumerate(doc_ids):
        # Paragraphs are runs of text separated by one or more newlines.
        paragraphs = re.split(r'\n+', fetch_text(doc_id))
        for par_rank, paragraph in enumerate(paragraphs):
            found = find_answer(paragraph, q_tokens, answer, opts)
            if not found:
                continue
            # Negated ranks give priority to earlier docs and paragraphs
            # on score ties; random.random() breaks exact ties.
            score = (found[0], -doc_rank, -par_rank, random.random())
            entry = (score, found[1])
            if len(heap) < max_ex:
                heapq.heappush(heap, entry)
            else:
                heapq.heappushpop(heap, entry)
    return [example for _, example in heap]
项目:DrQA    作者:facebookresearch    | 项目源码 | 文件源码
def _split_doc(self, doc):
        """Given a doc, split it into chunks (by paragraph).

        Consecutive non-empty paragraphs are greedily grouped so each
        yielded chunk stays within self.GROUP_LENGTH characters where
        possible; a single over-long paragraph is still yielded whole.
        """
        chunk = []
        chunk_len = 0
        for piece in regex.split(r'\n+', doc):
            piece = piece.strip()
            if not piece:
                continue
            # Flush the current chunk before it would exceed the limit.
            if chunk and chunk_len + len(piece) > self.GROUP_LENGTH:
                yield ' '.join(chunk)
                chunk, chunk_len = [], 0
            chunk.append(piece)
            chunk_len += len(piece)
        # Emit whatever is left over.
        if chunk:
            yield ' '.join(chunk)
项目:DrQA_cn    作者:AmoseKang    | 项目源码 | 文件源码
def search_docs(inputs, max_ex=5, opts=None):
    """Given a set of document ids (returned by ranking for a question), search
    for top N best matching (by heuristic) paragraphs that contain the answer.

    :param inputs: (doc_ids, q_tokens, answer) triple for one question
    :param max_ex: maximum number of examples to keep (bounded heap size)
    :param opts: options dict forwarded to find_answer; required
    :return: list of at most `max_ex` found examples (heap order, unsorted)
    """
    if not opts:
        raise RuntimeError('Options dict must be supplied.')

    doc_ids, q_tokens, answer = inputs
    examples = []
    for i, doc_id in enumerate(doc_ids):
        # Paragraphs are runs of text separated by one or more newlines.
        for j, paragraph in enumerate(re.split(r'\n+', fetch_text(doc_id))):
            found = find_answer(paragraph, q_tokens, answer, opts)
            if found:
                # Reverse ranking, giving priority to early docs + paragraphs
                score = (found[0], -i, -j, random.random())
                # Keep only the best `max_ex` entries: push until full, then
                # push-and-pop so the lowest-scoring entry is evicted.
                if len(examples) < max_ex:
                    heapq.heappush(examples, (score, found[1]))
                else:
                    heapq.heappushpop(examples, (score, found[1]))
    return [e[1] for e in examples]
项目:DrQA_cn    作者:AmoseKang    | 项目源码 | 文件源码
def _split_doc(self, doc):
        """Given a doc, split it into chunks (by paragraph).

        Newline-separated paragraphs are greedily grouped so each yielded
        chunk stays within self.GROUP_LENGTH characters where possible; a
        single over-long paragraph is still yielded whole.
        """
        curr = []
        curr_len = 0
        for split in regex.split(r'\n+', doc):
            split = split.strip()
            if len(split) == 0:
                continue
            # Maybe group paragraphs together until we hit a length limit
            if len(curr) > 0 and curr_len + len(split) > self.GROUP_LENGTH:
                yield ' '.join(curr)
                curr = []
                curr_len = 0
            curr.append(split)
            curr_len += len(split)
        # Flush the final, partially-filled chunk.
        if len(curr) > 0:
            yield ' '.join(curr)
项目:Ossian    作者:CSTR-Edinburgh    | 项目源码 | 文件源码
def _process_text_line(self, text):
        """Tokenise one line of text and normalise it to a single
        space-separated, lower-cased string.
        """
        # Split with the configured pattern, dropping empty matches.
        tokens = [t for t in new_regex.split(self.tokenisation_pattern, text)
                  if t != '']

        if self.replace_whitespace:
            # Map every all-whitespace token to the configured placeholder.
            tokens = [self.replace_whitespace if t.isspace() else t
                      for t in tokens]

        # Strip stray spaces, drop now-empty tokens (prevents multiple
        # spaces in the output), and lowercase everything.
        cleaned = []
        for t in tokens:
            t = t.strip(u' ')
            if t != u'':
                cleaned.append(t.lower())
        return ' '.join(cleaned)
项目:StrepHit    作者:Wikidata    | 项目源码 | 文件源码
def tokenize(self, sentence):
        """ Tokenize the given sentence.
            You can also pass a generic text, but you will lose the sentence segmentation.

            :param str sentence: a natural language sentence or text to be tokenized
            :return: the list of tokens
            :rtype: list
        """
        # Python 2 code: unicode() coerces the input before splitting with
        # this instance's pattern via the third-party `regex` module.
        tokens = regex.split(self.tokenization_regex, unicode(sentence))
        logger.debug("'%s' tokenized into %s using regex %s" % (sentence, tokens, self.tokenization_regex))
        # Skip empty tokens
        return [token for token in tokens if token]
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_re_split(self):
        # Splitting with capturing groups keeps the captured separators in
        # the result list; non-capturing groups do not. Exercises the
        # third-party `regex` module.
        self.assertEqual(regex.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
        self.assertEqual(regex.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
        self.assertEqual(regex.split("(:*)", ":a:b::c"), ['', ':', 'a', ':',
          'b', '::', 'c'])
        self.assertEqual(regex.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
        self.assertEqual(regex.split("(:)*", ":a:b::c"), ['', ':', 'a', ':',
          'b', ':', 'c'])
        self.assertEqual(regex.split("([b:]+)", ":a:b::c"), ['', ':', 'a',
          ':b::', 'c'])
        # With alternated groups, the non-matching group contributes None.
        self.assertEqual(regex.split("(b)|(:+)", ":a:b::c"), ['', None, ':',
          'a', None, ':', '', 'b', None, '', None, '::', 'c'])
        self.assertEqual(regex.split("(?:b)|(?::+)", ":a:b::c"), ['', 'a', '',
          '', 'c'])

        # splititer yields the same items as split, lazily.
        self.assertEqual(regex.split("x", "xaxbxc"), ['', 'a', 'b', 'c'])
        self.assertEqual([m for m in regex.splititer("x", "xaxbxc")], ['', 'a',
          'b', 'c'])

        # (?r) splits from right to left.
        self.assertEqual(regex.split("(?r)x", "xaxbxc"), ['c', 'b', 'a', ''])
        self.assertEqual([m for m in regex.splititer("(?r)x", "xaxbxc")], ['c',
          'b', 'a', ''])

        self.assertEqual(regex.split("(x)|(y)", "xaxbxc"), ['', 'x', None, 'a',
          'x', None, 'b', 'x', None, 'c'])
        self.assertEqual([m for m in regex.splititer("(x)|(y)", "xaxbxc")],
          ['', 'x', None, 'a', 'x', None, 'b', 'x', None, 'c'])

        self.assertEqual(regex.split("(?r)(x)|(y)", "xaxbxc"), ['c', 'x', None,
          'b', 'x', None, 'a', 'x', None, ''])
        self.assertEqual([m for m in regex.splititer("(?r)(x)|(y)", "xaxbxc")],
          ['c', 'x', None, 'b', 'x', None, 'a', 'x', None, ''])

        # (?V1) allows zero-width splits: \b is a word boundary, while \m
        # and \M are regex-module extensions for start-of-word and
        # end-of-word respectively.
        self.assertEqual(regex.split(r"(?V1)\b", "a b c"), ['', 'a', ' ', 'b',
          ' ', 'c', ''])
        self.assertEqual(regex.split(r"(?V1)\m", "a b c"), ['', 'a ', 'b ',
          'c'])
        self.assertEqual(regex.split(r"(?V1)\M", "a b c"), ['a', ' b', ' c',
          ''])
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_qualified_re_split(self):
        # The third positional argument is maxsplit: splitting stops after
        # that many splits and the remainder is returned unsplit.
        self.assertEqual(regex.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
        self.assertEqual(regex.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
        # Captured separators still appear in the truncated output.
        self.assertEqual(regex.split("(:)", ":a:b::c", 2), ['', ':', 'a', ':',
          'b::c'])
        self.assertEqual(regex.split("(:*)", ":a:b::c", 2), ['', ':', 'a', ':',
          'b::c'])
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_bug_931848(self):
        # Regression test for Python issue 931848: split on a character
        # class of Unicode full stops (ASCII '.', ideographic '\u3002',
        # full-width '\uFF0E', halfwidth '\uFF61'). Python 2 u'' literal.
        pattern = u"[\u002E\u3002\uFF0E\uFF61]"
        self.assertEqual(regex.compile(pattern).split("a.b.c"), ['a', 'b',
          'c'])
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_word_class(self):
        # Python 2 ur'' literals; (?u) enables Unicode matching. The sample
        # text is a Devanagari word surrounded by a space and a comma.
        self.assertEqual(regex.findall(ur"(?u)\w+",
          u" \u0939\u093f\u0928\u094d\u0926\u0940,"),
          [u'\u0939\u093f\u0928\u094d\u0926\u0940'])
        self.assertEqual(regex.findall(ur"(?u)\W+",
          u" \u0939\u093f\u0928\u094d\u0926\u0940,"), [u' ', u','])
        # (?V1) permits splitting on the zero-width \b / \B assertions.
        self.assertEqual(regex.split(ur"(?uV1)\b",
          u" \u0939\u093f\u0928\u094d\u0926\u0940,"), [u' ',
          u'\u0939\u093f\u0928\u094d\u0926\u0940', u','])
        self.assertEqual(regex.split(ur"(?uV1)\B",
          u" \u0939\u093f\u0928\u094d\u0926\u0940,"), [u'', u' \u0939',
          u'\u093f', u'\u0928', u'\u094d', u'\u0926', u'\u0940,', u''])
项目:oa_qian    作者:sunqb    | 项目源码 | 文件源码
def test_zerowidth(self):
        # Issue 3262.
        # Under default (V0) behaviour a zero-width match does not split;
        # under (?V1) it does, mirroring how other regex engines behave.
        self.assertEqual(regex.split(r"\b", "a b"), ['a b'])
        self.assertEqual(regex.split(r"(?V1)\b", "a b"), ['', 'a', ' ', 'b',
          ''])

        # Issue 1647489.
        # findall/finditer must advance past zero-width matches instead of
        # looping; (?r) iterates right-to-left, reversing the order.
        self.assertEqual(regex.findall(r"^|\w+", "foo bar"), ['', 'foo',
          'bar'])
        self.assertEqual([m[0] for m in regex.finditer(r"^|\w+", "foo bar")],
          ['', 'foo', 'bar'])
        self.assertEqual(regex.findall(r"(?r)^|\w+", "foo bar"), ['bar', 'foo',
          ''])
        self.assertEqual([m[0] for m in regex.finditer(r"(?r)^|\w+",
          "foo bar")], ['bar', 'foo', ''])
        self.assertEqual(regex.findall(r"(?V1)^|\w+", "foo bar"), ['', 'foo',
          'bar'])
        self.assertEqual([m[0] for m in regex.finditer(r"(?V1)^|\w+",
          "foo bar")], ['', 'foo', 'bar'])
        self.assertEqual(regex.findall(r"(?rV1)^|\w+", "foo bar"), ['bar',
          'foo', ''])
        self.assertEqual([m[0] for m in regex.finditer(r"(?rV1)^|\w+",
          "foo bar")], ['bar', 'foo', ''])

        # An empty pattern: V0 never splits, V1 splits between every
        # character (and at both ends).
        self.assertEqual(regex.split("", "xaxbxc"), ['xaxbxc'])
        self.assertEqual([m for m in regex.splititer("", "xaxbxc")],
          ['xaxbxc'])

        self.assertEqual(regex.split("(?r)", "xaxbxc"), ['xaxbxc'])
        self.assertEqual([m for m in regex.splititer("(?r)", "xaxbxc")],
          ['xaxbxc'])

        self.assertEqual(regex.split("(?V1)", "xaxbxc"), ['', 'x', 'a', 'x',
          'b', 'x', 'c', ''])
        self.assertEqual([m for m in regex.splititer("(?V1)", "xaxbxc")], ['',
          'x', 'a', 'x', 'b', 'x', 'c', ''])

        self.assertEqual(regex.split("(?rV1)", "xaxbxc"), ['', 'c', 'x', 'b',
          'x', 'a', 'x', ''])
        self.assertEqual([m for m in regex.splititer("(?rV1)", "xaxbxc")], ['',
          'c', 'x', 'b', 'x', 'a', 'x', ''])
项目:backrefs    作者:facelessuser    | 项目源码 | 文件源码
def split(pattern, string, maxsplit=0, flags=0, concurrent=None, **kwargs):
        """Wrapper for `split`."""

        # Expand the backrefs extensions in the pattern first, then delegate
        # everything else to regex.split unchanged.
        preprocessed = _apply_search_backrefs(pattern, flags)
        return regex.split(
            preprocessed, string,
            maxsplit=maxsplit, flags=flags, concurrent=concurrent, **kwargs
        )
项目:concernCapture    作者:ctfu    | 项目源码 | 文件源码
def splitWord(str):
    # NOTE(review): `flags=re.V1` only exists if `re` is actually the
    # third-party `regex` module (e.g. `import regex as re`); stdlib re has
    # no V1 attribute -- confirm against this module's imports.
    # Keep letters only, then split at camelCase/PascalCase boundaries.
    letters_only = re.sub("[^A-Za-z]", "", str)
    camel_boundary = r'(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|(?<=[a-z]A)(?=[A-Z])'
    return re.split(camel_boundary, letters_only, flags=re.V1)
项目:Ossian    作者:CSTR-Edinburgh    | 项目源码 | 文件源码
def splitting_function(self, instring):
        """Split `instring` with this splitter's compiled regex, drop empty
        tokens, and optionally bracket the result with terminal markers.
        """
        pieces = self.regex.split(instring)
        nonempty = [piece for piece in pieces if piece != '']
        if self.add_terminal_tokens:
            # Bracket the token sequence with the terminal symbol.
            return [c.TERMINAL] + nonempty + [c.TERMINAL]
        return nonempty
项目:Ossian    作者:CSTR-Edinburgh    | 项目源码 | 文件源码
def splitting_function(self, instring):
        # Split the input with this splitter's compiled regex; drop empty
        # tokens (captured separators, if any, are kept by re/regex split).
        tokens = self.regex.split(instring)
        tokens = [t for t in tokens if t != '']
        if self.add_terminal_tokens:
            # Bracket the token sequence with the terminal symbol.
            tokens = [c.TERMINAL] + tokens + [c.TERMINAL]
        return tokens
项目:ShadowSocksShare-OpenShift    作者:the0demiurge    | 项目源码 | 文件源码
def request_5752me(url='https://wget.5752.me/Computer/soft/socks5%E4%BB%A3%E7%90%86/%E5%85%8D%E8%B4%B9ss%E8%B4%A6%E5%8F%B7.html'):
    """Scrape free Shadowsocks accounts from the 5752.me page.

    Returns (servers, info): `servers` is a list of dicts with keys
    remarks/server/password/server_port/method; `info` describes the
    source. On any failure, ([], info-with-error-message) is returned.

    NOTE(review): the runs of '?' in several string literals below look
    like mojibake (text mangled somewhere between GBK and this copy).
    They are kept byte-for-byte because the "'IP??' in ..." checks depend
    on those exact bytes -- confirm against the upstream repository.
    """
    print('req 5752...')
    servers = list()
    try:
        data = requests.get(url)
        # The page is GBK-encoded; try an explicit gb2312 decode first and
        # fall back to requests' guessed text decoding.
        if 'IP??' in data.content.decode('gb2312'):
            data = data.content.decode('gb2312')
        elif 'IP??' in data.text:
            data = data.text
        else:
            raise Exception('???5752???' + url)
        info = {'message': '', 'name': '????', 'url': 'https://www.5752.me/'}
        data = data.split('<br/>')

        # Keep only the lines that carry server details.
        avail_data = list(filter(lambda x: 'IP??' in x, data))
        if len(avail_data) == 0:
            raise Exception('5752???????????' + '\n'.join(data))

        for i, server in enumerate(avail_data):
            servers.append(dict())
            servers[-1]['remarks'] = '???? {}'.format(i)
            # Fields alternate label/value, so every second token (from
            # index 1) is a value: ip, password, port, method.
            (
                servers[-1]['server'],
                servers[-1]['password'],
                servers[-1]['server_port'],
                servers[-1]['method']) = server.split()[1::2]

    except Exception as e:
        logging.exception(e, stack_info=True)
        return [], {'message': str(e), 'url': '', 'name': ''}
    return servers, info
项目:ShadowSocksShare-OpenShift    作者:the0demiurge    | 项目源码 | 文件源码
def request_nobey(url='https://raw.githubusercontent.com/NoBey/Shadowsocks-free/master/README.md'):
    """Scrape free Shadowsocks accounts from the NoBey README.

    Returns (servers, info): `servers` is a list of dicts with keys
    remarks/server/password/server_port/method; `info` describes the
    source. On any failure, ([], info-with-error-message) is returned.
    """
    # Fix: the original defined a local helper `strip_dot(x)` whose body
    # was a bare `return` (always None) and which was never called -- dead,
    # misleading code; removed.
    print('req nobey...')
    servers = list()
    try:
        # The README separates sections with '##'/'---' rules; slices 2 and
        # 4 of the split hold the server tables.
        data = re.split('##+|---+', requests.get(url).text)[2:5:2]
        info = {'message': '', 'name': 'NoBey', 'url': 'https://github.com/NoBey/Shadowsocks-free'}

        for i, server in enumerate(data):
            server = server.split('\n')

            name = server[0].strip()
            # Lines 1-5 hold ip / port / (header, skipped) / method /
            # password rows; strip markdown bullets and backticks from each
            # cell, keeping everything after the row label.
            (
                ips,
                ports,
                _,
                method,
                password) = list(map(
                    lambda row: list(map(
                        lambda x: x.strip().strip('`').strip(),
                        row.strip('-').strip().split()[1:])),
                    server[1:6]))
            method = method[0]
            password = password[0]

            # Emit one entry per (ip, port) combination.
            for j, ip in enumerate(ips):
                for k, port in enumerate(ports):
                    servers.append(dict())
                    servers[-1]['remarks'] = 'NoBey {}-{}-{}'.format(name, j, k)
                    (
                        servers[-1]['server'],
                        servers[-1]['password'],
                        servers[-1]['server_port'],
                        servers[-1]['method']) = (ip, password, port, method)

    except Exception as e:
        logging.exception(e, stack_info=True)
        return [], {'message': str(e), 'url': '', 'name': ''}
    return servers, info
项目:ShadowSocksShare-OpenShift    作者:the0demiurge    | 项目源码 | 文件源码
def request_xiaoshuang(url='https://xsjs.yhyhd.org/free-ss'):
    """Scrape free SS(R) accounts from the xiaoshuang page.

    Returns (servers, info): `servers` is a list of dicts with keys
    remarks/server/server_port/password/method/ssr_protocol/obfs; `info`
    carries the page banner. On failure, ([], info-with-error) is returned.

    NOTE(review): the '??' runs in string literals below look like
    mojibake; kept byte-for-byte.
    """
    print('req xcud...')
    try:
        data = requests.get(url)
        soup = BeautifulSoup(data.text, 'html.parser')
        # Server blocks live in the '#ss-body' div, separated by blank lines.
        data = soup.find('div', attrs={'id': 'ss-body'})
        data = data.text.strip().split('\n\n\n')
        info = {'message': data[0].split('\n')[0], 'name': '????', 'url': url}
        # The first block starts with the banner line; drop it.
        data[0] = data[0].split('\n', maxsplit=1)[-1]
        servers = list()
        for server in data:
            # Per block: line 0 = name, line 1 = 'label ip label port',
            # line 2 = 'label method label password', line 3 carries
            # 'label protocol:x obfs:y' pairs split on ':'.
            server_data = server.strip().split('\n')
            servers.append(dict())
            servers[-1]['remarks'] = '??{}'.format(server_data[0]).strip()
            servers[-1]['server'] = server_data[1].split()[1].strip()
            servers[-1]['server_port'] = server_data[1].split()[3].strip()
            servers[-1]['password'] = server_data[2].split()[3].strip()
            servers[-1]['method'] = server_data[2].split()[1].strip()
            servers[-1]['ssr_protocol'] = server_data[3].split()[1].split(':')[1].strip()
            servers[-1]['obfs'] = server_data[3].split()[2].split(':')[1].strip()
    except Exception as e:
        logging.exception(e, stack_info=True)
        return [], {'message': str(e), 'url': '', 'name': ''}
    return servers, info

# NOTE: the scraper below cannot be used for now.
项目:ShadowSocksShare-OpenShift    作者:the0demiurge    | 项目源码 | 文件源码
def request_iss(url='http://ss.ishadowx.com/'):
    """Scrape Shadowsocks(R) servers from ss.ishadowx.com.

    Returns (servers, info): `servers` is a list of dicts with keys
    server/server_port/password/method/remarks (plus ssr_protocol/obfs for
    SSR-capable entries); `info` carries the page banner. On failure, an
    empty list and an info dict holding the error message are returned.
    """
    print('req iss...')

    try:
        data = requests.get(url)
        soup = BeautifulSoup(data.text, 'html.parser')
    except Exception as e:
        logging.exception(e, stack_info=True)
        return [], {'message': str(e), 'url': '', 'name': ''}

    try:
        info = {
            'message': soup.find('div', attrs={'id': 'portfolio'}).find('div', attrs={'class': 'section-title text-center center'}).text,
            'name': 'ishadowx',
            'url': url}

        # Each server card is a 'hover-text' div. (The original re-parsed
        # data.text into a second identical soup here and kept a dead
        # triple-quoted field-template string -- both removed.)
        server_cards = soup.find_all('div', attrs={'class': 'hover-text'})
        servers = list()
    except Exception as e:
        logging.exception(e, stack_info=True)
        return [], {'message': str(e), 'url': '', 'name': ''}

    for i, card in enumerate(server_cards):
        try:
            servers.append(dict())
            # Card lines: 0 = 'label:ip', 1 = port, 2 = 'label:password',
            # 3 = 'label:method', 4 = 'protocol obfs' or a QR placeholder.
            lines = card.text.strip().split('\n')
            servers[-1]['server'] = lines[0].split(':')[-1].strip()
            # Raw string fixes the invalid '\d' escape of the original.
            servers[-1]['server_port'] = re.findall(r'\d+', lines[1])[0]
            servers[-1]['remarks'] = ' '.join(['ss.ishadowx.com', str(i)])
            servers[-1]['password'] = lines[2].split(':')[-1].strip()
            servers[-1]['method'] = lines[3].split(':')[-1].strip()
            if 'QR' not in lines[4]:
                servers[-1]['ssr_protocol'], servers[-1]['obfs'] = lines[4].strip().split(maxsplit=1)
                servers[-1]['remarks'] = ' '.join([servers[-1]['remarks'], 'SSR'])
        except Exception as e:
            logging.exception(e, stack_info=True)
    return servers, info