Python jieba module: tokenize() example source code

We extracted the following 15 code examples from open-source Python projects to illustrate how to use jieba.tokenize().

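For reference, jieba.tokenize(text, mode='default', HMM=True) returns a generator of (word, start, end) tuples, where start and end are character offsets into the input text; mode='search' additionally yields finer-grained sub-words for search-engine indexing. A minimal usage sketch (the sample sentence below is only an illustrative placeholder):

import jieba

sentence = u'永和服装饰品有限公司'  # placeholder input, purely for illustration
# default mode: non-overlapping tokens with their character offsets
for word, start, end in jieba.tokenize(sentence):
    print('%s\t\t start: %d \t\t end: %d' % (word, start, end))
# search mode: also yields shorter sub-words, useful for indexing
for word, start, end in jieba.tokenize(sentence, mode='search'):
    print('%s\t\t start: %d \t\t end: %d' % (word, start, end))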
Project: nlp-chinese_text_classification    Author: iamiamn
def getChList(docStrByte):
    ## Decode a GBK byte string, tokenize it with jieba, stem English tokens and filter out stop words

    inputStr = str(docStrByte, encoding = 'gbk', errors = 'ignore').lower()  # decode the bytes and lower-case the text
    strList = ''.join(inputStr.split('\n'))  # join all lines into one string
    rawTokens = list(jieba.tokenize(strList))  # tokenize; each item is a (word, start, end) tuple

    # stopWord: a dict whose keys are the stop words and whose values are all None
    fSW = open('stopwords.txt', 'r', encoding = 'utf-8', errors = 'ignore').read()
    stopWord = {}.fromkeys(fSW.split('\n'))
    stopWord[''] = None

    final = []
    s = nltk.stem.SnowballStemmer('english')
    for seg in rawTokens:
        # print(seg[0].strip())
        rawWord = seg[0].strip()  # strip surrounding whitespace from the token text
        if (rawWord.isalpha()):  # purely alphabetic (English) tokens are stemmed
            word = s.stem(rawWord)
        else:
            word = rawWord

        if word not in stopWord:  # drop stop words
            final.append(word)  # collect the remaining tokens
    return final
Project: PTTChatBot_DL2017    Author: thisray
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token
Project: ChineseSA    Author: cwlseu
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token
Project: Malicious_Domain_Whois    Author: h-j-13
def testTokenize(self):
        for content in test_contents:
            result = jieba.tokenize(content)
            assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Tokenize error on content: %s" % content
            for tk in result:
                print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
        print("testTokenize", file=sys.stderr)
Project: Malicious_Domain_Whois    Author: h-j-13
def testTokenize_NOHMM(self):
        for content in test_contents:
            result = jieba.tokenize(content,HMM=False)
            assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Tokenize error on content: %s" % content
            for tk in result:
                print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
        print("testTokenize_NOHMM", file=sys.stderr)
Project: Malicious_Domain_Whois    Author: h-j-13
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
Project: Malicious_Domain_Whois    Author: h-j-13
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
Project: Malicious_Domain_Whois    Author: h-j-13
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token
Project: jieba-GAE    Author: liantian-cn
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token
Project: my_bit_v1    Author: iSawyer
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token
Project: Rasa_NLU_Chi    Author: crownpku
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUConfig, **Any) -> None
        if config['language'] != 'zh':
            raise Exception("tokenizer_jieba is only used for Chinese. Check your configure json file.")

        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text))
Project: Rasa_NLU_Chi    Author: crownpku
def process(self, message, **kwargs):
        # type: (Message, **Any) -> None

        message.set("tokens", self.tokenize(message.text))
Project: Rasa_NLU_Chi    Author: crownpku
def tokenize(self, text):
        # type: (Text) -> List[Token]
        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]

        return tokens
Project: http_server    Author: chenguolin
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token
Project: http_server    Author: chenguolin
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token