Python jieba.posseg module: cut() example source code

The following code examples, extracted from open-source Python projects, illustrate how to use jieba.posseg.cut().
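As a quick reference, pseg.cut() yields pair objects that expose a .word and a .flag (part-of-speech tag) attribute; a minimal sketch, using the sample sentence from jieba's documentation (output depends on jieba's bundled dictionary):

# -*- coding: utf-8 -*-
# Minimal sketch of jieba.posseg.cut().
import jieba.posseg as pseg

for pair in pseg.cut(u"我爱北京天安门"):
    print(pair.word + '/' + pair.flag)   # e.g. 我/r, 爱/v, 北京/ns, 天安门/ns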

Project: feng-python-apply    Author: JiangFeng07
def parse(self, in_file, out_file):
        output_file = open(out_file, 'w')
        with open(in_file, 'r') as file:
            file.readline()  # read and discard the first line
            i = 0
            for line in file.readlines():
                sentence = ""
                line = line.strip().split('\t')
                for word, flag in pseg.cut(line[1].strip()):
                    if flag == 'x':
                        continue
                    else:
                        sentence = sentence + word + " "
                output_file.write(sentence.strip() + "\n")
                i += 1
                if i % 100 == 0:
                    print('Handled %d lines' % i)
        output_file.close()
Project: internet-content-detection    Author: liubo0621
def cut_for_property(self, text):
        '''
        @summary: cut text into (word, POS) pairs, dropping stop words
        ---------
        @param text: input text
        ---------
        @result: a list like [(text1, property1) ... (textN, propertyN)]
        '''
        words_list = []

        words = pseg.cut(text)
        for word in words:
            if word.word not in self._stop_words:
                words_list.append((word.word, word.flag))

        return words_list
Project: ugc.aggregator    Author: Dreamcatcher-GIS
def extract_keyword(self):
        sents = []
        comm_list = self.dao.get_hotel_comments()
        # split every hotel comment into sentences
        for comm in comm_list:
            sents.extend(normal.get_sentences(comm[2]))
        print "length of sentences:%d"%len(sents)
        # POS-tag each sentence
        pos_sents = []
        for sent in sents:
            pos_sents.append(pseg.cut(sent))
        print "length of pos_sents:%d"%len(pos_sents)
        # count nouns and sort by frequency
        print "counting"
        noun_dict = {}
        for pos_sent in pos_sents:
            for key,type in pos_sent:
                if type == "n":
                    if key not in noun_dict:
                        noun_dict[key] = 1
                    else:
                        noun_dict[key] = noun_dict[key] + 1
        a = sorted(noun_dict.iteritems(),key=lambda asd:asd[1],reverse=True)
        return a
Project: Graduation-design    Author: Baichenjia
def handel_weibo(filename):
    fp = open("f://emotion/mysite/Label_extract/weibo_corpus/" + filename, 'r')
    contents = []
    for line in fp.readlines():    # process each line
        line = line.strip()
        line = line.decode('utf-8')
        seg_lines = pseg.cut(line)  # segment with POS tags
        for seg_line in seg_lines:   # keep only noun-like words
            if seg_line.flag == 'n' or seg_line.flag == 'nr' or seg_line.flag == 'ns' or seg_line.flag == 'nt' or seg_line.flag == 'nz':
                contents.append(seg_line.word)  # collect the nouns
    #print "length:", len(contents)
    fp.close()
    # write the extracted nouns to a new file
    fp_handel = open('f://emotion/mysite/Label_extract/weibo_corpus_handel/handel_' + filename, 'w+')
    for content in contents:
        fp_handel.write(content)
        fp_handel.write('\n')
    fp_handel.close()


# 2.???????????30????????????????
Project: Graduation-design    Author: Baichenjia
def read_test_list():
    fp = open("f://emotion/mysite/weibo_crawler/chinese_weibo.txt", 'r')
    contents = []
    for line in fp.readlines():    # process each line
        line = line.strip()
        line = line.decode('utf-8')
        seg_lines = pseg.cut(line)  # segment with POS tags
        for seg_line in seg_lines:   # keep only noun-like words
            if seg_line.flag == 'n' or seg_line.flag == 'nr' or seg_line.flag == 'ns' or seg_line.flag == 'nt' or seg_line.flag == 'nz':
                contents.append(seg_line.word)  # collect the nouns
    fp.close()
    #for w in contents:
    #   print w

    # join the extracted words into a single space-separated string
    str_test = ' '.join(contents)
    return str_test


# 5. Compute TF-IDF over the processed chinese_weibo.txt text and keep the top 100 terms
Project: tnpy    Author: ferventdesert
def MatchItem(self, input, start, end, muststart, mode=None):
        self.LogIn(input, start, end)
        pos = start
        if end is None:
            end = len(input)
        seg_list = pseg.cut(input[start:end] if self.Len == -1 else input[start:start + self.Len])
        sword = None
        for word, flag in seg_list:
            if self.Pos is None:
                sword = word
                break
            else:
                if flag in self.Pos:
                    sword = word
                    break
            pos += len(word)
        if sword is None or pos < 0 or (muststart == True and pos != start):
            self.LogOut(None)
            return start + self.Len if self.Len < 0 else tnpy.int_max
        self.LogOut(sword)
        m = tnpy.MatchResult(self, sword, pos)
        m.rstr = sword
        return m
Project: Chinese_text_classifier    Author: swordLong
def cut_Text(content, nomial=False):
    """
    :param content: string
    :param nomial: if nomial is True, only noun-like words will remain
    :return: a string of words joined by spaces, e.g. 'a b c d'
    """
    if nomial:
        text = ''
        words = pseg.cut(content)
        for word in words:
            if contain(['n'], word.flag):
                text = text + ' ' + word.word
        return text.strip()
    else:
        text = ''
        words = jieba.cut(content)
        for word in words:
            text = text + ' ' + word
        return text.strip()
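A hypothetical call to cut_Text (assuming contain() checks whether a word's POS flag starts with one of the given prefixes); the exact output depends on jieba's dictionary:

# -*- coding: utf-8 -*-
# Hypothetical usage of cut_Text; only noun-like words should survive.
nouns_only = cut_Text(u"今天北京的天气很好", nomial=True)
print(nouns_only)   # e.g. 北京 天气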
Project: Chinese_text_classifier    Author: swordLong
def cut_Dataset(data_set, parrel=False, nomial=False):
    """
    :param data_set: bunch of Dataset
    :param parrel: if True, cut the dataset in parallel (not available on Windows)
    :param nomial: if nomial is True, only noun-like words will remain
    :return: data_set after cutting
    """
    from tqdm import tqdm
    data_cut = []
    start = time.time()
    print('cutting dataset......')
    if parrel:
        p = ThreadPool(9)
        data_cut = p.map(cut_Text, data_set.data)
        p.close()
        p.join()
    else:
        for doc_content in tqdm(data_set.data):
            data_cut.append(cut_Text(doc_content, nomial))
    end = time.time()
    print('cutting runs %0.2f seconds.' % (end - start))
    data_set.data = data_cut
Project: TextClassification    Author: mosu027
def splitWord(self, content):

        segs = pseg.cut(str(content))
        result = []
        for word, type in segs:
            WORD = Word()
            if self.wordtypeDict.has_key(word):
                WORD.setword(word)
                WORD.settype(self.wordtypeDict[word])
                WORD.setfreq(self.wordfreqDict[word])
            else:
                WORD.setword(word)
                WORD.settype(type)
            # print "word ", word
            result.append(WORD)
        return result
Project: nlp    Author: aaronz
def get_word_list(self, text, lower=True, strip_stop_words=True, use_tag_filter=False):
        text = util.as_text(text)
        jieba_result = pseg.cut(text)

        if use_tag_filter:
            jieba_result = [
                w for w in jieba_result if w.flag in self.default_tag_filter]
        else:
            jieba_result = [w for w in jieba_result]

        word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
        word_list = [word for word in word_list if len(word) > 0]

        if lower:
            word_list = [word.lower() for word in word_list]

        if strip_stop_words:
            word_list = [word.strip()
                         for word in word_list if word.strip() not in self.stop_words]

        return word_list
Project: easy-expression    Author: earlybackhome
def load(self):
        from gensim.models import Word2Vec
        # store for the candidate sentences loaded from the text file
        self.link_database = []
        # load the word2vec model
        self.vecmodel = Word2Vec.load(self.model_file)
        log.info('???????')
        log.info('???????')
        with open(self.txt_file) as fp:
            senten_list = fp.readlines()
            log.debug("senten%s", senten_list)
            for senten_txt in senten_list:
                self.link_database.append(Senten2vec(senten_txt))
        log.info('???????????')
        for link in self.link_database:
            link.sentence_word = (set(jieba.cut(link.sentence)))

        for link in self.link_database:
            link.sentence_vec = {word for word in link.sentence_word if word in self.vecmodel.wv.index2word}

        log.info('???????')
    # ????????????n???
Project: easy-expression    Author: earlybackhome
def juziSim_vec(self, intxt, questionWordset, posWeight=None):  # juziIn??????juziLi???????
        if posWeight == None:
            log.warning('there is no posWeight')
            return 0
        intxtSet = set(list(pseg.cut(intxt)))
        if not len(intxtSet):
            return 0
        simWeight = 0
        totalWeight = 0
        for word, pos in intxtSet:
            if word in self.vecmodel.wv.index2word:
                wordPosWeight = posWeight.get(pos, 1)
                totalWeight += wordPosWeight

                wordMaxWeight = 0
                for t in questionWordset:
                    # print(word, t)
                    tmp = self.vecmodel.wv.similarity(word, t)
                    if wordMaxWeight < tmp:
                        wordMaxWeight = tmp
                simWeight += wordPosWeight * wordMaxWeight
        if totalWeight == 0:
            return 0
        return simWeight/totalWeight
Project: QA    Author: KiddoZhu
def __call__(self, question) :
        # print(question.questionSentence)
        qSentence = question.questionSentence
        # question.wordsToken = list(jieba.cut(qSentence))
        question.wordsToken, question.posToken = getPosToken(qSentence)
        assert len(question.wordsToken) == len(question.posToken)
        # print 'Length words Token = %d'%(len(question.wordsToken))
        # print 'Length pos token = %d'%(len(question.posToken))
        question.keyWordToken = list(jieba.analyse.extract_tags(qSentence, topK=5))
        # print ' '.join(question.keyWordToken)
        # dependency = parser.parse(words).next()
        # print '/'.join(question.wordsToken)
        # for word, flag in question.posToken:
        #   print('%s %s'%(word, flag))
        question.questionType, question.answerType = getQuestionType(question.questionSentence)
        question.getAnswerTemp()
        # my_print(question.answerTemp)
        # print question.answerRe
Project: SmartQA    Author: jianke03
def ansFind(wikiList, typeInfo, Ques,obj):
    wordList = convert.solve(Ques)
    keyList =  convert.getKeyWords(wordList)
    # drop words whose POS tag starts with "u", "x" or "p"
    wordList = [w for w in wordList if not (w[1].startswith("u") or w[1].startswith("x") or w[1].startswith("p"))]

    sourceList = []
    for i in range(len(wikiList)):
        words = pseg.cut(wikiList[i])
        relevantList = []
        for w in words:
            wordsGroup = [w.word,w.flag]
            relevantList.append(wordsGroup)
        sourceList.append(relevantList)

    typeStr = ansExtract.getTypeStr(typeInfo)
    ansList = ansExtract.check(sourceList, wordList, typeStr, typeInfo,obj)
    return ansDecide.chooseAns(ansList, typeStr,typeInfo,obj)
Project: cnn-svm-chinese-text-classification    Author: zpppy
def jiebafenci(all_the_text):
    re = ""
    relist = ""
    words = pseg.cut(all_the_text)
    count = 0
    for w in words:
        flag = w.flag  # POS tag
        tmp = w.word   # the word itself
        #print "org: "+tmp
        # \u4e00-\u9fa5 is the Unicode range of common Chinese characters:
        # keep multi-character Chinese words whose POS is not excluded by flag_list
        if len(tmp)>1 and len(flag)>0 and flag[0] not in flag_list and tmp[0]>=u'\u4e00' and tmp[0]<=u'\u9fa5':
            re = re + " " + w.word
    re = re.replace("\n"," ").replace("\r"," ")   
    if  len(re)>40:
        relist = re
    relist = relist + "\n"
    return relist
Project: cnn-svm-chinese-text-classification    Author: zpppy
def getTrainData(inpath,outfile):
    i=0
    for filename in os.listdir(inpath):

        fw = open(outfile+str(i)+".cut","w")  # one output file per input document
        i=i+1
        file_object = open(inpath+"\\"+filename,'r', encoding='UTF-8')
        try:
            all_the_text = file_object.read()

            #all_the_text = all_the_text.decode("gb2312").encode("utf-8")
            pre_text = jiebafenci(all_the_text)
            pre_text.encode('UTF-8')

            if len(pre_text)>30:
                fw.write(pre_text)
        except:
            print('@'*20)
            pass
        finally:
            file_object.close()
            fw.close()
#['C000008', 'C000010', 'C000013', 'C000014', 'C000016', 'C000020', 'C000022','C000023', 'C000024']
Project: CNKICrawler    Author: roliygu
def jieba_example():
    raw = "????S5????,123,?,?"
    raw_seq = jieba.cut(raw)
    raw_seq_list = jieba.lcut(raw)
    raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
    raw_with_ictclas = pseg.cut(raw)
    for word, flag in raw_with_ictclas:
        print word, flag
Project: entity-linker    Author: seucs
def jaccard_similarity_score(context1, context2, flag1, flag2):
    #print 'context1', context1
    try:
        if flag1 and len(context1)!=0:
            temp = context1[-1]
            context1.pop()
            context1 += list(pseg.cut(temp))
        if flag2 and len(context2)!=0:
            temp = context2[-1]
            context2.pop()
            context2 += list(pseg.cut(temp))
    except:
        pass

    mySet = set(context1 + context2)
    a1 = []
    a2 = []
    for item in mySet:
        if item in context1:
            a1.append(1)
        else:
            a1.append(0)
        if item in context2:
            a2.append(1)
        else:
            a2.append(0)
    #print sklearn.metrics.jaccard_similarity_score(a1,a2)
    return sklearn.metrics.jaccard_similarity_score(a1,a2)

# contextSim between element[i] and element[j]
Project: sentiment-analysis    Author: kasheemlew
def parse():
    """parse the comments"""
    import jieba
    import jieba.posseg as pseg

    # Load User's Dictionary
    path_list = os.getcwd().split('/')
    path_list.append("dict.txt")
    dict_path = '/'.join(path_list)
    jieba.load_userdict(dict_path)

    # Dismiss These Flags
    dismiss = ['b', 'c', 'r', 'uj', 'u', 'p', 'q', 'uz', 't', 'ul', 'k', 'f',
            'ud', 'ug', 'uv']

    comments = Comment.query.all()
    for comment in comments:
        word_list = []
        pseg_cut = pseg.cut(comment.body)
        for word, flag in pseg_cut:
            if flag not in dismiss:
                word_list.append(word)
        comment.parsed = '/'.join(word_list)
        db.session.add(comment)
        print "Comment %04d Parsed!" % comment.id

    db.session.commit()
    print "ALL DONE!"
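jieba.load_userdict, used above, expects a plain-text dictionary with one entry per line in the form "word [frequency] [POS tag]", where frequency and tag are optional. A minimal sketch of creating and loading such a file (the entries below are made-up examples):

# -*- coding: utf-8 -*-
# Minimal sketch of a jieba user dictionary.
import io
import jieba

with io.open("dict.txt", "w", encoding="utf-8") as f:
    f.write(u"云计算 5 n\n")
    f.write(u"自然语言处理 3 n\n")

jieba.load_userdict("dict.txt")   # words listed in dict.txt will no longer be split apart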
Project: chat    Author: Decalogue
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.
    If a word in this sentence was not found in the synonym dictionary,
    it will be marked with the default value of the word segmentation tool.

    Args:
        pattern: 'w' - words only, 'k' - top-1 keyword, 't' - top-10 keywords,
                 'wf' - (word, flag) pairs, 'tf' - (word, flag) pairs for the top-10 keywords only.
    """
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        result = list(jieba.cut(sentence))
        synonym_vector = [item for item in result if item not in punctuation_all]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # synonym_vector = [(item.word, item.flag) for item in result \
        # if item.word not in punctuation_all]
        # Modify in 2017.4.27 
        for item in result:
            if item.word not in punctuation_all:
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector
Project: JustCopy    Author: exe1023
def segment(self, text, lower = True, use_stop_words = True, use_speech_tags_filter = False):
        """Segment text into a word list, with optional filtering.

        Keyword arguments:
        lower                  -- whether to convert words to lower case
        use_stop_words         -- if True, words found in self.stop_words are removed
        use_speech_tags_filter -- if True, keep only words whose POS tag is in self.default_speech_tag_filter
        """
        text = util.as_text(text)
        jieba_result = pseg.cut(text)

        if use_speech_tags_filter == True:
            jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter]
        else:
            jieba_result = [w for w in jieba_result]

        # drop non-word tokens (POS flag 'x') and empty strings
        word_list = [w.word.strip() for w in jieba_result if w.flag!='x']
        word_list = [word for word in word_list if len(word)>0]

        if lower:
            word_list = [word.lower() for word in word_list]

        if use_stop_words:
            word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]

        return word_list
Project: PTTChatBot_DL2017    Author: thisray
def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
        """
        Extract keywords from sentence using TextRank algorithm.
        Parameter:
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                        if the POS of w is not in this list, it will be filtered.
            - withFlag: if True, return a list of pair(word, weight) like posseg.cut
                        if False, return a list of words
        """
        self.pos_filt = frozenset(allowPOS)
        g = UndirectWeightedGraph()
        cm = defaultdict(int)
        words = tuple(self.tokenizer.cut(sentence))
        for i, wp in enumerate(words):
            if self.pairfilter(wp):
                for j in xrange(i + 1, i + self.span):
                    if j >= len(words):
                        break
                    if not self.pairfilter(words[j]):
                        continue
                    if allowPOS and withFlag:
                        cm[(wp, words[j])] += 1
                    else:
                        cm[(wp.word, words[j].word)] += 1

        for terms, w in cm.items():
            g.addEdge(terms[0], terms[1], w)
        nodes_rank = g.rank()
        if withWeight:
            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)

        if topK:
            return tags[:topK]
        else:
            return tags
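This method appears to be taken from jieba's own keyword-extraction module; in application code the same TextRank extractor is reachable as jieba.analyse.textrank. A minimal usage sketch (the sample sentence and the printed weights are illustrative only):

# -*- coding: utf-8 -*-
# Minimal sketch: TextRank keyword extraction via jieba.analyse.textrank.
import jieba.analyse

text = u"此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元，增资后，吉林欧亚置业注册资本由7000万元增加到5亿元。"
for word, weight in jieba.analyse.textrank(text, topK=5, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v')):
    print('%s %.4f' % (word, weight))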
Project: ChineseSA    Author: cwlseu
def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
        """
        Extract keywords from sentence using TextRank algorithm.
        Parameter:
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                        if the POS of w is not in this list, it will be filtered.
            - withFlag: if True, return a list of pair(word, weight) like posseg.cut
                        if False, return a list of words
        """
        self.pos_filt = frozenset(allowPOS)
        g = UndirectWeightedGraph()
        cm = defaultdict(int)
        words = tuple(self.tokenizer.cut(sentence))
        for i, wp in enumerate(words):
            if self.pairfilter(wp):
                for j in xrange(i + 1, i + self.span):
                    if j >= len(words):
                        break
                    if not self.pairfilter(words[j]):
                        continue
                    if allowPOS and withFlag:
                        cm[(wp, words[j])] += 1
                    else:
                        cm[(wp.word, words[j].word)] += 1

        for terms, w in cm.items():
            g.addEdge(terms[0], terms[1], w)
        nodes_rank = g.rank()
        if withWeight:
            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)

        if topK:
            return tags[:topK]
        else:
            return tags
Project: ParseLawDocuments    Author: FanhuaandLuomu
def cut(filename1,filename2): # segment each line of filename1 (seg and pos) and write the words to filename2
    f=open(filename2,'w')
    for line in open(filename1):
        res=pseg.cut(line.strip())
        split_line=' '.join([w.word for w in res])+'\n'
        f.write(split_line.encode('utf-8'))
    # print '%s split successful' %(filename1)
Project: ParseLawDocuments    Author: FanhuaandLuomu
def main():
    source_path,target_path=sys.argv[1],sys.argv[2]
    source_files,target_files=getFileList(source_path,target_path)
    # print fileList
    for filename1,filename2 in zip(source_files,target_files):
        cut(filename1,filename2)
Project: ParseLawDocuments    Author: FanhuaandLuomu
def cut(contents):  # segment each line into space-separated words
    split_contents=[]
    for line in contents:
        res=pseg.cut(line.strip())
        split_line=' '.join([w.word for w in res])
        split_contents.append(split_line)
    return split_contents
Project: ParseLawDocuments    Author: FanhuaandLuomu
def main():
    source_file='law_text.txt'
    law_text_list=readFromFile(source_file)
    print len(law_text_list)

    split_contents=cut(law_text_list)
    # cPickle.dump(split_contents,open('split_law_text.pkl','wb'))
    print len(split_contents)

    # for item in law_text_list:
    #   print item

    print law_text_list[1].strip()
    print split_contents[1].strip()
Project: Malicious_Domain_Whois    Author: h-j-13
def testDefaultCut(self):
        for content in test_contents:
            result = jieba.cut(content)
            assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
            result = list(result)
            assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
            print(" , ".join(result), file=sys.stderr)
        print("testDefaultCut", file=sys.stderr)
Project: Malicious_Domain_Whois    Author: h-j-13
def testCutAll(self):
        for content in test_contents:
            result = jieba.cut(content, cut_all=True)
            assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
            result = list(result)
            assert isinstance(result, list), "Test CutAll error on content: %s" % content
            print(" , ".join(result), file=sys.stderr)
        print("testCutAll", file=sys.stderr)
Project: Malicious_Domain_Whois    Author: h-j-13
def testSetDictionary(self):
        jieba.set_dictionary("foobar.txt")
        for content in test_contents:
            result = jieba.cut(content)
            assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
            result = list(result)
            assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
            print(" , ".join(result), file=sys.stderr)
        print("testSetDictionary", file=sys.stderr)
Project: Malicious_Domain_Whois    Author: h-j-13
def testPosseg(self):
        import jieba.posseg as pseg
        for content in test_contents:
            result = pseg.cut(content)
            assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Posseg error on content: %s" % content
            print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
        print("testPosseg", file=sys.stderr)
Project: Malicious_Domain_Whois    Author: h-j-13
def testDefaultCut_NOHMM(self):
        for content in test_contents:
            result = jieba.cut(content,HMM=False)
            assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
            result = list(result)
            assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
            print(" , ".join(result), file=sys.stderr)
        print("testDefaultCut_NOHMM", file=sys.stderr)
Project: Malicious_Domain_Whois    Author: h-j-13
def cuttest(test_sent):
    result = pseg.cut(test_sent, HMM=False)
    for word, flag in result:
        print(word, "/", flag, ", ", end=' ')
    print("")
Project: Malicious_Domain_Whois    Author: h-j-13
def cuttest(test_sent):
    result = pseg.cut(test_sent)
    for word, flag in result:
        print(word, "/", flag, ", ", end=' ')
    print("")
Project: Malicious_Domain_Whois    Author: h-j-13
def cuttest(test_sent):
    result = pseg.cut(test_sent)
    for w in result:
        print(w.word, "/", w.flag, ", ", end=' ')  
    print("")
Project: Malicious_Domain_Whois    Author: h-j-13
def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
        """
        Extract keywords from sentence using TextRank algorithm.
        Parameter:
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                        if the POS of w is not in this list, it will be filtered.
            - withFlag: if True, return a list of pair(word, weight) like posseg.cut
                        if False, return a list of words
        """
        self.pos_filt = frozenset(allowPOS)
        g = UndirectWeightedGraph()
        cm = defaultdict(int)
        words = tuple(self.tokenizer.cut(sentence))
        for i, wp in enumerate(words):
            if self.pairfilter(wp):
                for j in xrange(i + 1, i + self.span):
                    if j >= len(words):
                        break
                    if not self.pairfilter(words[j]):
                        continue
                    if allowPOS and withFlag:
                        cm[(wp, words[j])] += 1
                    else:
                        cm[(wp.word, words[j].word)] += 1

        for terms, w in cm.items():
            g.addEdge(terms[0], terms[1], w)
        nodes_rank = g.rank()
        if withWeight:
            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)

        if topK:
            return tags[:topK]
        else:
            return tags
Project: chat_logs_analysis_for_qq    Author: q673230559
def get_hot_noun_counts(source_file):
    f = open(source_file, "r")
    data = f.read()
    re_pat = r'[\d-]{10}\s[\d:]{7,8}\s+[^\n]+\d{5,11}\)'  # matches chat-log headers like '2016-06-24 15:42:52  <nickname>(40**21)'
    # li=re.findall(re_pat,data)
    li_content = re.split(re_pat, data)
    s = ""
    for l in li_content:
        s = s + l
    seg_list = pseg.cut(s.strip())
    lists = []
    for w in seg_list:
        if (w.flag == "ns"):
            lists.append(w.word)
    # print("******?????**0?kp-****")
    # print("???????",len(lists))
    seg_list_norepeat = set(lists)
    # print("???????",len(seg_list_noRepeat))
    word_set = {}
    for seg in seg_list_norepeat:
        count = 0
        for ss in lists:
            if (ss == seg):
                count += 1
        word_set[seg] = count
    word_tuple_sort = sorted(word_set.items(), key=lambda e: e[1], reverse=True)
    return word_tuple_sort
Project: internet-content-detection    Author: liubo0621
def cut(self, text, cut_all = False):
        '''
        @summary: word segmentation
        ---------
        @param text: text to segment
        @param cut_all: True for full mode, False for accurate mode.
          Full mode scans out every word combination the dictionary can form, which is fast
          but cannot resolve ambiguity; accurate mode cuts the sentence into its most
          likely segmentation and is better suited to text analysis.
        ---------
        @result:
        '''
        result = list(jieba.cut(text, cut_all = cut_all))
        result = self.__del_stop_key(result)
        return result
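To make the full-mode/accurate-mode distinction in the docstring concrete, a minimal sketch comparing the two modes of jieba.cut (the sample sentence comes from jieba's documentation):

# -*- coding: utf-8 -*-
# Minimal sketch: full mode lists every word the dictionary can form, accurate mode picks one segmentation.
import jieba

sentence = u"我来到北京清华大学"
print("/ ".join(jieba.cut(sentence, cut_all=True)))    # full mode
print("/ ".join(jieba.cut(sentence, cut_all=False)))   # accurate mode (default)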
Project: SentimentPolarityAnalysis    Author: chaoming0625
def __is_clause_pattern3(self, the_clause, seg_result):
        for a_phrase in self.__phrase_dict:
            keys = a_phrase.keys()
            to_compile = a_phrase["key"].replace("……", "[\u4e00-\u9fa5]*")

            if "start" in keys:
                to_compile = to_compile.replace("*", "{" + a_phrase["start"] + "," + a_phrase["end"] + "}")
            if "head" in keys:
                to_compile = a_phrase["head"] + to_compile

            match = re.compile(to_compile).search(the_clause)
            if match is not None:
                can_continue = True
                pos = [flag for word, flag in posseg.cut(match.group())]
                if "between_tag" in keys:
                    if a_phrase["between_tag"] not in pos and len(pos) > 2:
                        can_continue = False

                if can_continue:
                    for i in range(len(seg_result)):
                        if seg_result[i].word in match.group():
                            try:
                                if seg_result[i + 1].word in match.group():
                                    return self.__emotional_word_analysis(
                                        a_phrase["key"] + ":" + match.group(), a_phrase["value"],
                                        [x for x, y in seg_result], i)
                            except IndexError:
                                return self.__emotional_word_analysis(
                                    a_phrase["key"] + ":" + match.group(), a_phrase["value"],
                                    [x for x, y in seg_result], i)
        return ""
Project: ugc.aggregator    Author: Dreamcatcher-GIS
def extract_keyword_by_thulac(self):
        sents = []
        comm_list = self.dao.get_hotel_comments()
        # split every hotel comment into sentences
        for comm in comm_list:
            sents.extend(normal.get_sentences(comm[2]))
        print "length of sentences:%d"%len(sents)
        # POS-tag each sentence with thulac
        pos_sents = []
        for sent in sents:
            try:
                pos_sents.append(map(lambda x: x.split("_"), self.thu.cut(sent.encode("utf-8"))))
            except:
                print sent
                continue
        print "length of pos_sents:%d"%len(pos_sents)
        # count nouns and sort by frequency
        print "counting"
        noun_dict = {}
        for pos_sent in pos_sents:
            for word in pos_sent:
                if word[1] == "n":
                    if word[0] not in noun_dict:
                        noun_dict[word[0]] = 1
                    else:
                        noun_dict[word[0]] = noun_dict[word[0]] + 1
        a = sorted(noun_dict.iteritems(),key=lambda asd:asd[1],reverse=True)
        return a
Project: AIZooService    Author: zhanglbjames
def segment(self, text, lower = True, use_stop_words = True, use_speech_tags_filter = False):
        """Segment text into a word list, with optional filtering.

        Keyword arguments:
        lower                  -- whether to convert words to lower case
        use_stop_words         -- if True, words found in self.stop_words are removed
        use_speech_tags_filter -- if True, keep only words whose POS tag is in self.default_speech_tag_filter
        """
        text = util.as_text(text)
        jieba_result = pseg.cut(text)

        if use_speech_tags_filter == True:
            jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter]
        else:
            jieba_result = [w for w in jieba_result]

        # drop non-word tokens (POS flag 'x') and empty strings
        word_list = [w.word.strip() for w in jieba_result if w.flag!='x']
        word_list = [word for word in word_list if len(word)>0]

        if lower:
            word_list = [word.lower() for word in word_list]

        if use_stop_words:
            word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]

        return word_list
Project: KnowledgeGraph-QA-Service    Author: kangzhun
def seg(self, sentence):
        words = list()
        tags = list()
        for item in pseg.cut(sentence):
            words.append(item.word)
            tags.append(item.flag)
        return words, tags
Project: Graduation-design    Author: Baichenjia
def jieba_cut():
    # process the pos_all_dict file
    fp_pos = open("hownet/pos_all_dict.txt", "r")   # HowNet positive-word dictionary
    fp_pos_cut = codecs.open('hownet/pos_all_cut.txt', "w+", encoding='UTF-8')  # output file for the segmented words
    contents = fp_pos.readlines()
    for content in contents:
        word = content.decode("utf-8")  # decode the line to unicode
        word_tag = pseg.cut(word)
        str_tag = ""
        for tag in word_tag:
            str_tag += str(tag.word) + '/' + str(tag.flag)
        p = re.compile(r'/x(.*)')
        str_tag = p.sub(r'\1', str_tag)   # strip the '/x' (non-word) tag
        fp_pos_cut.write(str_tag)
    fp_pos.close()
    fp_pos_cut.close()

    # process the neg_all_dict file
    fp_neg = open("hownet/neg_all_dict.txt", "r")   # HowNet negative-word dictionary
    fp_neg_cut = codecs.open('hownet/neg_all_cut.txt', "w+", encoding='UTF-8')  # output file for the segmented words
    contents = fp_neg.readlines()
    for content in contents:
        word = content.decode("utf-8")  # decode the line to unicode
        word_tag = pseg.cut(word)
        str_tag = ""
        for tag in word_tag:
            str_tag += str(tag.word) + '/' + str(tag.flag)
        p = re.compile(r'/x(.*)')
        str_tag = p.sub(r'\1', str_tag)  # strip the '/x' (non-word) tag
        fp_neg_cut.write(str_tag)
    fp_neg.close()
    fp_neg_cut.close()

# ????????????
Project: Graduation-design    Author: Baichenjia
def handel_weibo_data():
    # read chinese_weibo.txt and keep only the noun-like words of each post
    fp = open("f://emotion/mysite/weibo_crawler/chinese_weibo.txt", 'r')
    weibo_data = []   # one sub-list of nouns per weibo post, e.g. [[nouns of post 1], [nouns of post 2], ...]
    for line in fp.readlines():    # process each line
        contents = []
        line = line.strip()
        line = line.decode('utf-8')
        seg_lines = pseg.cut(line)  # segment with POS tags
        for seg_line in seg_lines:   # keep only noun-like words
            if seg_line.flag == 'n' or seg_line.flag == 'nr' or seg_line.flag == 'ns' or seg_line.flag == 'nt' or seg_line.flag == 'nz':
                contents.append(seg_line.word)  # collect the nouns
        weibo_data.append(contents)
    fp.close()
    return weibo_data
Project: Graduation-design    Author: Baichenjia
def segmentation(sentence):
    seg_list = jieba.cut(sentence)
    seg_result = []
    for w in seg_list:
        seg_result.append(w)
    #print seg_result[:]
    return seg_result

# ??????????????????
Project: Spam-Message-Classifier-sklearn    Author: ZPdesu
def build_analyzer(self):
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')
            words = jieba.cut(new_doc)
            return words
        return analyzer


# TF-IDF vectorization of the parsed text
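build_analyzer is the hook that scikit-learn vectorizers call to obtain their tokenizer, so the override above is presumably defined on a TfidfVectorizer (or CountVectorizer) subclass. A minimal sketch of that pattern; the class name and the usage lines are assumptions, not this project's actual code:

# Minimal sketch, assuming the analyzer above lives in a TfidfVectorizer subclass.
import jieba
import jieba.posseg as pseg
from sklearn.feature_extraction.text import TfidfVectorizer

class ChineseTfidfVectorizer(TfidfVectorizer):   # hypothetical name
    def build_analyzer(self):
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')   # strip punctuation
            return list(jieba.cut(new_doc))
        return analyzer

# Hypothetical usage:
# vectorizer = ChineseTfidfVectorizer()
# tfidf_matrix = vectorizer.fit_transform(corpus)   # corpus: an iterable of raw documents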