Python jieba.posseg module: cut() example source code

The following code examples, extracted from open-source Python projects, illustrate how to use jieba.posseg.cut().
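As a quick reference, pseg.cut() yields pair objects that expose a .word and a .flag (part-of-speech tag) attribute; a minimal sketch, using the sample sentence from jieba's documentation (output depends on jieba's bundled dictionary):

# -*- coding: utf-8 -*-
# Minimal sketch of jieba.posseg.cut().
import jieba.posseg as pseg

for pair in pseg.cut(u"我爱北京天安门"):
    print(pair.word + '/' + pair.flag)   # e.g. 我/r, 爱/v, 北京/ns, 天安门/ns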

Project: feng-python-apply    Author: JiangFeng07
def parse(self, in_file, out_file):
        output_file = open(out_file, 'w')
        with open(in_file, 'r') as file:
            file.readline()  # read and discard the first line
            i = 0
            for line in file.readlines():
                sentence = ""
                line = line.strip().split('\t')
                for word, flag in pseg.cut(line[1].strip()):
                    if flag == 'x':
                        continue
                    else:
                        sentence = sentence + word + " "
                output_file.write(sentence.strip() + "\n")
                i += 1
                if i % 100 == 0:
                    print('Handled %d lines' % i)
        output_file.close()
Project: internet-content-detection    Author: liubo0621
def cut_for_property(self, text):
        '''
        @summary: cut text into (word, POS) pairs, dropping stop words
        ---------
        @param text: input text
        ---------
        @result: a list like [(text1, property1) ... (textN, propertyN)]
        '''
        words_list = []

        words = pseg.cut(text)
        for word in words:
            if word.word not in self._stop_words:
                words_list.append((word.word, word.flag))

        return words_list
Project: ugc.aggregator    Author: Dreamcatcher-GIS
def extract_keyword(self):
        sents = []
        comm_list = self.dao.get_hotel_comments()
        # split every hotel comment into sentences
        for comm in comm_list:
            sents.extend(normal.get_sentences(comm[2]))
        print "length of sentences:%d"%len(sents)
        # POS-tag each sentence
        pos_sents = []
        for sent in sents:
            pos_sents.append(pseg.cut(sent))
        print "length of pos_sents:%d"%len(pos_sents)
        # count nouns and sort by frequency
        print "counting"
        noun_dict = {}
        for pos_sent in pos_sents:
            for key,type in pos_sent:
                if type == "n":
                    if key not in noun_dict:
                        noun_dict[key] = 1
                    else:
                        noun_dict[key] = noun_dict[key] + 1
        a = sorted(noun_dict.iteritems(),key=lambda asd:asd[1],reverse=True)
        return a
Project: Graduation-design    Author: Baichenjia
def handel_weibo(filename):
    fp = open("f://emotion/mysite/Label_extract/weibo_corpus/" + filename, 'r')
    contents = []
    for line in fp.readlines():    # process each line
        line = line.strip()
        line = line.decode('utf-8')
        seg_lines = pseg.cut(line)  # segment with POS tags
        for seg_line in seg_lines:   # keep only noun-like words
            if seg_line.flag == 'n' or seg_line.flag == 'nr' or seg_line.flag == 'ns' or seg_line.flag == 'nt' or seg_line.flag == 'nz':
                contents.append(seg_line.word)  # collect the nouns
    #print "length:", len(contents)
    fp.close()
    # write the extracted nouns to a new file
    fp_handel = open('f://emotion/mysite/Label_extract/weibo_corpus_handel/handel_' + filename, 'w+')
    for content in contents:
        fp_handel.write(content)
        fp_handel.write('\n')
    fp_handel.close()


# 2.???????????30????????????????
Project: Graduation-design    Author: Baichenjia
def read_test_list():
    fp = open("f://emotion/mysite/weibo_crawler/chinese_weibo.txt", 'r')
    contents = []
    for line in fp.readlines():    # process each line
        line = line.strip()
        line = line.decode('utf-8')
        seg_lines = pseg.cut(line)  # segment with POS tags
        for seg_line in seg_lines:   # keep only noun-like words
            if seg_line.flag == 'n' or seg_line.flag == 'nr' or seg_line.flag == 'ns' or seg_line.flag == 'nt' or seg_line.flag == 'nz':
                contents.append(seg_line.word)  # collect the nouns
    fp.close()
    #for w in contents:
    #   print w

    # join the extracted words into a single space-separated string
    str_test = ' '.join(contents)
    return str_test


# 5. Compute TF-IDF over the processed chinese_weibo.txt text and keep the top 100 terms
Project: tnpy    Author: ferventdesert
def MatchItem(self, input, start, end, muststart, mode=None):
        self.LogIn(input, start, end)
        pos = start
        if end is None:
            end = len(input)
        seg_list = pseg.cut(input[start:end] if self.Len == -1 else input[start:start + self.Len])
        sword = None
        for word, flag in seg_list:
            if self.Pos is None:
                sword = word
                break
            else:
                if flag in self.Pos:
                    sword = word
                    break
            pos += len(word)
        if sword is None or pos < 0 or (muststart == True and pos != start):
            self.LogOut(None)
            return start + self.Len if self.Len < 0 else tnpy.int_max
        self.LogOut(sword)
        m = tnpy.MatchResult(self, sword, pos)
        m.rstr = sword
        return m
Project: Chinese_text_classifier    Author: swordLong
def cut_Text(content, nomial=False):
    """
    :param content: string
    :param nomial: if nomial is True, only noun-like words will remain
    :return: a string of words joined by spaces, e.g. 'a b c d'
    """
    if nomial:
        text = ''
        words = pseg.cut(content)
        for word in words:
            if contain(['n'], word.flag):
                text = text + ' ' + word.word
        return text.strip()
    else:
        text = ''
        words = jieba.cut(content)
        for word in words:
            text = text + ' ' + word
        return text.strip()
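A hypothetical call to cut_Text (assuming contain() checks whether a word's POS flag starts with one of the given prefixes); the exact output depends on jieba's dictionary:

# -*- coding: utf-8 -*-
# Hypothetical usage of cut_Text; only noun-like words should survive.
nouns_only = cut_Text(u"今天北京的天气很好", nomial=True)
print(nouns_only)   # e.g. 北京 天气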
Project: Chinese_text_classifier    Author: swordLong
def cut_Dataset(data_set, parrel=False, nomial=False):
    """
    :param data_set: bunch of Dataset
    :param parrel: if True, cut the dataset in parallel (not available on Windows)
    :param nomial: if nomial is True, only noun-like words will remain
    :return: data_set after cutting
    """
    from tqdm import tqdm
    data_cut = []
    start = time.time()
    print('cutting dataset......')
    if parrel:
        p = ThreadPool(9)
        data_cut = p.map(cut_Text, data_set.data)
        p.close()
        p.join()
    else:
        for doc_content in tqdm(data_set.data):
            data_cut.append(cut_Text(doc_content, nomial))
    end = time.time()
    print('cutting runs %0.2f seconds.' % (end - start))
    data_set.data = data_cut
Project: TextClassification    Author: mosu027
def splitWord(self, content):

        segs = pseg.cut(str(content))
        result = []
        for word, type in segs:
            WORD = Word()
            if self.wordtypeDict.has_key(word):
                WORD.setword(word)
                WORD.settype(self.wordtypeDict[word])
                WORD.setfreq(self.wordfreqDict[word])
            else:
                WORD.setword(word)
                WORD.settype(type)
            # print "word ", word
            result.append(WORD)
        return result
Project: nlp    Author: aaronz
def get_word_list(self, text, lower=True, strip_stop_words=True, use_tag_filter=False):
        text = util.as_text(text)
        jieba_result = pseg.cut(text)

        if use_tag_filter:
            jieba_result = [
                w for w in jieba_result if w.flag in self.default_tag_filter]
        else:
            jieba_result = [w for w in jieba_result]

        word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
        word_list = [word for word in word_list if len(word) > 0]

        if lower:
            word_list = [word.lower() for word in word_list]

        if strip_stop_words:
            word_list = [word.strip()
                         for word in word_list if word.strip() not in self.stop_words]

        return word_list
Project: easy-expression    Author: earlybackhome
def load(self):
        from gensim.models import Word2Vec
        # store for the candidate sentences loaded from the text file
        self.link_database = []
        # load the word2vec model
        self.vecmodel = Word2Vec.load(self.model_file)
        log.info('???????')
        log.info('???????')
        with open(self.txt_file) as fp:
            senten_list = fp.readlines()
            log.debug("senten%s", senten_list)
            for senten_txt in senten_list:
                self.link_database.append(Senten2vec(senten_txt))
        log.info('???????????')
        for link in self.link_database:
            link.sentence_word = (set(jieba.cut(link.sentence)))

        for link in self.link_database:
            link.sentence_vec = {word for word in link.sentence_word if word in self.vecmodel.wv.index2word}

        log.info('???????')
    # ????????????n???
Project: easy-expression    Author: earlybackhome
def juziSim_vec(self, intxt, questionWordset, posWeight=None):  # juziIn??????juziLi???????
        if posWeight == None:
            log.warning('there is no posWeight')
            return 0
        intxtSet = set(list(pseg.cut(intxt)))
        if not len(intxtSet):
            return 0
        simWeight = 0
        totalWeight = 0
        for word, pos in intxtSet:
            if word in self.vecmodel.wv.index2word:
                wordPosWeight = posWeight.get(pos, 1)
                totalWeight += wordPosWeight

                wordMaxWeight = 0
                for t in questionWordset:
                    # print(word, t)
                    tmp = self.vecmodel.wv.similarity(word, t)
                    if wordMaxWeight < tmp:
                        wordMaxWeight = tmp
                simWeight += wordPosWeight * wordMaxWeight
        if totalWeight == 0:
            return 0
        return simWeight/totalWeight
Project: QA    Author: KiddoZhu
def __call__(self, question) :
        # print(question.questionSentence)
        qSentence = question.questionSentence
        # question.wordsToken = list(jieba.cut(qSentence))
        question.wordsToken, question.posToken = getPosToken(qSentence)
        assert len(question.wordsToken) == len(question.posToken)
        # print 'Length words Token = %d'%(len(question.wordsToken))
        # print 'Length pos token = %d'%(len(question.posToken))
        question.keyWordToken = list(jieba.analyse.extract_tags(qSentence, topK=5))
        # print ' '.join(question.keyWordToken)
        # dependency = parser.parse(words).next()
        # print '/'.join(question.wordsToken)
        # for word, flag in question.posToken:
        #   print('%s %s'%(word, flag))
        question.questionType, question.answerType = getQuestionType(question.questionSentence)
        question.getAnswerTemp()
        # my_print(question.answerTemp)
        # print question.answerRe
Project: SmartQA    Author: jianke03
def ansFind(wikiList, typeInfo, Ques,obj):
    wordList = convert.solve(Ques)
    keyList =  convert.getKeyWords(wordList)
    # drop words whose POS tag starts with "u", "x" or "p"
    wordList = [w for w in wordList if not (w[1].startswith("u") or w[1].startswith("x") or w[1].startswith("p"))]

    sourceList = []
    for i in range(len(wikiList)):
        words = pseg.cut(wikiList[i])
        relevantList = []
        for w in words:
            wordsGroup = [w.word,w.flag]
            relevantList.append(wordsGroup)
        sourceList.append(relevantList)

    typeStr = ansExtract.getTypeStr(typeInfo)
    ansList = ansExtract.check(sourceList, wordList, typeStr, typeInfo,obj)
    return ansDecide.chooseAns(ansList, typeStr,typeInfo,obj)
Project: cnn-svm-chinese-text-classification    Author: zpppy
def jiebafenci(all_the_text):
    re = ""
    relist = ""
    words = pseg.cut(all_the_text)
    count = 0
    for w in words:
        flag = w.flag  # POS tag
        tmp = w.word   # the word itself
        #print "org: "+tmp
        # \u4e00-\u9fa5 is the Unicode range of common Chinese characters:
        # keep multi-character Chinese words whose POS is not excluded by flag_list
        if len(tmp)>1 and len(flag)>0 and flag[0] not in flag_list and tmp[0]>=u'\u4e00' and tmp[0]<=u'\u9fa5':
            re = re + " " + w.word
    re = re.replace("\n"," ").replace("\r"," ")   
    if  len(re)>40:
        relist = re
    relist = relist + "\n"
    return relist
Project: cnn-svm-chinese-text-classification    Author: zpppy
def getTrainData(inpath,outfile):
    i=0
    for filename in os.listdir(inpath):

        fw = open(outfile+str(i)+".cut","w")  # one output file per input document
        i=i+1
        file_object = open(inpath+"\\"+filename,'r', encoding='UTF-8')
        try:
            all_the_text = file_object.read()

            #all_the_text = all_the_text.decode("gb2312").encode("utf-8")
            pre_text = jiebafenci(all_the_text)
            pre_text.encode('UTF-8')

            if len(pre_text)>30:
                fw.write(pre_text)
        except:
            print('@'*20)
            pass
        finally:
            file_object.close()
            fw.close()
#['C000008', 'C000010', 'C000013', 'C000014', 'C000016', 'C000020', 'C000022','C000023', 'C000024']
Project: CNKICrawler    Author: roliygu
def jieba_example():
    raw = "????S5????,123,?,?"
    raw_seq = jieba.cut(raw)
    raw_seq_list = jieba.lcut(raw)
    raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
    raw_with_ictclas = pseg.cut(raw)
    for word, flag in raw_with_ictclas:
        print word, flag
Project: entity-linker    Author: seucs
def jaccard_similarity_score(context1, context2, flag1, flag2):
    #print 'context1', context1
    try:
        if flag1 and len(context1)!=0:
            temp = context1[-1]
            context1.pop()
            context1 += list(pseg.cut(temp))
        if flag2 and len(context2)!=0:
            temp = context2[-1]
            context2.pop()
            context2 += list(pseg.cut(temp))
    except:
        pass

    mySet = set(context1 + context2)
    a1 = []
    a2 = []
    for item in mySet:
        if item in context1:
            a1.append(1)
        else:
            a1.append(0)
        if item in context2:
            a2.append(1)
        else:
            a2.append(0)
    #print sklearn.metrics.jaccard_similarity_score(a1,a2)
    return sklearn.metrics.jaccard_similarity_score(a1,a2)

# contextSim between element[i] and element[j]
Project: sentiment-analysis    Author: kasheemlew
def parse():
    """parse the comments"""
    import jieba
    import jieba.posseg as pseg

    # Load User's Dictionary
    path_list = os.getcwd().split('/')
    path_list.append("dict.txt")
    dict_path = '/'.join(path_list)
    jieba.load_userdict(dict_path)

    # Dismiss These Flags
    dismiss = ['b', 'c', 'r', 'uj', 'u', 'p', 'q', 'uz', 't', 'ul', 'k', 'f',
            'ud', 'ug', 'uv']

    comments = Comment.query.all()
    for comment in comments:
        word_list = []
        pseg_cut = pseg.cut(comment.body)
        for word, flag in pseg_cut:
            if flag not in dismiss:
                word_list.append(word)
        comment.parsed = '/'.join(word_list)
        db.session.add(comment)
        print "Comment %04d Parsed!" % comment.id

    db.session.commit()
    print "ALL DONE!"
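jieba.load_userdict, used above, expects a plain-text dictionary with one entry per line in the form "word [frequency] [POS tag]", where frequency and tag are optional. A minimal sketch of creating and loading such a file (the entries below are made-up examples):

# -*- coding: utf-8 -*-
# Minimal sketch of a jieba user dictionary.
import io
import jieba

with io.open("dict.txt", "w", encoding="utf-8") as f:
    f.write(u"云计算 5 n\n")
    f.write(u"自然语言处理 3 n\n")

jieba.load_userdict("dict.txt")   # words listed in dict.txt will no longer be split apart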
Project: chat    Author: Decalogue
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.
    If a word in this sentence was not found in the synonym dictionary,
    it will be marked with the default value of the word segmentation tool.

    Args:
        pattern: 'w' - words only, 'k' - top-1 keyword, 't' - top-10 keywords,
                 'wf' - (word, flag) pairs, 'tf' - (word, flag) pairs for the top-10 keywords only.
    """
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        result = list(jieba.cut(sentence))
        synonym_vector = [item for item in result if item not in punctuation_all]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # synonym_vector = [(item.word, item.flag) for item in result \
        # if item.word not in punctuation_all]
        # Modify in 2017.4.27 
        for item in result:
            if item.word not in punctuation_all:
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector
Project: JustCopy    Author: exe1023
def segment(self, text, lower = True, use_stop_words = True, use_speech_tags_filter = False):
        """Segment text into a word list, with optional filtering.

        Keyword arguments:
        lower                  -- whether to convert words to lower case
        use_stop_words         -- if True, words found in self.stop_words are removed
        use_speech_tags_filter -- if True, keep only words whose POS tag is in self.default_speech_tag_filter
        """
        text = util.as_text(text)
        jieba_result = pseg.cut(text)

        if use_speech_tags_filter == True:
            jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter]
        else:
            jieba_result = [w for w in jieba_result]

        # drop non-word tokens (POS flag 'x') and empty strings
        word_list = [w.word.strip() for w in jieba_result if w.flag!='x']
        word_list = [word for word in word_list if len(word)>0]

        if lower:
            word_list = [word.lower() for word in word_list]

        if use_stop_words:
            word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]

        return word_list
Project: PTTChatBot_DL2017    Author: thisray
def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
        """
        Extract keywords from sentence using TextRank algorithm.
        Parameter:
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                        if the POS of w is not in this list, it will be filtered.
            - withFlag: if True, return a list of pair(word, weight) like posseg.cut
                        if False, return a list of words
        """
        self.pos_filt = frozenset(allowPOS)
        g = UndirectWeightedGraph()
        cm = defaultdict(int)
        words = tuple(self.tokenizer.cut(sentence))
        for i, wp in enumerate(words):
            if self.pairfilter(wp):
                for j in xrange(i + 1, i + self.span):
                    if j >= len(words):
                        break
                    if not self.pairfilter(words[j]):
                        continue
                    if allowPOS and withFlag:
                        cm[(wp, words[j])] += 1
                    else:
                        cm[(wp.word, words[j].word)] += 1

        for terms, w in cm.items():
            g.addEdge(terms[0], terms[1], w)
        nodes_rank = g.rank()
        if withWeight:
            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)

        if topK:
            return tags[:topK]
        else:
            return tags
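This method appears to be taken from jieba's own keyword-extraction module; in application code the same TextRank extractor is reachable as jieba.analyse.textrank. A minimal usage sketch (the sample sentence and the printed weights are illustrative only):

# -*- coding: utf-8 -*-
# Minimal sketch: TextRank keyword extraction via jieba.analyse.textrank.
import jieba.analyse

text = u"此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元，增资后，吉林欧亚置业注册资本由7000万元增加到5亿元。"
for word, weight in jieba.analyse.textrank(text, topK=5, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v')):
    print('%s %.4f' % (word, weight))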
Project: ChineseSA    Author: cwlseu
def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
        """
        Extract keywords from sentence using TextRank algorithm.
        Parameter:
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                        if the POS of w is not in this list, it will be filtered.
            - withFlag: if True, return a list of pair(word, weight) like posseg.cut
                        if False, return a list of words
        """
        self.pos_filt = frozenset(allowPOS)
        g = UndirectWeightedGraph()
        cm = defaultdict(int)
        words = tuple(self.tokenizer.cut(sentence))
        for i, wp in enumerate(words):
            if self.pairfilter(wp):
                for j in xrange(i + 1, i + self.span):
                    if j >= len(words):
                        break
                    if not self.pairfilter(words[j]):
                        continue
                    if allowPOS and withFlag:
                        cm[(wp, words[j])] += 1
                    else:
                        cm[(wp.word, words[j].word)] += 1

        for terms, w in cm.items():
            g.addEdge(terms[0], terms[1], w)
        nodes_rank = g.rank()
        if withWeight:
            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)

        if topK:
            return tags[:topK]
        else:
            return tags
Project: ParseLawDocuments    Author: FanhuaandLuomu
def cut(filename1,filename2): # segment each line of filename1 (seg and pos) and write the words to filename2
    f=open(filename2,'w')
    for line in open(filename1):
        res=pseg.cut(line.strip())
        split_line=' '.join([w.word for w in res])+'\n'
        f.write(split_line.encode('utf-8'))
    # print '%s split successful' %(filename1)
Project: ParseLawDocuments    Author: FanhuaandLuomu
def main():
    source_path,target_path=sys.argv[1],sys.argv[2]
    source_files,target_files=getFileList(source_path,target_path)
    # print fileList
    for filename1,filename2 in zip(source_files,target_files):
        cut(filename1,filename2)
Project: ParseLawDocuments    Author: FanhuaandLuomu
def cut(contents):  # segment each line into space-separated words
    split_contents=[]
    for line in contents:
        res=pseg.cut(line.strip())
        split_line=' '.join([w.word for w in res])
        split_contents.append(split_line)
    return split_contents
Project: ParseLawDocuments    Author: FanhuaandLuomu
def main():
    source_file='law_text.txt'
    law_text_list=readFromFile(source_file)
    print len(law_text_list)

    split_contents=cut(law_text_list)
    # cPickle.dump(split_contents,open('split_law_text.pkl','wb'))
    print len(split_contents)

    # for item in law_text_list:
    #   print item

    print law_text_list[1].strip()
    print split_contents[1].strip()
Project: Malicious_Domain_Whois    Author: h-j-13
def testDefaultCut(self):
        for content in test_contents:
            result = jieba.cut(content)
            assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
            result = list(result)
            assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
            print(" , ".join(result), file=sys.stderr)
        print("testDefaultCut", file=sys.stderr)
Project: Malicious_Domain_Whois    Author: h-j-13
def testCutAll(self):
        for content in test_contents:
            result = jieba.cut(content, cut_all=True)
            assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
            result = list(result)
            assert isinstance(result, list), "Test CutAll error on content: %s" % content
            print(" , ".join(result), file=sys.stderr)
        print("testCutAll", file=sys.stderr)
Project: Malicious_Domain_Whois    Author: h-j-13
def testSetDictionary(self):
        jieba.set_dictionary("foobar.txt")
        for content in test_contents:
            result = jieba.cut(content)
            assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
            result = list(result)
            assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
            print(" , ".join(result), file=sys.stderr)
        print("testSetDictionary", file=sys.stderr)
Project: Malicious_Domain_Whois    Author: h-j-13
def testPosseg(self):
        import jieba.posseg as pseg
        for content in test_contents:
            result = pseg.cut(content)
            assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Posseg error on content: %s" % content
            print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
        print("testPosseg", file=sys.stderr)
Project: Malicious_Domain_Whois    Author: h-j-13
def testDefaultCut_NOHMM(self):
        for content in test_contents:
            result = jieba.cut(content,HMM=False)
            assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
            result = list(result)
            assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
            print(" , ".join(result), file=sys.stderr)
        print("testDefaultCut_NOHMM", file=sys.stderr)
Project: Malicious_Domain_Whois    Author: h-j-13
def cuttest(test_sent):
    result = pseg.cut(test_sent, HMM=False)
    for word, flag in result:
        print(word, "/", flag, ", ", end=' ')
    print("")
Project: Malicious_Domain_Whois    Author: h-j-13
def cuttest(test_sent):
    result = pseg.cut(test_sent)
    for word, flag in result:
        print(word, "/", flag, ", ", end=' ')
    print("")
Project: Malicious_Domain_Whois    Author: h-j-13
def cuttest(test_sent):
    result = pseg.cut(test_sent)
    for w in result:
        print(w.word, "/", w.flag, ", ", end=' ')  
    print("")
Project: Malicious_Domain_Whois    Author: h-j-13
def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
        """
        Extract keywords from sentence using TextRank algorithm.
        Parameter:
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                        if the POS of w is not in this list, it will be filtered.
            - withFlag: if True, return a list of pair(word, weight) like posseg.cut
                        if False, return a list of words
        """
        self.pos_filt = frozenset(allowPOS)
        g = UndirectWeightedGraph()
        cm = defaultdict(int)
        words = tuple(self.tokenizer.cut(sentence))
        for i, wp in enumerate(words):
            if self.pairfilter(wp):
                for j in xrange(i + 1, i + self.span):
                    if j >= len(words):
                        break
                    if not self.pairfilter(words[j]):
                        continue
                    if allowPOS and withFlag:
                        cm[(wp, words[j])] += 1
                    else:
                        cm[(wp.word, words[j].word)] += 1

        for terms, w in cm.items():
            g.addEdge(terms[0], terms[1], w)
        nodes_rank = g.rank()
        if withWeight:
            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)

        if topK:
            return tags[:topK]
        else:
            return tags
Project: chat_logs_analysis_for_qq    Author: q673230559
def get_hot_noun_counts(source_file):
    f = open(source_file, "r")
    data = f.read()
    re_pat = r'[\d-]{10}\s[\d:]{7,8}\s+[^\n]+\d{5,11}\)'  # matches chat-log headers like '2016-06-24 15:42:52  <nickname>(40**21)'
    # li=re.findall(re_pat,data)
    li_content = re.split(re_pat, data)
    s = ""
    for l in li_content:
        s = s + l
    seg_list = pseg.cut(s.strip())
    lists = []
    for w in seg_list:
        if (w.flag == "ns"):
            lists.append(w.word)
    # print("******?????**0?kp-****")
    # print("???????",len(lists))
    seg_list_norepeat = set(lists)
    # print("???????",len(seg_list_noRepeat))
    word_set = {}
    for seg in seg_list_norepeat:
        count = 0
        for ss in lists:
            if (ss == seg):
                count += 1
        word_set[seg] = count
    word_tuple_sort = sorted(word_set.items(), key=lambda e: e[1], reverse=True)
    return word_tuple_sort
Project: internet-content-detection    Author: liubo0621
def cut(self, text, cut_all = False):
        '''
        @summary: word segmentation
        ---------
        @param text: text to segment
        @param cut_all: True for full mode, False for accurate mode.
          Full mode scans out every word combination the dictionary can form, which is fast
          but cannot resolve ambiguity; accurate mode cuts the sentence into its most
          likely segmentation and is better suited to text analysis.
        ---------
        @result:
        '''
        result = list(jieba.cut(text, cut_all = cut_all))
        result = self.__del_stop_key(result)
        return result
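To make the full-mode/accurate-mode distinction in the docstring concrete, a minimal sketch comparing the two modes of jieba.cut (the sample sentence comes from jieba's documentation):

# -*- coding: utf-8 -*-
# Minimal sketch: full mode lists every word the dictionary can form, accurate mode picks one segmentation.
import jieba

sentence = u"我来到北京清华大学"
print("/ ".join(jieba.cut(sentence, cut_all=True)))    # full mode
print("/ ".join(jieba.cut(sentence, cut_all=False)))   # accurate mode (default)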
Project: SentimentPolarityAnalysis    Author: chaoming0625
def __is_clause_pattern3(self, the_clause, seg_result):
        for a_phrase in self.__phrase_dict:
            keys = a_phrase.keys()
            to_compile = a_phrase["key"].replace("……", "[\u4e00-\u9fa5]*")

            if "start" in keys:
                to_compile = to_compile.replace("*", "{" + a_phrase["start"] + "," + a_phrase["end"] + "}")
            if "head" in keys:
                to_compile = a_phrase["head"] + to_compile

            match = re.compile(to_compile).search(the_clause)
            if match is not None:
                can_continue = True
                pos = [flag for word, flag in posseg.cut(match.group())]
                if "between_tag" in keys:
                    if a_phrase["between_tag"] not in pos and len(pos) > 2:
                        can_continue = False

                if can_continue:
                    for i in range(len(seg_result)):
                        if seg_result[i].word in match.group():
                            try:
                                if seg_result[i + 1].word in match.group():
                                    return self.__emotional_word_analysis(
                                        a_phrase["key"] + ":" + match.group(), a_phrase["value"],
                                        [x for x, y in seg_result], i)
                            except IndexError:
                                return self.__emotional_word_analysis(
                                    a_phrase["key"] + ":" + match.group(), a_phrase["value"],
                                    [x for x, y in seg_result], i)
        return ""
Project: ugc.aggregator    Author: Dreamcatcher-GIS
def extract_keyword_by_thulac(self):
        sents = []
        comm_list = self.dao.get_hotel_comments()
        # split every hotel comment into sentences
        for comm in comm_list:
            sents.extend(normal.get_sentences(comm[2]))
        print "length of sentences:%d"%len(sents)
        # POS-tag each sentence with thulac
        pos_sents = []
        for sent in sents:
            try:
                pos_sents.append(map(lambda x: x.split("_"), self.thu.cut(sent.encode("utf-8"))))
            except:
                print sent
                continue
        print "length of pos_sents:%d"%len(pos_sents)
        # count nouns and sort by frequency
        print "counting"
        noun_dict = {}
        for pos_sent in pos_sents:
            for word in pos_sent:
                if word[1] == "n":
                    if word[0] not in noun_dict:
                        noun_dict[word[0]] = 1
                    else:
                        noun_dict[word[0]] = noun_dict[word[0]] + 1
        a = sorted(noun_dict.iteritems(),key=lambda asd:asd[1],reverse=True)
        return a
Project: AIZooService    Author: zhanglbjames
def segment(self, text, lower = True, use_stop_words = True, use_speech_tags_filter = False):
        """Segment text into a word list, with optional filtering.

        Keyword arguments:
        lower                  -- whether to convert words to lower case
        use_stop_words         -- if True, words found in self.stop_words are removed
        use_speech_tags_filter -- if True, keep only words whose POS tag is in self.default_speech_tag_filter
        """
        text = util.as_text(text)
        jieba_result = pseg.cut(text)

        if use_speech_tags_filter == True:
            jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter]
        else:
            jieba_result = [w for w in jieba_result]

        # drop non-word tokens (POS flag 'x') and empty strings
        word_list = [w.word.strip() for w in jieba_result if w.flag!='x']
        word_list = [word for word in word_list if len(word)>0]

        if lower:
            word_list = [word.lower() for word in word_list]

        if use_stop_words:
            word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]

        return word_list
Project: KnowledgeGraph-QA-Service    Author: kangzhun
def seg(self, sentence):
        words = list()
        tags = list()
        for item in pseg.cut(sentence):
            words.append(item.word)
            tags.append(item.flag)
        return words, tags
Project: Graduation-design    Author: Baichenjia
def jieba_cut():
    # process the pos_all_dict file
    fp_pos = open("hownet/pos_all_dict.txt", "r")   # HowNet positive-word dictionary
    fp_pos_cut = codecs.open('hownet/pos_all_cut.txt', "w+", encoding='UTF-8')  # output file for the segmented words
    contents = fp_pos.readlines()
    for content in contents:
        word = content.decode("utf-8")  # decode the line to unicode
        word_tag = pseg.cut(word)
        str_tag = ""
        for tag in word_tag:
            str_tag += str(tag.word) + '/' + str(tag.flag)
        p = re.compile(r'/x(.*)')
        str_tag = p.sub(r'\1', str_tag)   # strip the '/x' (non-word) tag
        fp_pos_cut.write(str_tag)
    fp_pos.close()
    fp_pos_cut.close()

    # process the neg_all_dict file
    fp_neg = open("hownet/neg_all_dict.txt", "r")   # HowNet negative-word dictionary
    fp_neg_cut = codecs.open('hownet/neg_all_cut.txt', "w+", encoding='UTF-8')  # output file for the segmented words
    contents = fp_neg.readlines()
    for content in contents:
        word = content.decode("utf-8")  # decode the line to unicode
        word_tag = pseg.cut(word)
        str_tag = ""
        for tag in word_tag:
            str_tag += str(tag.word) + '/' + str(tag.flag)
        p = re.compile(r'/x(.*)')
        str_tag = p.sub(r'\1', str_tag)  # strip the '/x' (non-word) tag
        fp_neg_cut.write(str_tag)
    fp_neg.close()
    fp_neg_cut.close()

# ????????????
Project: Graduation-design    Author: Baichenjia
def handel_weibo_data():
    # read chinese_weibo.txt and keep only the noun-like words of each post
    fp = open("f://emotion/mysite/weibo_crawler/chinese_weibo.txt", 'r')
    weibo_data = []   # one sub-list of nouns per weibo post, e.g. [[nouns of post 1], [nouns of post 2], ...]
    for line in fp.readlines():    # process each line
        contents = []
        line = line.strip()
        line = line.decode('utf-8')
        seg_lines = pseg.cut(line)  # segment with POS tags
        for seg_line in seg_lines:   # keep only noun-like words
            if seg_line.flag == 'n' or seg_line.flag == 'nr' or seg_line.flag == 'ns' or seg_line.flag == 'nt' or seg_line.flag == 'nz':
                contents.append(seg_line.word)  # collect the nouns
        weibo_data.append(contents)
    fp.close()
    return weibo_data
Project: Graduation-design    Author: Baichenjia
def segmentation(sentence):
    seg_list = jieba.cut(sentence)
    seg_result = []
    for w in seg_list:
        seg_result.append(w)
    #print seg_result[:]
    return seg_result

# ??????????????????
Project: Spam-Message-Classifier-sklearn    Author: ZPdesu
def build_analyzer(self):
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')
            words = jieba.cut(new_doc)
            return words
        return analyzer


# TF-IDF vectorization of the parsed text
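build_analyzer is the hook that scikit-learn vectorizers call to obtain their tokenizer, so the override above is presumably defined on a TfidfVectorizer (or CountVectorizer) subclass. A minimal sketch of that pattern; the class name and the usage lines are assumptions, not this project's actual code:

# Minimal sketch, assuming the analyzer above lives in a TfidfVectorizer subclass.
import jieba
import jieba.posseg as pseg
from sklearn.feature_extraction.text import TfidfVectorizer

class ChineseTfidfVectorizer(TfidfVectorizer):   # hypothetical name
    def build_analyzer(self):
        def analyzer(doc):
            words = pseg.cut(doc)
            new_doc = ''.join(w.word for w in words if w.flag != 'x')   # strip punctuation
            return list(jieba.cut(new_doc))
        return analyzer

# Hypothetical usage:
# vectorizer = ChineseTfidfVectorizer()
# tfidf_matrix = vectorizer.fit_transform(corpus)   # corpus: an iterable of raw documents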