Python jieba module: analyse() example source code

We have extracted the following 35 code examples from open-source Python projects to illustrate how to use jieba.analyse().
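
Before the project examples, here is a minimal sketch of the two entry points most of the snippets below rely on: the TF-IDF based jieba.analyse.extract_tags and the graph-based jieba.analyse.textrank. The sample sentence is a placeholder of our own:

import jieba.analyse

text = "我们从开源项目中提取了代码示例"  # hypothetical sample sentence
# TF-IDF keyword extraction: top 5 words together with their weights
print(jieba.analyse.extract_tags(text, topK=5, withWeight=True))
# TextRank keyword extraction; allowPOS restricts the part-of-speech tags considered
print(jieba.analyse.textrank(text, topK=5, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')))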

Project: chat    Author: Decalogue
def get_tag(sentence, config):
    """
    Get semantic tag of sentence.
    """
    iquestion = sentence.format(**config)
    try:
        keywords = analyse.extract_tags(iquestion, topK=1)
        keyword = keywords[0]
    except IndexError:
        keyword = iquestion
    tags = synonym_cut(keyword, 'wf') # tuple list
    if tags:
        tag = tags[0][1]
        if not tag:
            tag = keyword
    else:
        tag = keyword
    return tag
Project: internet-content-detection    Author: liubo0621
def set_stop_words(self, stop_words_path):
        '''
        @summary: Set stop words
        ---------
        @param stop_words_path: path of the stop-words file
        ---------
        @result:
        '''

        abs_path = _get_abs_path(stop_words_path)
        if not os.path.isfile(abs_path):
            raise Exception("jieba: file does not exist: " + abs_path)

        content = open(abs_path, 'rb').read().decode('utf-8')
        for line in content.splitlines():
            self._stop_words.add(line)

        jieba.analyse.set_stop_words(stop_words_path) # also apply the stop words inside the analyse module
Project: my_bit_v1    Author: iSawyer
def text_rank():
    db = query_DB()
    stop_words = load_stopwords()
    for sample in db.get_one():
        author = sample[3]
        title = sample[1]
        content = sample[2]
        reply_number = sample[-1]
        if(author == 'mikki' or author == u'??'):
            continue
        if(reply_number >=3):
            title_seg = jieba.analyse.textrank(title,topK=5,withWeight=True,allowPOS=('ns','n','vn','v'))
            for word,weight in title_seg:
                weight *= 0.7 * (float(reply_number) / max_reply)
                db.write_textrank(word,weight)

        #content_seg = jieba.analyse.textrank(content,topK=8,withWeight=True,allowPOS=('ns','n','vn','v'))
        #for word,weight in content_seg:
            #weight *= 0.3 * (float(reply_number) / max_reply)
            #db.write_textrank(word,weight)
Project: Commodity-analysis    Author: buhuipao
def extract_tags(key_word, a_name):
    '''
    Build the tag string for a product: take the first 8 tokens of the
    product name (the names come from JD listings), keep those that also
    appear among jieba's extracted keywords, put the search keyword first,
    and cap the result at 5 tags.
    '''
    cut_tags = [tag for tag in jieba.cut(a_name)][:8]
    analyse_tags = jieba.analyse.extract_tags(a_name)
    tags = [tag for tag in cut_tags if tag in analyse_tags]
    # make sure the search keyword sits at the front of the tag list
    try:
        tags.remove(key_word)
    except ValueError:
        pass
    tags.insert(0, key_word)
    if len(tags) > 5:
        tags = tags[:5]
    return ' '.join(tags)
Project: NewsSpider    Author: lzjqsdd
def loadDataFromCutFile(self,totalnum):
        doc = []
        cut = Cut()
        for i in range(1,totalnum):
            line = cut.getRow(i,Global.cutnews_dir,Global.filesize)
            if not line:
                break
            data = json.loads(line)
            keyword = analyse.extract_tags(data['content'],topK=20)
            seg = " ".join(keyword)
            print seg
            doc.append(seg)
        return doc


    #calculate tf-idf
Project: QA    Author: KiddoZhu
def __call__(self, question) :
        # print(question.questionSentence)
        qSentence = question.questionSentence
        # question.wordsToken = list(jieba.cut(qSentence))
        question.wordsToken, question.posToken = getPosToken(qSentence)
        assert len(question.wordsToken) == len(question.posToken)
        # print 'Length words Token = %d'%(len(question.wordsToken))
        # print 'Length pos token = %d'%(len(question.posToken))
        question.keyWordToken = list(jieba.analyse.extract_tags(qSentence, topK=5))
        # print ' '.join(question.keyWordToken)
        # dependency = parser.parse(words).next()
        # print '/'.join(question.wordsToken)
        # for word, flag in question.posToken:
        #   print('%s %s'%(word, flag))
        question.questionType, question.answerType = getQuestionType(question.questionSentence)
        question.getAnswerTemp()
        # my_print(question.answerTemp)
        # print question.answerRe
Project: http_server    Author: chenguolin
def cut_with_weight(self, sentence):
        """
        Cut word string with weight

        @sentence: word string

        return list or None
        ["word1`weight1", "word2`weight2" ...]
        """
        try:
            top_k = 2147483647
            seg_list = jieba.analyse.extract_tags(sentence, topK=top_k, withWeight=True)
            return [item[0].encode('utf-8')+'`'+str(item[1]) for item in seg_list]
        except Exception as e:
            logger.error('cut sentence:[%s] exception:[%s]' % (sentence, str(e)))
            return None
Project: CNKICrawler    Author: roliygu
def jieba_example():
    raw = "????S5????,123,?,?"
    raw_seq = jieba.cut(raw)
    raw_seq_list = jieba.lcut(raw)
    raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
    raw_with_ictclas = pseg.cut(raw)
    for word, flag in raw_with_ictclas:
        print word, flag
Project: LagouJob    Author: EclipseXuLu
def get_hot_words(text):
    jieba.analyse.set_stop_words(STOPWORDS_PATH)
    jieba.load_userdict(USER_CORPUS)
    df = pd.DataFrame(jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=()))
    print(df)
    df.to_excel('./hotwords/DM.xlsx', 'DM')
Project: zsky    Author: wenguonideshou
def detail(info_hash):
    conn,curr = sphinx_conn()
    querysql='SELECT * FROM film WHERE info_hash=%s'
    curr.execute(querysql,info_hash)
    result=curr.fetchone()
    sphinx_close(curr,conn)
    #hash=Search_Hash.query.filter_by(id=id).first()
    if not result:
        return redirect(url_for('index'))        
    fenci_list=jieba.analyse.extract_tags(result['name'], 8)
    tags=Search_Tags.query.order_by(Search_Tags.id.desc()).limit(20)
    form=SearchForm()
    return render_template('detail.html',form=form,tags=tags,hash=result,fenci_list=fenci_list,sitename=sitename)
Project: HtmlExtract-Python    Author: xinyi-spark
def jieba_textrank(data, topK=20, withWeight=False, allowPOS=('nz', 'nt', 'ns', 'nr', 'n', 'vn')):
    '''
    Use TextRank to extract keywords. topK: number of keywords to return
    (default 20); withWeight: whether to return each keyword together with
    its weight; allowPOS: the part-of-speech tags that are allowed.
    '''
    keyword_list = []
    # pass the caller's topK through instead of hard-coding 20; weights are
    # requested internally because only w[0] (the word) is kept below
    for w in jieba.analyse.textrank(data, topK=topK, withWeight=True, allowPOS=allowPOS):
        keyword_list.append(w[0])
    keyword = '/'.join(keyword_list)
    return keyword
Project: HtmlExtract-Python    Author: xinyi-spark
def jieba_tfidf(data, topK=20, withWeight=False, allowPOS=('nz', 'nt', 'ns', 'nr', 'n', 'vn')):
    '''
    Use TF-IDF to extract keywords. topK: number of keywords to return
    (default 20); withWeight: whether to return each keyword together with
    its weight; allowPOS: the part-of-speech tags that are allowed.
    '''
    temp_result = jieba.analyse.extract_tags(
        data, topK, withWeight, allowPOS)
    temp_result = '/'.join(temp_result)
    return temp_result
Project: chat    Author: Decalogue
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.
    ??????????????

    If a word in this sentence was not found in the synonym dictionary,
    it will be marked with default value of the word segmentation tool.
    ????????????????????????

    Args:
        pattern: 'w'-??, 'k'-??????'t'-?????, 'wf'-????, 'tf-?????'?
    """
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        result = list(jieba.cut(sentence))
        synonym_vector = [item for item in result if item not in punctuation_all]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # synonym_vector = [(item.word, item.flag) for item in result \
        # if item.word not in punctuation_all]
        # Modify in 2017.4.27 
        for item in result:
            if item.word not in punctuation_all:
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector
Project: search    Author: twd2
def page_tags(request, pk):
    import jieba.analyse
    page = Page.objects.get(pk=pk)
    tags = jieba.analyse.extract_tags(page.content)
    return render(request, 'tags.html', {'title': 'Tags',
                                         'page': page, 'tags': tags})
Project: finance_news_analysis    Author: pskun
def extarctTextRankKeywords(self, doc_str, window=5):
        ''' Use TextRank to extract keywords.
            Reference: http://www.letiantian.me/2014-12-01-text-rank/
        '''
        keywords = jieba.analyse.textrank(doc_str, withWeight=True)
        return keywords
Project: finance_news_analysis    Author: pskun
def initTfidfKeywords(self, idf_file=None):
        ''' Initialise TF-IDF keyword extraction; optionally load a custom IDF file. '''
        self.words_idf = {}
        if idf_file is not None:
            jieba.analyse.set_idf_path(idf_file)
            '''
            for line in codecs.open(idf_file, 'r', 'utf-8'):
                word, idf_value = line.strip().split()
                self.words_idf[word] = float(idf_value)
            pass
            '''
Project: finance_news_analysis    Author: pskun
def extractTfidfKeywords(self, doc_str):
        keywords = jieba.analyse.extract_tags(doc_str, withWeight=True)
        return keywords
Project: Malicious_Domain_Whois    Author: h-j-13
def get_top_words(top, filename):
    topK = top
    content = open(filename, 'rb').read()
    tags = jieba.analyse.extract_tags(content, topK=topK)
    # items = str(tags).replace('u\'', '\'').decode("unicode-escape")
    return tags
Project: internet-content-detection    Author: liubo0621
def cut_for_keyword(self, text, with_weight = False, top_keyword_count = None):
        '''
        @summary: Extract keywords from a text
        ---------
        @param text: the text to analyse
        @param with_weight: whether to return weights; if True each item is (keyword, word_weight)
        @param top_keyword_count: return only the top N keywords; None returns all of them
        ---------
        @result:
        '''
        result = jieba.analyse.extract_tags(text, topK = top_keyword_count, withWeight = with_weight)
        return result
Project: Rnews    Author: suemi994
def extractKeyWordByTFIDF(self,sentence):
        wordList=[]
        if self.conf["threshold"]:
            threshold=self.conf["threshold"]
            tmpList=jieba.analyse.extract_tags(sentence,topK=self.conf["topK"],withWeight=True,allowPOS=self.conf["allowPOS"])
            for pair in tmpList:
                if pair[1]>=threshold:
                    wordList.append(pair[0])
        else:
            wordList=list(jieba.analyse.extract_tags(sentence,topK=self.conf["topK"],withWeight=self.conf["withWeight"],allowPOS=self.conf["allowPOS"]))
        return wordList
Project: Rnews    Author: suemi994
def extractKeyWordByTextRank(self,sentence):
        wordList=[]
        if self.conf["threshold"]:
            threshold=self.conf["threshold"]
            tmpList=jieba.analyse.textrank(sentence,topK=self.conf["topK"],withWeight=True,allowPOS=self.conf["allowPOS"])
            for pair in tmpList:
                if pair[1]>=threshold:
                    wordList.append(pair[0])
        else:
            wordList=list(jieba.analyse.textrank(sentence,topK=self.conf["topK"],withWeight=self.conf["withWeight"],allowPOS=self.conf["allowPOS"]))
        return wordList
Project: AIZooService    Author: zhanglbjames
def __get_model_answer(self, question):
        tag1 = jieba.analyse.extract_tags(question, 3)
        tag2 = jieba.analyse.textrank(question, 3)
        keywords = []

        for tag in tag1:
            keywords.append(tag)
        for tag in tag2:
            if tag not in tag1:
                keywords.append(tag)

        tr4w = TextRank4Keyword()
        tr4w.analyze(text=question, lower=True, window=2)
        for item in tr4w.get_keywords(20, word_min_len=1):
            if item.word not in keywords:
                keywords.append(item.word)

        kstr = ""
        for k in keywords:
            if len(k) != 1:
                kstr = kstr + "AND" + k
            else:
                if k not in kstr:
                    kstr = kstr + "AND" + k
                    # print(k)
        estr = kstr[3:]
        print (estr)
        q = self.__parser.parse(estr)
        results = self.__searcher.search(q)
        return results
Project: wende    Author: h404bi
def keywords_extract(question):
    jieba.analyse.set_stop_words(stopwords)
    rv = jieba.analyse.extract_tags(question, topK=10, withWeight=True)

    return rv
Project: test_jieba    Author: donttal
def participle(content):
    tags = jieba.analyse.extract_tags(content, topK=topK)
    print(tags)
    # avoid shadowing the built-in str
    return '/'.join(tags)
Project: jieba-GAE    Author: liantian-cn
def analyse_tfidf():
    text = request.values.get('text', "text")
    topK = request.values.get("topK", default="20")
    if topK in [str(x) for x in  range(3,41)]:
        topK = int(topK)
    else:
        topK = 20
    withWeight = request.values.get("withWeight", default="0")
    if withWeight in ['0', '1']:
        withWeight = bool(int(withWeight))
    else:
        withWeight = True

    result = list(jieba.analyse.extract_tags(text, topK=topK, withWeight=withWeight))
    return jsonify(text=text, topK=topK, withWeight=withWeight, result=result)
Project: jieba-GAE    Author: liantian-cn
def analyse_textrank():
    text = request.values.get('text', "text")
    topK = request.values.get("topK", default="20")
    if topK in [str(x) for x in  range(3,41)]:
        topK = int(topK)
    else:
        topK = 20
    withWeight = request.values.get("withWeight", default="0")
    if withWeight in ['0', '1']:
        withWeight = bool(int(withWeight))
    else:
        withWeight = True
    result = list(jieba.analyse.textrank(text, topK=topK, withWeight=withWeight))
    return jsonify(text=text, topK=topK, withWeight=withWeight, result=result)
Project: momoCrawler    Author: njames741
def get_keywords(self, all_text):
        kw_list = jieba.analyse.extract_tags(all_text, topK=10, withWeight=False, allowPOS=())
        # return set(kw_list)
        for kw in kw_list:
            print kw
Project: SinaWeiboSpider    Author: SuperSaiyanSSS
def test_if_has_keyword(self, weibo_text):
        content = weibo_text
        tags = jieba.analyse.extract_tags(content, topK=self.topK)

        for tag in tags:
            if tag in self.mingan_list:
                print("6666666")
                print(content)
                print(tag)
                return True
            else:
                print("no")
        return False
Project: appledaily_hk_hot_keyword_pipeline    Author: howawong
def get_keywords(self, content):
        result = pseg.cut(content)
        tags = jieba.analyse.textrank(content, topK=50, withWeight=False, allowPOS=('n',))  # ('n',): a bare ('n') is just a string, not a tuple
        tags = [tag for tag in tags if len(tag) > 2]
        return tags
Project: aibot    Author: Qiware
def insert_into_reverse_dict(self, hash_val, text):
        """
        ????: ??????
        ????:
            @hash: ??text????
            @text: ??text
        ????: ??????????????20%?, ???????, ????????.
        """
        word_num = 0
        weight_avg = 0
        weight_total = 0

        word_list = []
        weight_list = []

        # extract keywords together with their weights
        word_with_weight = jieba.analyse.extract_tags(text, withWeight=True)
        for word, weight in word_with_weight:
            word_num += 1
            weight_total += float(weight)
        if word_num > 0:
            weight_avg = weight_total / word_num
        for word, weight in word_with_weight:
            if weight < (self.rate * weight_avg):
                break
            word_list.append(word)
            weight_list.append(weight)

        # build the keys of the inverted index
        list_len = len(word_list)
        key_list = self.gen_key_list(word_list, weight_list, list_len, self.word_max_len)
        for key in key_list:
            self.reverse_dict.add(key, 100, hash_val)  # record the mapping (key -> hash)
Project: neural_markov    Author: LuxxxLucy
def key_word_extract(s):
    # for x, w in jieba.analyse.textrank(s, withWeight=True):
    #     print('%s %s' % (x, w))
    # for x, w in jieba.analyse.extract_tags(s, withWeight=True):
    #     print('%s %s' % (x, w))
    return jieba.analyse.textrank(s,withWeight=False)[:10]
Project: focus_of_attention-RNN    Author: luochuwei
def get_focus(num_of_post, pid_p_r):
    s = pid_p_r[num_of_post][0]
    for i in pid_p_r[num_of_post][1]:
        s += i
    tfidf_list = jieba.analyse.extract_tags(s, allowPOS = ('ns','n', 'vn', 'v'), withWeight = True)
    text_rank_list = jieba.analyse.textrank(s, allowPOS = ('ns','n', 'vn', 'v'), withWeight = True)
    focus_dic = {}
    for (i,j) in tfidf_list:
        focus_dic[i] = j
    for (i,j) in text_rank_list:
        if i in focus_dic:
            focus_dic[i] += j
        else:
            focus_dic[i] = j
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print num_of_post," post : ", pid_p_r[num_of_post][0]
    print "response : "
    #print focus in response
    for i in pid_p_r[num_of_post][1]:
        word = (' '.join(jieba.cut(i))).split(' ')
        focus_c = []
        for j in word:
            if j in focus_dic:
                focus_c.append((j, focus_dic[j]))
        focus_c = sorted(focus_c, key = lambda x:x[-1], reverse = True)
        if focus_c != []:
            print i.decode('utf-8'),"--> focus is ",focus_c[0][0]
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"




#main
Project: JiaYuan    Author: EclipseXuLu
def segment_text(text):
    # load user dict
    jieba.load_userdict(user_dict)
    # set stop words
    jieba.analyse.set_stop_words(stop_words)
    tags = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=())
    for tag in tags:
        print(str(tag[0]) + "\t" + str(tag[1]))
Project: news-search-engine    Author: 01joy
def construct_dt_matrix(self, files, topK = 200):
        jieba.analyse.set_stop_words(self.stop_words_path)
        jieba.analyse.set_idf_path(self.idf_path)
        M = len(files)
        N = 1
        terms = {}
        dt = []
        for i in files:
            root = ET.parse(self.doc_dir_path + i).getroot()
            title = root.find('title').text
            body = root.find('body').text
            docid = int(root.find('id').text)
            tags = jieba.analyse.extract_tags(title + '?' + body, topK=topK, withWeight=True)
            #tags = jieba.analyse.extract_tags(title, topK=topK, withWeight=True)
            cleaned_dict = {}
            for word, tfidf in tags:
                word = word.strip().lower()
                if word == '' or self.is_number(word):
                    continue
                cleaned_dict[word] = tfidf
                if word not in terms:
                    terms[word] = N
                    N += 1
            dt.append([docid, cleaned_dict])
        dt_matrix = [[0 for i in range(N)] for j in range(M)]
        i = 0
        for docid, t_tfidf in dt:
            dt_matrix[i][0] = docid
            for term, tfidf in t_tfidf.items():
                dt_matrix[i][terms[term]] = tfidf
            i += 1

        dt_matrix = pd.DataFrame(dt_matrix)
        dt_matrix.index = dt_matrix[0]
        print('dt_matrix shape:(%d %d)'%(dt_matrix.shape))
        return dt_matrix
Project: baiduAnalyse    Author: baihao8904
def comparekw():
    begin_id = 400155662
    for i in range(100):
        id = begin_id + i
        try:
            f = open('./text/%d.html/ask.txt' % id , 'r')
            qstr = f.read().decode('utf-8')
            qkw = jieba.analyse.extract_tags(qstr,5)
            # keywords of the question
            list1 = []
            for w in qkw:
                list1.append(str(w.encode('utf-8')))
            print u'question keywords of %d.html extracted' % id
            f.close()
        except:
            print u'failed to read the question of %d.html' % id
            continue
        try:
            f = open('./text/%d.html/bestanswer.txt' % id , 'r')
            astr = f.read().decode('utf-8')
            akw = jieba.analyse.extract_tags(astr,5)
            # keywords of the best answer
            list2 = []
            for w in akw:
                list2.append(str(w.encode('utf-8')))
            print u'best-answer keywords of %d.html extracted' % id
            f.close()
        except:
            print u'failed to read the best answer' + '\n'
            continue
        tmp = [val for val in list1 if val in list2]
        # intersection of the two keyword lists
        if len(tmp) == 0:
            print u'no common keywords' + '\n'
        else:
            print u'question and best answer share keywords; add 30 points' + '\n'
            try:
                f = open('./text/%d.html/keyscore.txt' % id , 'r')
                score = int(f.read())
                score = score + 30
                result = str(score)
                f = open('./text/%d.html/keyscore.txt' % id , 'w')
                f.write(result)
                f.close()
            except:
                with open('./text/%d.html/keyscore.txt' % id , 'w') as file_saved:
                    text = str(30)
                    file_saved.write(text)