Python jieba module: cut() code examples

We extracted the following 46 code examples from open-source Python projects to illustrate how to use jieba.cut().
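Before the project examples, a minimal sketch of the call patterns that recur below (the sample sentence is the one used in jieba's own README, purely for illustration): cut() returns a generator of unicode tokens, cut_all=True switches to full mode, and lcut() returns a list directly.

# -*- coding: utf-8 -*-
import jieba

sentence = u"我来到北京清华大学"   # illustrative sample text

# default (precise) mode: the best single segmentation, returned lazily as a generator
print(" / ".join(jieba.cut(sentence)))

# full mode: every dictionary word found in the text, overlaps included
print(" / ".join(jieba.cut(sentence, cut_all=True)))

# lcut() behaves like cut() but returns a list
print(jieba.lcut(sentence))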

Project: question-classification-cnn-rnn-attention    Author: sefira    | project source | file source
def WordBeark():
    logger.info("running Word Beark in " + path + data)

    inputfile = path + data + ".zhs"
    outputfile = path + data + ".wordbreak"
    i = 0
    output = open(outputfile, 'w')
    input = open(inputfile, 'r')

    for line in input.readlines():
        seg_list = jieba.cut(line)
        output.write(u' '.join(seg_list))

        i = i + 1
        if (i % 10000 == 0):
            logger.info("Cut " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles in " + outputfile)
Project: question-classification-cnn-rnn-attention    Author: sefira    | project source | file source
def WordBeark():
    logger.info("running Word Beark in " + path + data)

    inputfile = path + data + ".zhs"
    outputfile = path + data + ".wordbreak"
    i = 0
    output = open(outputfile, 'w')
    input = open(inputfile, 'r')

    for line in input.readlines():
        seg_list = jieba.cut(line)
        output.write(u' '.join(seg_list))

        i = i + 1
        if (i % 10000 == 0):
            logger.info("Cut " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles in " + outputfile)
Project: QA    Author: S-H-Y-GitHub    | project source | file source
def sentenceToIndex(sentence, word2idx, maxLen):
    """
    ??????????embeddings??????

    :param sentence: ??
    :param word2idx: ?????
    :param maxLen: ???????
    :return: ??????????
    """
    unknown = word2idx.get("UNKNOWN", 0)
    num = word2idx.get("NUM", len(word2idx))
    index = [unknown] * maxLen
    i = 0
    for word in jieba.cut(sentence):
        if word in word2idx:
            index[i] = word2idx[word]
        else:
            if re.match(r"\d+", word):
                index[i] = num
            else:
                index[i] = unknown
        if i >= maxLen - 1:
            break
        i += 1
    return index
Project: QAServer    Author: fssqawj    | project source | file source
def bm25(p, titles, answers, scores):
    original_titles = copy.deepcopy(titles)
    titles = [remove_punctuation_re(title) for title in titles]
    answers = [remove_punctuation_re(answer) for answer in answers]
    p = remove_punctuation_re(p)
    titles = [' '.join(jieba.cut(title)) for title in titles]
    p = ' '.join(jieba.cut(p))
    wordindoc, wordindata, doclen, sumlen = init(titles, False)
    global avglen
    avglen = 1.0 * sumlen / N
    res = search(p, zip(titles, original_titles, answers, scores), wordindoc, wordindata, doclen)
    titles, answers, scores = [], [], []
    for key, _ in res:
        titles.append(key[0])
        answers.append(key[1])
        scores.append(key[2])
    return titles, answers, scores
Project: gctag    Author: Fenghuapiao    | project source | file source
def get_html_text(url):
    response = requests.get(url)
    origin_text = response.text
    origin_text = re.sub(r'<script.*?>.*?</script>', '', origin_text, flags=re.I | re.M | re.DOTALL)
    origin_text = re.sub(r'<style.*?>.*?</style>', '', origin_text, flags=re.I | re.M | re.DOTALL)

    doc = html.fromstring(origin_text)
    text = doc.xpath('//body//text()')
    text = [i.strip() for i in text if i.strip()]
    text = ' '.join(text)
    seg = jieba.cut(text)

    stopwords = read_stopwords('./utils/stopwords.txt') # callable read_stopwords()
    seg = [i.strip() for i in seg if i.strip() and not i.strip().isdigit()
           and i.strip() not in stopwords]
    seg = ' '.join(seg)

    return seg
Project: tensorflow-deep-qa    Author: shuishen112    | project source | file source
def overlap_index(question,answer,q_len,a_len,stopwords = []):
    qset = set(cut(question))
    aset = set(cut(answer))

    q_index = np.zeros(q_len)
    a_index = np.zeros(a_len)

    overlap = qset.intersection(aset)
    for i,q in enumerate(cut(question)[:q_len]):
        value = 1
        if q in overlap:
            value = 2
        q_index[i] = value
    for i,a in enumerate(cut(answer)[:a_len]):
        value = 1
        if a in overlap:
            value = 2
        a_index[i] = value
    return q_index,a_index
Project: tensorflow-deep-qa    Author: shuishen112    | project source | file source
def overlap_index(question,answer,q_len,a_len,stopwords = []):
    qset = set(cut(question))
    aset = set(cut(answer))

    q_index = np.zeros(q_len)
    a_index = np.zeros(a_len)

    overlap = qset.intersection(aset)
    for i,q in enumerate(cut(question)[:q_len]):
        value = 1
        if q in overlap:
            value = 2
        q_index[i] = value
    for i,a in enumerate(cut(answer)[:a_len]):
        value = 1
        if a in overlap:
            value = 2
        a_index[i] = value
    return q_index,a_index
Project: tensorflow-deep-qa    Author: shuishen112    | project source | file source
def ma_overlap_zi(row):
    question = cut(row["question"])
    answer = cut(row["answer"])

    di_question = []
    di_answer = []
    for w in question:
        for i in range(len(w) ):
            di_question.append(w[i])
    for w in answer:

        for i in range(len(w) ):
            di_answer.append(w[i])

    di_overlap = set(di_question).intersection(set(di_answer) )

    di_weight_p = dict({})
    for k in range(len(di_question) ):
        if di_question[k] in di_overlap:
            # print int(100*((k+1)/(len(question)+1)) )
            di_weight_p[di_question[k]] = ((k + 1) / len(di_question)) ** 3.2  # zi_weight[int(100 * ((k + 1) / (len(di_question) + 1)))]
    di_weight_all = 0.0
    for k in di_overlap:
        di_weight_all += di_weight_p[k]
    return di_weight_all /(len(di_answer)+40)
Project: HtmlExtract-Python    Author: xinyi-spark    | project source | file source
def get_word_count(filename):
    data_source=open(filename,'r')
    data=data_source.read()
    if(data!=''):
        temp_result = jieba.cut(data,cut_all=True)
        temp_result = '/'.join(temp_result)
        word_result=temp_result.split('/')
        word_view={}  # word_view[i] marks whether word i has already been counted for this document (used for document frequency)
        for i in word_result:
            word_view[i]=0
            if(i not in word_doc):
                word_doc[i]=0
        for i in word_result:
            if(word_view[i]==0):
                word_view[i]=1;
                word_doc[i]=word_doc[i]+1
Project: JustCopy    Author: exe1023    | project source | file source
def print2file(f, title, responses, marker = '', separater = True):
    if marker != '':
        f.write(marker + ' ')
    title_cutted = jieba.cut(title.strip(), cut_all=False)
    for word in title_cutted:
        f.write(word + ' ')
    f.write('\n')
    for response in responses:
        #print(response['Content'])
        #if response['Content'] not in count_response.keys():
        #    count_response[response['Content']] = 0
        #count_response[response['Content']] += 1
        if marker != '':
            f.write(marker + ' ')
        response_cutted = jieba.cut(response['Content'].strip(), cut_all=False)
        for word in response_cutted:
            f.write(word + ' ')
        f.write('\n')
    if separater:
        f.write('===\n')
Project: Stock-SentimentAnalysis    Author: JoshuaMichaelKing    | project source | file source
def word_tokenization(tick_blog_list):
    '''
    word tokenization by jieba to list
    return list : [[,], [,], ...]
    '''
    count = 0
    seg_list = []
    try:
        for blog in tick_blog_list:
            count += 1
            if blog != '':
                segments = jieba.cut(blog)
                tmp = []
                for seg in segments:
                    tmp.append(seg)
                seg_list.append(tmp)
            else:
                print('Line%d is empty!' % count)
    except IOError as e:
        logging.error('IOError %s' % e)
    finally:
        return seg_list

#-------------------------------------------------------------------------------
Project: Stock-SentimentAnalysis    Author: JoshuaMichaelKing    | project source | file source
def word_tokenization(tick_blog_list):
    '''
    word tokenization by jieba to list
    return list : [[,], [,], ...]
    '''
    count = 0
    seg_list = []
    try:
        for blog in tick_blog_list:
            if blog != '':
                count += 1
                segments = jieba.cut(blog)
                tmp = []
                for seg in segments:
                    tmp.append(seg)
                seg_list.append(tmp)
    except IOError as e:
        logging.error('IOError %s' % e)
    finally:
        return seg_list

# Python????????
Project: finance_news_analysis    Author: pskun    | project source | file source
def word_segment(line, stop=False, remain_number=True):
    '''
    Segment one line of text into a word list.
    stop: whether to filter out stop words.
    '''
    if STOP_WORDS is None:
        load_stopwords()
    seg_list = jieba.cut(line, HMM=True)
    sl = []
    for word in seg_list:
        word = word.strip()
        if len(word) > 0 and word not in PUNCT:
            if stop:
                if word in STOP_WORDS:
                    word = None
            if word is not None and not remain_number:
                if util_func.atof(word) is not None:
                    word = None
            if word is not None:
                sl.append(word)
    return sl
Project: internet-content-detection    Author: liubo0621    | project source | file source
def cut_for_property(self, text):
        '''
        @summary: segment text with part-of-speech tags, dropping stop words
        ---------
        @param text: the input text
        ---------
        @result: returns [(text1, property1)...(textN, propertyN)]
        '''
        words_list = []

        words = pseg.cut(text)
        for word in words:
            if word.word not in self._stop_words:
                words_list.append((word.word, word.flag))

        return words_list
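In the snippet above, pseg is the conventional alias for jieba.posseg, whose cut() yields pair objects exposing .word and .flag. A minimal usage sketch follows; the sample sentence comes from jieba's README and the stop-word set is an illustrative stand-in for the class's self._stop_words.

import jieba.posseg as pseg

stop_words = {u'的', u'了'}              # illustrative stand-in for self._stop_words
for pair in pseg.cut(u"我爱北京天安门"):
    if pair.word not in stop_words:
        print(pair.word, pair.flag)      # e.g. 我/r  爱/v  北京/ns  天安门/ns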
Project: ChineseNER    Author: zjy-ucas    | project source | file source
def get_seg_features(string):
    """
    Segment text with jieba
    features are represented in bies format
    s denotes a single-character word
    """
    seg_feature = []

    for word in jieba.cut(string):
        if len(word) == 1:
            seg_feature.append(0)
        else:
            tmp = [2] * len(word)
            tmp[0] = 1
            tmp[-1] = 3
            seg_feature.extend(tmp)
    return seg_feature
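A quick illustration of the encoding above, assuming jieba segments the sample into two two-character words plus a single character (0 = single, 1 = begin, 2 = inside, 3 = end):

# "北京欢迎你" is expected to split into ["北京", "欢迎", "你"]
#   北京 -> [1, 3]    欢迎 -> [1, 3]    你 -> [0]
print(get_seg_features(u"北京欢迎你"))   # -> [1, 3, 1, 3, 0]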
Project: warWolf    Author: wu-yy    | project source | file source
def get_all_keywords(file_name):
    word_lists=[]  # list of all words (duplicates kept)
    with codecs.open(file_name,'r',encoding='utf-8') as f:
        Lists=f.readlines()
        for li in Lists:
            cut_list=list(jieba.cut(li))
            for word in cut_list:
                word_lists.append(word)

    word_lists_set=set(word_lists)  # deduplicate the word list
    sort_count=[]
    word_lists_set=list(word_lists_set)

    length=len(word_lists_set)
    print(u'%d distinct words in total' % length)
    k = 1
    for w in word_lists_set:
        sort_count.append(w + u':' + str(word_lists.count(w)) + u" times\n")
        print(u"%d---" % k + w + u":" + str(word_lists.count(w)) + u" times")
        k += 1
    with codecs.open('count_word.txt', 'w', encoding='utf-8') as f:
        f.writelines(sort_count)
Project: warWolf    Author: wu-yy    | project source | file source
def get_all_keywords(file_name):
    word_lists=[]  # list of all words (duplicates kept)
    with codecs.open(file_name,'r',encoding='utf-8') as f:
        Lists=f.readlines()
        for li in Lists:
            cut_list=list(jieba.cut(li))
            for word in cut_list:
                word_lists.append(word)

    word_lists_set=set(word_lists)  # deduplicate the word list
    sort_count=[]
    word_lists_set=list(word_lists_set)

    length=len(word_lists_set)
    print(u'%d distinct words in total' % length)
    k = 1
    for w in word_lists_set:
        sort_count.append(w + u':' + str(word_lists.count(w)) + u" times\n")
        print(u"%d---" % k + w + u":" + str(word_lists.count(w)) + u" times")
        k += 1
    with codecs.open('count_word.txt', 'w', encoding='utf-8') as f:
        f.writelines(sort_count)
Project: Graduation-design    Author: Baichenjia    | project source | file source
def Delete_stopwords():
    print 'Removing stop words...'
    f_stop = open('emotion_file/stopwords.txt')  # stop-word file
    f_stop_list = []
    for word in f_stop.readlines():
        f_stop_list.append(word.strip())
    f_stop.close()

    f_text = open("emotion_file/data_zhuguan.txt", "r")   # ????
    f_nostop = codecs.open('emotion_file/data_zhuguan_nostop.txt', 'w', encoding='UTF-8')
    for text in f_text.readlines():  # segment each line and drop the stop words
        f_seg_list = list(jieba.cut(text, cut_all=False))  # precise-mode segmentation
        for word in f_seg_list:
            if word in f_stop_list:
                print word
            else:
                f_nostop.write(word)
    f_text.close()
    print"???????..."  # ????


# ??????????????? data_jixing.txt ??????????
Project: LSTM-CRF-For-Named-Entity-Recognition    Author: zpppy    | project source | file source
def get_seg_features(string):
    """
    Segment text with jieba
    features are represented in bies format
    s denotes a single-character word
    """
    seg_feature = []

    for word in jieba.cut(string):
        if len(word) == 1:
            seg_feature.append(0)
        else:
            tmp = [2] * len(word)
            tmp[0] = 1
            tmp[-1] = 3
            ## note: use extend here, not append
            seg_feature.extend(tmp)
    return seg_feature
Project: SentimentAnalysis-chinese-master    Author: Chenalong    | project source | file source
def jieba_contend_split(contend):
    punctuation = [u'?', u'/', u'?', u'?', u'?', u' ', u'\'']
    wordSequenceList = []  # result format: [[(id, content), ...], ...]; the comment is split into sub-sequences at punctuation marks
    seg_list = jieba.cut(contend)
    segmentedComment = [item for item in seg_list]
    segmentedCommentTuple = list(enumerate(segmentedComment))
    subWordSequenceList = []
    for wordTuple in segmentedCommentTuple:
        if wordTuple[1] in punctuation:
            if subWordSequenceList:
                wordSequenceList.append(subWordSequenceList)
                subWordSequenceList = []
        else:
            subWordSequenceList.append(wordTuple)
    if subWordSequenceList:
        wordSequenceList.append(subWordSequenceList)
    return wordSequenceList
Project: SentimentAnalysis-chinese-master    Author: Chenalong    | project source | file source
def segByPunc(self):
        punctuation = [u'?', u'/', u'?', u'?', u'?', u' ', u'\'']
        wordSequenceList = []  # result format: [[(id, content), ...], ...]; the comment is split into sub-sequences at punctuation marks
        seg_list = jieba.cut(self.commentSentence)
        segmentedComment = [item for item in seg_list]
        segmentedCommentTuple = list(enumerate(segmentedComment))
        subWordSequenceList = []
        for wordTuple in segmentedCommentTuple:
            if (wordTuple[1] in punctuation):
                if (subWordSequenceList != []):
                    wordSequenceList.append(subWordSequenceList)
                    subWordSequenceList = []
            else:
                subWordSequenceList.append(wordTuple)
        if (subWordSequenceList != []):
            wordSequenceList.append(subWordSequenceList)
        return (wordSequenceList)

    #?????????????????????????
Project: scattertext    Author: JasonKessler    | project source | file source
def _asian_tokenization(doc, entity_type, tag_type, tokenizer):
    sents = []
    for paragraph in doc.split('\n'):
        sent_splits = iter(re.split(r'(。|！|？|；)+', paragraph, flags=re.MULTILINE))
        for partial_sent in sent_splits:
            sent = partial_sent + next(sent_splits, '')
            if sent.strip() == '': continue
            toks = []
            # for tok in jieba.cut(sent, ):
            for tok in tokenizer(sent):
                pos = 'WORD'
                if tok.strip() == '':
                    pos = 'SPACE'
                elif punct_re.match(tok):
                    pos = 'PUNCT'
                toks.append(Tok(pos,
                                tok[:2].lower(),
                                tok.lower(),
                                tok,
                                ent_type='' if entity_type is None else entity_type.get(tok, ''),
                                tag='' if tag_type is None else tag_type.get(tok, '')))
            sents.append(Sentence(toks, sent))
    return Doc(sents, doc)
Project: text_analysis    Author: mathlf2015    | project source | file source
def get_result(url_set):
    line_set = []
    for url in url_set:
        wb_data = requests.get(url,headers = headers)
        soup = BeautifulSoup(wb_data.text,'lxml')
        a = soup.select('span.ctt')
        for i in range(len(a)):
            text = re.sub('<[^>]*>', '',a[i].text)
            text = re.sub('??', ' ', text)
            text = re.sub('[\W]+', ' ', text)
            line_set.append(text)
            #print(text)
            #writer.writerow((i,text))
    word_list = [" ".join(jieba.cut(sentence)) for sentence in line_set]
    new_text = ' '.join(word_list)
    wordcloud = WordCloud(font_path="C:/Python34/Lib/site-packages/wordcloud/simhei.ttf", background_color="black").generate(new_text)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
Project: CNNChineseClassifyer    Author: winnerineast    | project source | file source
def load_utf8_data_and_labels(positive_data_file, negative_data_file):
    # Load data from files
    positive_data = list(codecs.open(positive_data_file, "r", encoding='utf-8').readlines())
    positive_examples = list()
    for s in positive_data:
        positive_examples.append(" ".join(jieba.cut(s)))

    negative_data = list(codecs.open(negative_data_file, "r", encoding='utf-8').readlines())
    negative_examples = list()
    for s in negative_data:
        negative_examples.append(" ".join(jieba.cut(s)))

    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]

    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
Project: Chinese-ChatBot-AIML-Web.py    Author: JingLuo05    | project source | file source
def test(self, input_str):
        '''
        Step 4: segment the input string, build its bag-of-words feature vector, and classify it with the trained SVM.
        '''
        test_input = input_str
        x_test = np.zeros(self.count+1)                    # bag-of-words feature vector
        after_split = " ".join(jieba.cut(test_input))  #??
        words = after_split.split(" ")
        for i in words:
            i = i.replace('\n','')
            i = i.replace('\r','')
            i = i.replace(' ','')
            if self.dictionary.__contains__(i.encode('utf-8')):
                x_test[self.dictionary[i.encode('utf-8')]] = 1.
            # else:
            #     print 'Cannot find: '+i

        # return 1 if the SVM predicts class 1, otherwise 0
        if self.mySVM.predict([x_test]) == 1.:
            return 1
        else:
            return 0
Project: 51job    Author: chenjiandongx    | project source | file source
def post_desc_counter():
        """ ??????
        """
        # import thulac
        post = open(os.path.join("data", "post_require.txt"),
                    "r", encoding="utf-8").read()
        # segment with thulac
        # thu = thulac.thulac(seg_only=True)
        # thu.cut(post, text=True)

        # segment with jieba
        file_path = os.path.join("data", "user_dict.txt")
        jieba.load_userdict(file_path)
        seg_list = jieba.cut(post, cut_all=False)
        counter = dict()
        for seg in seg_list:
            counter[seg] = counter.get(seg, 1) + 1
        counter_sort = sorted(
            counter.items(), key=lambda value: value[1], reverse=True)
        pprint(counter_sort)
        with open(os.path.join("data", "post_pre_desc_counter.csv"),
                  "w+", encoding="utf-8") as f:
            f_csv = csv.writer(f)
            f_csv.writerows(counter_sort)
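The jieba.load_userdict call above expects a plain-text dictionary with one entry per line: the word, an optional frequency, and an optional POS tag, separated by spaces. A minimal sketch of building and loading such a file (file name and entries are illustrative only):

import jieba

# each line: word [frequency] [POS tag]; frequency and tag may be omitted
with open("user_dict.txt", "w", encoding="utf-8") as f:
    f.write(u"云计算 5 n\n")
    f.write(u"自然语言处理 3 n\n")

jieba.load_userdict("user_dict.txt")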
Project: TiebaTool    Author: ZRStea    | project source | file source
def calculate_similarity(text1,text2):
    raw1 = jieba.cut(text1)
    raw2 = jieba.cut(text2)
    raw1 = Counter(raw1)
    raw2 = Counter(raw2)
    same_words = set(raw1) & set(raw2)
    if (math.sqrt(len(raw1)) * math.sqrt(len(raw2))) != 0:
        dot_product = 0
        mod1 = 0
        mod2 = 0
        for word in same_words:
            dot_product += raw1[word] * raw2[word]
        for word in raw1:
            mod1 += math.pow(raw1[word],2)
        for word in raw2:
            mod2 += math.pow(raw2[word],2)
        cos = dot_product/math.sqrt(mod1*mod2)
    else:
        cos = 0
    return cos
Project: jieba    Author: isuhao    | project source | file source
def extract_tags(sentence,topK=20):
    words = jieba.cut(sentence)
    freq = {}
    for w in words:
        if len(w.strip())<2: continue
        if w.lower() in stop_words: continue
        freq[w]=freq.get(w,0.0)+1.0
    total = sum(freq.values())
    freq = [(k,v/total) for k,v in freq.iteritems()]

    tf_idf_list = [(v * idf_freq.get(k,median_idf),k) for k,v in freq]
    st_list = sorted(tf_idf_list,reverse=True)

    top_tuples= st_list[:topK]
    tags = [a[1] for a in top_tuples]
    return tags
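The hand-rolled TF-IDF ranking above is also available out of the box as jieba.analyse.extract_tags (the same call used in the Commodity-analysis example further down). A minimal sketch, with the sample text purely illustrative:

import jieba.analyse

text = u"小明硕士毕业于中国科学院计算所，后在日本京都大学深造"
# topK caps the number of keywords; withWeight=True also returns each TF-IDF score
for keyword, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True):
    print(keyword, weight)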
Project: Chinese_text_classifier    Author: swordLong    | project source | file source
def cut_Text(content, nomial=False):
    """
    :param content: string
    :param nomial: if nomial is True, only noun-like words will remain
    :return: a text whose format is 'a   b   c   d'
    """
    if nomial:
        text = ''
        words = pseg.cut(content)
        for word in words:
            if contain(['n'], word.flag):
                text = text + ' ' + word.word
        return text.strip()
    else:
        text = ''
        words = jieba.cut(content)
        for word in words:
            text = text + ' ' + word
        return text.strip()
Project: Chinese_text_classifier    Author: swordLong    | project source | file source
def cut_Dataset(data_set, parrel=False, nomial=False):
    """
    :param data_set: a Bunch dataset
    :param parrel: if True, cut the dataset in parallel (not available on Windows)
    :param nomial: if nomial is True, only noun-like words will remain
    :return: the data_set after cutting
    """
    from tqdm import tqdm
    data_cut = []
    start = time.time()
    print('cuting dataset......')
    if parrel:
        p = ThreadPool(9)
        data_cut = p.map(cut_Text, data_set.data)
        p.close()
        p.join()
    else:
        n=0
        for doc_content in tqdm(data_set.data):
            data_cut.append(cut_Text(doc_content, nomial))
    end = time.time()
    print('cuting  runs %0.2f seconds.' % (end - start))
    data_set.data = data_cut
Project: chinese_text_generator    Author: yiyuezhuo    | project source | file source
def fetch(self):
        # cut the text in semi-redundant sequences of maxlen characters
        #text=self.text
        text=self.next_text()
        chars=self.chars
        maxlen=self.maxlen
        step=self.step

        maxlen = 20
        step = 3
        sentences = []
        next_chars = []
        for i in range(0, len(text) - maxlen, step):
            sentences.append(text[i: i + maxlen])
            next_chars.append(text[i + maxlen])
        print('nb sequences:', len(sentences))

        print('Vectorization...')
        X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
        y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
        for i, sentence in enumerate(sentences):
            for t, char in enumerate(sentence):
                X[i, t, self.char_indices[char]] = 1
            y[i, self.char_indices[next_chars[i]]] = 1
        return text,X,y
Project: word2vec    Author: sefira    | project source | file source
def WordBeark():
    logger.info("running Word Beark in " + path + data)

    inputfile = path + data + ".zhs"
    outputfile = path + data + ".wordbreak"
    i = 0
    output = open(outputfile, 'w')
    input = open(inputfile, 'r')

    for line in input.readlines():
        seg_list = jieba.cut(line)
        output.write(u' '.join(seg_list))

        i = i + 1
        if (i % 10000 == 0):
            logger.info("Cut " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles in " + outputfile)
Project: deeplearning    Author: fanfanfeng    | project source | file source
def predict(text):
    words = jieba.cut(text)
    words = " ".join(words)
    index2label = {i: l.strip() for i, l in enumerate(tv_classfication.label_list)}

    word2vec_model = Word2Vec.load(tv_classfication.word2vec_path)
    text_converter = data_convert.SimpleTextConverter(word2vec_model, 80, None)
    x_test = []
    for doc, _ in text_converter.transform_to_ids([words]):
        x_test.append(doc)

    x_test = np.array(x_test)

    graph = tf.Graph()
    with graph.as_default(),tf.Session() as sess:
        model = bi_lstm_model.Bi_lstm()
        model.restore_model(sess)

        print(tv_classfication.index2label.get(model.predict(sess,x_test)[0]))
Project: web-crawler-tutorial    Author: jwlin    | project source | file source
def lyrics():
    with open('lyrics.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    tokens = list()
    for v in data.values():
        # tokenize; keep only tokens that are at least 2 characters long and not whitespace
        tokens += [seg for seg in jieba.cut(v) if seg.split() and len(seg) > 1]

    # count token frequencies and show the 10 most common
    counter = Counter(tokens)
    print(counter.most_common(10))

    # build the word cloud with a font that supports Chinese characters
    wcloud = WordCloud(font_path='NotoSansMonoCJKtc-Regular.otf').generate(' '.join(tokens))
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()
Project: wiki_zh_vec    Author: zhouhoo    | project source | file source
def cut_words(input_file, output_file):
    count = 0
    with io.open(output_file, mode = 'w', encoding = 'utf-8') as outfile:
        with io.open(input_file, mode = 'r', encoding = 'utf-8') as infile:
            for line in infile:
                line = line.strip()
                if len(line) < 1:  # empty line
                    continue
                if line.startswith('<doc'): # start or end of a passage
                    if line == '</doc>': # end of a passage
                        outfile.write(u'\n')
                        count = count + 1
                        if(count % 1000 == 0):
                            print('%s articles were finished.......' %count)
                    continue
                for word in jieba.cut(line):
                    outfile.write(word + ' ')
    print('%s articles were finished.......' %count)
Project: Commodity-analysis    Author: buhuipao    | project source | file source
def extract_tags(key_word, a_name):
    '''
    Build a tag string for a product: take the first 8 jieba segments of the
    product name a_name, keep only those that also appear in the keywords
    returned by jieba.analyse.extract_tags, put the search keyword key_word
    first, and keep at most 5 tags.
    '''
    cut_tags = [tag for tag in jieba.cut(a_name)][:8]
    analyse_tags = jieba.analyse.extract_tags(a_name)
    tags = [tag for tag in cut_tags if tag in analyse_tags]
    # make sure key_word appears at the front of the tag list
    try:
        tags.remove(key_word)
    except ValueError:
        pass
    tags.insert(0, key_word)
    if len(tags) > 5:
        tags = tags[:5]
    return ' '.join(tags)
Project: lyricswordcloud    Author: qwertyyb    | project source | file source
def handleLine(self, line):
    # strip spaces, newlines and the filler 'em'
    line = line.replace(' ', '')
    line = line.replace('\n', '')
    line = line.replace('em', '')
    # word segmentation
    words = jieba.cut(line)
    for word in words:
      if len(word)<=1:
        continue
      if word in self.data:
        self.data[word] = self.data[word]+1
      else:
        self.data[word] = 1
Project: question-classification-cnn-rnn-attention    Author: sefira    | project source | file source
def process_data(line):
    """
    word break and remove word
    Returns split sentences
    """
    # Word break
    seg_list = jieba.cut(line)
    line = u' '.join(seg_list)
    # Remove word
    ss = re.findall('[\n\s*\r\u4e00-\u9fa5]|nmovie|nrcelebrity', line)
    line = u"".join(ss).strip()

    if(len(line) < 2):
        return "UNK"
    return line
Project: question-classification-cnn-rnn-attention    Author: sefira    | project source | file source
def process_data(line):
    """
    word break and remove word
    Returns split sentences
    """
    # Word break
    seg_list = jieba.cut(line)
    line = u' '.join(seg_list)
    # Remove word
    ss = re.findall('[\n\s*\r\u4e00-\u9fa5]|nmovie|nrcelebrity', line)
    line = u"".join(ss).strip()

    if(len(line) < 2):
        return "UNK"
    return line
Project: question-classification-cnn-rnn-attention    Author: sefira    | project source | file source
def process_data(line):
    """
    word break and remove word
    Returns split sentences
    """
    # Word break
    seg_list = jieba.cut(line)
    line = u' '.join(seg_list)
    # Remove word
    ss = re.findall('[\n\s*\r\u4e00-\u9fa5]|nmovie|nrcelebrity', line)
    line = u"".join(ss).strip()

    if(len(line) < 2):
        return "UNK"
    return line
Project: hadan-gcloud    Author: youkpan    | project source | file source
def mainTestInteractive(self, sess):
        """ Try predicting the sentences that the user will enter in the console
        Args:
            sess: The current running session
        """
        # TODO: If verbose mode, also show similar sentences from the training set with the same words (include in mainTest also)
        # TODO: Also show the top 10 most likely predictions for each predicted output (when verbose mode)
        # TODO: Log the questions asked for latter re-use (merge with test/samples.txt)

        print('Testing: Launch interactive mode:')
        print('')
        print('Welcome to the interactive mode, here you can ask to Deep Q&A the sentence you want. Don\'t have high '
              'expectation. Type \'exit\' or just press ENTER to quit the program. Have fun.')
        import jieba
        while True:
            question = input(self.SENTENCES_PREFIX[0])
            if question == '' or question == 'exit':
                break
            questionc = jieba.cut(question, cut_all=False)
            question = str(" ".join(questionc)).decoder("GBK")
            print(question)
            questionSeq = []  # Will be contain the question as seen by the encoder
            answer = self.singlePredict(question, questionSeq)
            if not answer:
                print('Warning: sentence too long, sorry. Maybe try a simpler sentence.')
                continue  # Back to the beginning, try again

            print('{}{}'.format(self.SENTENCES_PREFIX[1], self.textData.sequence2str(answer, clean=True)))

            if self.args.verbose:
                print(self.textData.batchSeq2str(questionSeq, clean=True, reverse=True))
                print(self.textData.sequence2str(answer))

            print()
Project: hadan-gcloud    Author: youkpan    | project source | file source
def main():

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    #jieba.set_dictionary('jieba_dict/dict.txt.big')

    # load stopwords set
    #stopwordset = set()
    #with open('jieba_dict/stopwords.txt','r',encoding='utf-8') as sw:
    #    for line in sw:
    #        stopwordset.add(line.strip('\n'))

    output = open('allbook-segment.txt','w')

    texts_num = 0

    with open("allbook.txt", "rb") as f:
      #if(f.readline() == ""):
      print("geting data")
      bookdata = f.read(190000000).decode('UTF-8')
      print("geting data  OK ")
      lineu = bookdata
      p = 0
      for p in range(0,len(bookdata),100):
            line = bookdata[p:p+100]
            #print(line)
            words = jieba.cut(line, cut_all=False)
            for word in words:
                output.write(word +' ')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("???? %d ????" % texts_num)
    output.close()
Project: MatchZoo    Author: faneshion    | project source | file source
def word_seg_cn(docs):
        docs = [list(jieba.cut(sent)) for sent in docs]
        return docs
Project: MatchZoo    Author: faneshion    | project source | file source
def word_seg_cn(docs):
        docs = [list(jieba.cut(sent)) for sent in docs]
        return docs
Project: quackalike    Author: gumblex    | project source | file source
def cutandsplit(s):
    for ln in filterlist(splitsentence(stripblank(s))):
        l = RE_BRACKETS.sub(brcksub, ln.strip())
        if notchinese(l):
            continue
        yield ' '.join(cut(l.replace('「', '“').replace('」', '”').replace('『', '‘').replace('』', '’').lstrip(tailpunct).rstrip(headpunct)))
Project: nlputils    Author: The-Orizon    | project source | file source
def cutandsplit(s):
    for ln in filterlist(splitsentence(stripblank(s))):
        l = RE_BRACKETS.sub(brcksub, ln.strip())
        if notchinese(l):
            continue
        yield ' '.join(cut(l.replace('「', '“').replace('」', '”').replace('『', '‘').replace('』', '’').lstrip(tailpunct).rstrip(headpunct)))