Python jieba.posseg module: lcut() example source code

The following 9 code examples, extracted from open-source Python projects, illustrate how to use jieba.posseg.lcut().
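
For orientation, here is a minimal sketch (not taken from any of the projects below) of what jieba.posseg.lcut() returns; the sample sentence is an arbitrary placeholder. The excerpts that follow assume their projects' module-level imports, typically codecs, pandas as pd, and import jieba.posseg as pseg (aliased psg in one file).

import jieba.posseg as pseg

# lcut() eagerly segments the text and returns a list of pair objects;
# each pair exposes .word and .flag (the POS tag) and also unpacks as (word, flag).
pairs = pseg.lcut(u'我爱北京天安门')
for word, flag in pairs:
    print('%s %s' % (word, flag))   # e.g. 我 r / 爱 v / 北京 ns / 天安门 ns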

Project: Book_DeepLearning_Practice | Author: wac81
def extract_dictionary_feature(file_name, col_tag=0, col_content=1):
    # load the sentiment vocabulary files (adverbs, inversion words, negative and positive dictionaries)
    adv = codecs.open('./data/vocabulary/adv.txt', 'rb', encoding='utf-8').read().split('\n')
    inverse = codecs.open('./data/vocabulary/inverse.txt', 'rb', encoding='utf-8').read().split('\n')
    negdict = codecs.open('./data/vocabulary/negdict.txt', 'rb', encoding='utf-8').read().split('\n')
    posdict = codecs.open('./data/vocabulary/posdict.txt', 'rb', encoding='utf-8').read().split('\n')

    contents = pd.read_excel(file_name, header=None)

    print 'cut words...'
    cw = lambda x: [pair for pair in psg.lcut(x) if pair.word not in stopwords]
    contents['pairs'] = contents[col_content].apply(cw)
    matrix = reviews2matrix(list(contents['pairs']), posdict, negdict, inverse, adv)
    x = matrix2vec(matrix)
    y = list(contents[col_tag])
    return x, y
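
A hypothetical invocation of extract_dictionary_feature; the Excel file name is a placeholder, and stopwords, reviews2matrix and matrix2vec must already be defined in the host module.

# Hypothetical usage: column 0 holds the label, column 1 the review text.
x, y = extract_dictionary_feature('./data/reviews.xlsx', col_tag=0, col_content=1)
# x: feature vectors built from the sentiment dictionaries, y: the label column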
Project: Book_DeepLearning_Practice | Author: wac81
def delNOTNeedWords(content,customstopwords=None):
    # words = jieba.lcut(content)
    if customstopwords is None:
        customstopwords = "stopwords.txt"
    import os
    if os.path.exists(customstopwords):
        stop_words = codecs.open(customstopwords, encoding='UTF-8').read().split(u'\n')
        customstopwords = stop_words

    result=''
    return_words = []
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # optionally append "/" + str(w.flag) + " "
    words = pseg.lcut(content)

    for word, flag in words:
        # print word.encode('utf-8')
        tempword = word.encode('utf-8').strip(' ')
        if (word not in customstopwords and len(tempword)>0 and flag in [u'n',u'nr',u'ns',u'nt',u'nz',u'ng',u't',u'tg',u'f',u'v',u'vd',u'vn',u'vf',u'vx',u'vi',u'vl',u'vg', u'a',u'an',u'ag',u'al',u'm',u'mq',u'o',u'x']):
            # and flag[0] in [u'n', u'f', u'a', u'z']):
            # ["/x","/zg","/uj","/ul","/e","/d","/uz","/y"]): #??????????????????
            result += tempword  # optionally append "/" + str(flag) + " " for inspection
            return_words.append(tempword)
    return result,return_words
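
A hypothetical call to delNOTNeedWords, assuming a UTF-8 stopwords.txt sits next to the script; it returns the concatenated kept words and the list of kept words (UTF-8 byte strings under Python 2).

text, kept_words = delNOTNeedWords(u'今天天气非常好', 'stopwords.txt')
print(text)   # nouns, verbs, adjectives etc. concatenated, e.g. 今天天气好, depending on the stopword list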
Project: recommended_system | Author: wac81
def delNOTNeedWords(content,stopwords):
    # words = jieba.lcut(content)
    result=''
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # optionally append "/" + str(w.flag) + " "

    words = pseg.lcut(content)
    # jieba.cut()
    text_list = []
    for word, flag in words:
        # print word.encode('utf-8')
        if (word not in stopwords and flag not in ["x", "zg", "uj", "ul", "e", "d", "uz", "y"]):  # drop punctuation, auxiliaries, interjections, adverbs and modal particles
            # text_list.append(word.encode('utf-8'))
            result += word.encode('utf-8')  # optionally append "/" + str(flag) + " "
        # ''.join(text_list)
    return result
    # return ''.join(text_list)
Project: CNKICrawler | Author: roliygu
def jieba_example():
    raw = "????S5????,123,?,?"
    raw_seq = jieba.cut(raw)
    raw_seq_list = jieba.lcut(raw)
    raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
    raw_with_ictclas = pseg.cut(raw)
    for word, flag in raw_with_ictclas:
        print word, flag
Project: CNKICrawler | Author: roliygu
def cut_with_flag(raw_str, filter_invalid_word_flag=True):
    """

    :param raw_str: str
    :return: list[(str, str)]
    """
    res = [(a, b) for a, b in pseg.lcut(raw_str)]

    if filter_invalid_word_flag:
        return filter_invalid_word(res)
    else:
        return res
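
A hypothetical call with filtering disabled, so the filter_invalid_word helper (not shown in this excerpt) is not needed:

pairs = cut_with_flag(u'我爱北京天安门', filter_invalid_word_flag=False)
# e.g. [(u'我', u'r'), (u'爱', u'v'), (u'北京', u'ns'), (u'天安门', u'ns')]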
Project: FAQrobot | Author: ofooo
def maxSimTxt(self, intxt, simCondision=0.1, simType='simple'):
        """
        Find the entry in the knowledge base whose question is most similar to the input text.
        simType = simple, simple_pos, vec
        """
        self.lastTxt.append(intxt)
        if simType not in ('simple', 'simple_pos', 'vec'):
            return 'error: invalid simType argument for maxSimTxt: {}'.format(simType)

        # fall back to simple_pos matching when no word-vector model is available
        embedding = self.vecModel
        if simType == 'vec' and not embedding:
            simType = 'simple_pos'

        for t in self.zhishiku:
            questions = t.q_vec if simType == 'vec' else t.q_word
            in_vec = jieba.lcut(intxt) if simType == 'simple' else pseg.lcut(intxt)

            t.sim = max(
                similarity(in_vec, question, method=simType, embedding=embedding)
                for question in questions
            )
        maxSim = max(self.zhishiku, key=lambda x: x.sim)
        logger.info('maxSim=' + format(maxSim.sim, '.0%'))

        if maxSim.sim < simCondision:
            return 'Sorry, I could not find a sufficiently similar question. Please try rephrasing.'

        return maxSim.a
Project: Book_DeepLearning_Practice | Author: wac81
def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: an ``RTEPair`` (text, hypothesis) from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        global stop_word_path
        self.stop = stop
        self.stopwords = codecs.open(stop_word_path + 'stopwords.txt', encoding='UTF-8').read()
        self.negwords = set([u"?", u"??", u"??", u"?", u"??", u"??", u"??", u"??", u"??"])

        text_words = pseg.lcut(rtepair[0])
        hyp_words = pseg.lcut(rtepair[1])
        self.text_words = set()
        self.hyp_words = set()

        # tokenization is already handled above by pseg.lcut
        pass

        # optionally lemmatize the words with wordnet
        if lemmatize:
            pass

        # remove stopwords
        for word, flag in text_words:
            if word not in self.stopwords:
                self.text_words.add((word, flag))

        for word, flag in hyp_words:
            if word not in self.stopwords:
                self.hyp_words.add((word, flag))

        # set features
        self._overlap = self.hyp_words & self.text_words        # words in both hyp and text
        self._hyp_extra = self.hyp_words - self.text_words      # words in hyp but not in text
        self._txt_extra = self.text_words - self.hyp_words      # words in text but not in hyp
Project: recommended_system | Author: wac81
def delstopwords(content):
    result = ''
    words = pseg.lcut("".join(content.split()))
    for word, flag in words:
        if word not in stopwords and flag not in ["x", "zg", "uj", "ul", "e", "d", "uz",
                                                  "y"]:  # drop punctuation, auxiliaries, interjections, adverbs and modal particles
            result += word.encode('utf-8')  # optionally append "/" + str(flag) + " "
    return result
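
A hypothetical driver for delstopwords; in the host project stopwords is a module-level collection loaded from a file, so the sketch below supplies a tiny placeholder set.

stopwords = set([u'的', u'了'])
print(delstopwords(u'天气 真的 很 好'))   # whitespace is removed before POS tagging, then stopwords and filtered flags are dropped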
Project: Book_DeepLearning_Practice | Author: wac81
def prefix_process(curr_index, sentence, score):
    """
    Adjust the sentiment score according to negation and degree-adverb prefixes before the current word.
    :param curr_index:  index of the word w within sentence
    :param score:       the current sentiment score
    :param sentence:    the sentence being scored
    :return:            the adjusted score
    """
    num_cnt = 5
    if curr_index - num_cnt > 0:
        seg = sentence[curr_index - num_cnt:curr_index]
    else:
        seg = sentence[0:curr_index]

    # double-negation prefix: weaken the score slightly
    for curr_neg_prefix in double_none_prefix:
        if seg.endswith(curr_neg_prefix):
            return 0.8 * score

    # negation prefix: usually flips the sign of the score
    for curr_neg_prefix in set_neg_prefix:
        if seg.endswith(curr_neg_prefix):
            temp_pair = pseg.lcut(sentence[0:curr_index])
            for i, (w, f) in enumerate(reversed(temp_pair)):
                if f.startswith(u"x"):
                    break
                elif f.startswith(u"r") or f.startswith(u"n") or f.startswith(u"m"):
                    if (len(temp_pair)-i-2) > 0 and temp_pair[len(temp_pair)-i-2].word in set_neg_prefix:
                        return 1.3 * score
            return -1.3 * score

    temp_pair = pseg.lcut(seg)
    for i, (w, f) in enumerate(reversed(temp_pair)):
        if f.startswith(u"x"):
            break
        elif f.startswith(u"r") or f.startswith(u"n") or f.startswith(u"m"):
            if temp_pair[len(temp_pair)-i-2].word in set_neg_prefix:
                return -0.6 * score

    # degree-adverb (intensifier) prefix: strengthen the score
    for curr_very_prefix in set_very_prefix:
        if seg.endswith(curr_very_prefix):
            return 1.3 * score
    return score
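
For context, a hypothetical set-up showing how prefix_process could be exercised; the three prefix vocabularies and the sample sentence are placeholders, not the project's real word lists.

# Placeholder prefix vocabularies (the real ones are loaded elsewhere in the project).
double_none_prefix = set([u'不得不', u'不能不'])
set_neg_prefix = set([u'不', u'没有', u'无'])
set_very_prefix = set([u'很', u'非常', u'十分'])

# With these placeholders, a call such as
#     prefix_process(curr_index=5, sentence=u'这个产品不好用', score=1.0)
# looks at the five characters before index 5 (u'这个产品不'), finds that they end with
# the negation prefix u'不', and returns a sign-flipped score (-1.3 * score) unless a
# double negation is detected in the preceding words.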