Python jieba module: load_userdict() example source code

We have extracted the following 37 code examples from open-source Python projects to illustrate how to use jieba.load_userdict().

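For reference, before the project snippets: a minimal, self-contained sketch of the pattern they all share (the dictionary path and the sample sentence below are placeholders, not taken from any project). Each line of a user-dictionary file holds one term in the form "word [frequency] [POS tag]", where frequency and POS tag are optional.

import jieba

# hypothetical dictionary file, one custom term per line, e.g. "云计算 5 n"
jieba.load_userdict("user_dict.txt")

# terms from the user dictionary are now kept intact by the segmenter
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)  # accurate mode
print("/".join(seg_list))
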
Project: internet-content-detection    Author: liubo0621    | Project source | File source
def __init__(self, dict_path = ''):
        super(Singleton, self).__init__()
        if not hasattr(self,'_stop_words'):
            # load the user dictionary
            if dict_path:
                jieba.load_userdict(dict_path)

            self._stop_words = set((
                '', ' ', '\n', "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
                "by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
                "this", "then", "at", "have", "all", "not", "one", "has", "or", "that"
            ))
Project: 51job    Author: chenjiandongx    | Project source | File source
def post_desc_counter():
        """ ??????
        """
        # import thulac
        post = open(os.path.join("data", "post_require.txt"),
                    "r", encoding="utf-8").read()
        # thulac-based segmentation
        # thu = thulac.thulac(seg_only=True)
        # thu.cut(post, text=True)

        # jieba-based segmentation
        file_path = os.path.join("data", "user_dict.txt")
        jieba.load_userdict(file_path)
        seg_list = jieba.cut(post, cut_all=False)
        counter = dict()
        for seg in seg_list:
            counter[seg] = counter.get(seg, 1) + 1
        counter_sort = sorted(
            counter.items(), key=lambda value: value[1], reverse=True)
        pprint(counter_sort)
        with open(os.path.join("data", "post_pre_desc_counter.csv"),
                  "w+", encoding="utf-8") as f:
            f_csv = csv.writer(f)
            f_csv.writerows(counter_sort)
Project: LagouJob    Author: EclipseXuLu    | Project source | File source
def get_hot_words(text):
    jieba.analyse.set_stop_words(STOPWORDS_PATH)
    jieba.load_userdict(USER_CORPUS)
    df = pd.DataFrame(jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=()))
    print(df)
    df.to_excel('./hotwords/DM.xlsx', 'DM')
Project: sentiment-analysis    Author: kasheemlew    | Project source | File source
def parse():
    """parse the comments"""
    import jieba
    import jieba.posseg as pseg

    # Load User's Dictionary
    path_list = os.getcwd().split('/')
    path_list.append("dict.txt")
    dict_path = '/'.join(path_list)
    jieba.load_userdict(dict_path)

    # Dismiss These Flags
    dismiss = ['b', 'c', 'r', 'uj', 'u', 'p', 'q', 'uz', 't', 'ul', 'k', 'f',
            'ud', 'ug', 'uv']

    comments = Comment.query.all()
    for comment in comments:
         word_list = []
         pseg_cut = pseg.cut(comment.body)
         for word, flag in pseg_cut:
             if flag not in dismiss:
                 word_list.append(word)
         comment.parsed = '/'.join(word_list)
         db.session.add(comment)
         print "Comment %04d Parsed!" % comment.id

    db.session.commit()
    print "ALL DONE!"
Project: seq2seq_chatterbot    Author: StephenLee2016    | Project source | File source
def __init__(self):
        self.encoderFile = "./question.txt"
        self.decoderFile = './answer.txt'
        self.dictFile = 'word_dict.txt'
        # load the user-defined dictionary
        jieba.load_userdict(self.dictFile)
        # stop-words file
        self.stopwordsFile = "./preprocessing/stopwords.dat"
Project: seq2seq_chatterbot    Author: StephenLee2016    | Project source | File source
def __init__(self):
        print("tensorflow version: ", tf.__version__)
        tf.reset_default_graph()

        self.encoder_vec_file = "./preprocessing/enc.vec"
        self.decoder_vec_file = "./preprocessing/dec.vec"
        self.encoder_vocabulary = "./preprocessing/enc.vocab"
        self.decoder_vocabulary = "./preprocessing/dec.vocab"
        self.dictFile = './word_dict.txt'
        self.batch_size = 1
        self.max_batches = 10000
        self.show_epoch = 100
        self.model_path = './model/'

        # load the jieba user dictionary
        jieba.load_userdict(self.dictFile)

        self.model = dynamicSeq2seq(encoder_cell=LSTMCell(20),
                                    decoder_cell=LSTMCell(40), 
                                    encoder_vocab_size=540,
                                    decoder_vocab_size=1600,
                                    embedding_size=20,
                                    attention=True,
                                    bidirectional=True,
                                    debug=False,
                                    time_major=True)
        self.location = ["??", "??", "??", "??","??"]
        self.user_info = {"__username__":"Stephen", "__location__":"??"}
        self.robot_info = {"__robotname__":"JiJi"}
        self.dec_vocab = {}
        self.enc_vocab = {}
        tag_location = ''
        with open(self.encoder_vocabulary, "r") as enc_vocab_file:
            for index, word in enumerate(enc_vocab_file.readlines()):
                self.enc_vocab[word.strip()] = index
        with open(self.decoder_vocabulary, "r") as dec_vocab_file:
            for index, word in enumerate(dec_vocab_file.readlines()):
                self.dec_vocab[index] = word.strip()
Project: free-rider-killer    Author: YukiSora    | Project source | File source
def main(argv):
    f = open('freeRiderData.txt')
    jieba.load_userdict('KeywordDictionary.txt')
    for line in f:
        # word segmentation
        seg_list = jieba.cut(line, cut_all=False)
        print("Default Mode: " + "/ ".join(seg_list))   

    return
Project: SentimentPolarityAnalysis    Author: chaoming0625    | Project source | File source
def __init__(self):
        self.__root_filepath = "f_dict/"

        jieba.load_userdict("f_dict/user.dict")  # ??????

        # load the sentiment dictionaries
        self.__phrase_dict = self.__get_phrase_dict()
        self.__positive_dict = self.__get_dict(self.__root_filepath + "positive_dict.txt")
        self.__negative_dict = self.__get_dict(self.__root_filepath + "negative_dict.txt")
        self.__conjunction_dict = self.__get_dict(self.__root_filepath + "conjunction_dict.txt")
        self.__punctuation_dict = self.__get_dict(self.__root_filepath + "punctuation_dict.txt")
        self.__adverb_dict = self.__get_dict(self.__root_filepath + "adverb_dict.txt")
        self.__denial_dict = self.__get_dict(self.__root_filepath + "denial_dict.txt")
Project: FineGrainedOpinionMining    Author: chaoming0625    | Project source | File source
def __init():
    user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
    jieba.load_userdict(user_dict_path)
    jieba.add_word(u"??", 10000)
    jieba.suggest_freq((u"?", u"??"))
    jieba.suggest_freq((u"??", u"??"))
    jieba.suggest_freq((u"??", u"??"))
    jieba.suggest_freq((u"??", u"?"))
Project: FineGrainedOpinionMining    Author: chaoming0625    | Project source | File source
def __init():
    user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
    jieba.load_userdict(user_dict_path)
    jieba.add_word("??", 10000)
    jieba.suggest_freq(("?", "??"))
    jieba.suggest_freq(("??", "??"))
    jieba.suggest_freq(("??", "??"))
    jieba.suggest_freq(("??", "?"))
Project: Rnews    Author: suemi994    | Project source | File source
def __init__(self,userDict=None,conf={}):
        self.userDict=userDict
        self.conf={}
        self.configFromDict(conf)
        if self.userDict:
            jieba.load_userdict(userDict)
        self.configDefault()
Project: KnowledgeGraph-QA-Service    Author: kangzhun    | Project source | File source
def __init__(self, custom_dict_path=CUSTOM_DICTIONARY_PATH):
        super(JiebaClient, self).__init__()
        try:
            jieba.load_userdict(custom_dict_path)
            self.debug("init JiebaClient, with custom_dict_path=%s", custom_dict_path)
        except Exception, e:
            self.exception(e)
            self.error('@@@@@@@@@@@@@@@@@@@@@@@@@@@ loading custom_dictionary failed')
Project: sentiment-analysis    Author: l-passer    | Project source | File source
def cutwords_jieba(self,sentence,userdict='dict/userdict.txt',stopwords='dict/stopwords.txt'):
        stropw = []
        if userdict:
            jieba.load_userdict(userdict)
            stropw = [line.strip() for line in open(stopwords,'r',encoding='utf-8').readlines()]

        frequency = defaultdict(int)
        l = list(jieba.cut(sentence))
        for t in l:
            frequency[t] += 1

        texts = [token for token in frequency if frequency[token] > 0]

        rtexts = list(set(texts)-set(stropw))
        return rtexts
Project: TPTM    Author: Wind-Ward    | Project source | File source
def read(self,file_name,POS_tag):
        f = open(file_name, "r")
        tempLine=[]
        #vocabulary = {}
        jieba.load_userdict("data/metadata/user_dict.txt")
        for lineNo,line in enumerate(f.readlines()):
            pattern=re.compile("^<d p=\"(.+)\">(.+)</d>")
            m=pattern.match(line)
            if m:
                info=m.group(1).split(',')
                temp={"time":int(float(info[0])), \
                                   "text":[word  for word,flag in pseg.cut(m.group(2))  \
                                           if word not in self.stop_words and flag not in \
                                           POS_tag ],
                                   "lineno":lineNo+1,
                                   "user":info[6]}

                # keep only words longer than one character; keep the line only if at least 3 such words remain
                temp2=[]
                for index,text in enumerate(temp["text"]):
                    if len(text)>1:
                        temp2.append(text)
                if len(temp2)>=3:
                    print(temp2)
                    temp["text"]=temp2
                    tempLine.append(temp)


        lines=sorted(tempLine, key= lambda e:(e.__getitem__('time')))
        print len(lines)
        return lines#,vocabulary
Project: deeplearning4chatbot    Author: liangjz92    | Project source | File source
def __init__(self):
        self.ut_path = '../data/ut.data'
        self.vocab_path = '../data/vocab.data'
        self.ids_path = '../data/ids.data'
        self.train_path = '../data/train.data'
        self.dev_path = '../data/dev.data'
        self.test_path = '../data/test.data'
        self.dict_path = '../data/medical.txt'
        self.emd_path = '../data/emd/ylemd.bin'
        self.tag_path = '../data/tag.data'
        jieba.load_userdict(self.dict_path)
Project: deeplearning4chatbot    Author: liangjz92    | Project source | File source
def __init__(self):
        self.ut_path = '../data/uterance.data'
        self.mark_path = '../data/mark.data'
        self.vocab_path = '../data/vocab.data'
        self.ids_path = '../data/ids.data'
        self.train_path = '../data/train.data'
        self.dev_path = '../data/dev.data'
        self.test_path = '../data/test.data'
        self.dict_path = '../data/medical.txt'
        self.emd_path = '../data/emd/ylemd.bin'
        jieba.load_userdict(self.dict_path)
Project: deeplearning4chatbot    Author: liangjz92    | Project source | File source
def __init__(self,size):
        self.data_path = 'skin.data'
        self.train_size = int(size*0.7)
        self.dev_size = int(size*0.1)
        self.test_size = size - self.train_size - self.dev_size
        jieba.load_userdict('medical.txt')
        self.sentences = []
        self.orders = []
        self.stop_line = []
        for line in open('goodbye.data'):
            line = line.strip()
            self.stop_line.append(line)
        self.ac_dialogs = []
Project: SinaWeiboSpider    Author: SuperSaiyanSSS    | Project source | File source
def __init__(self):
        jieba.load_userdict("keyword.txt")
        jieba.load_userdict("mingan_word.txt")
        self.topK = 12
        self.mingan_list = []
        self.get_mingan_list()
Project: dynamic-seq2seq    Author: yanwii    | Project source | File source
def __init__(self):
        self.encoderFile = "./question.txt"
        self.decoderFile = './answer.txt'
        self.dictFile = 'word_dict.txt'
        jieba.load_userdict(self.dictFile)
        self.stopwordsFile = "./preprocessing/stopwords.dat"
Project: dynamic-seq2seq    Author: yanwii    | Project source | File source
def __init__(self):
        print("tensorflow version: ", tf.__version__)
        tf.reset_default_graph()

        self.encoder_vec_file = "./preprocessing/enc.vec"
        self.decoder_vec_file = "./preprocessing/dec.vec"
        self.encoder_vocabulary = "./preprocessing/enc.vocab"
        self.decoder_vocabulary = "./preprocessing/dec.vocab"
        self.dictFile = './word_dict.txt'
        self.batch_size = 1
        self.max_batches = 100000
        self.show_epoch = 100
        self.model_path = './model/'

        # load the jieba user dictionary
        jieba.load_userdict(self.dictFile)

        self.model = dynamicSeq2seq(encoder_cell=LSTMCell(40),
                                    decoder_cell=LSTMCell(40), 
                                    encoder_vocab_size=600,
                                    decoder_vocab_size=1600,
                                    embedding_size=20,
                                    attention=False,
                                    bidirectional=False,
                                    debug=False,
                                    time_major=True)
        self.location = ["??", "??", "??", "??"]
        self.user_info = {"__username__":"yw", "__location__":"??"}
        self.robot_info = {"__robotname__":"Rr"}
        self.dec_vocab = {}
        self.enc_vocab = {}
        self.dec_vecToSeg = {}
        tag_location = ''
        with open(self.encoder_vocabulary, "r") as enc_vocab_file:
            for index, word in enumerate(enc_vocab_file.readlines()):
                self.enc_vocab[word.strip()] = index
        with open(self.decoder_vocabulary, "r") as dec_vocab_file:
            for index, word in enumerate(dec_vocab_file.readlines()):
                self.dec_vecToSeg[index] = word.strip()
                self.dec_vocab[word.strip()] = index
Project: zhNewsCrawler    Author: YCKung    | Project source | File source
def cut_main():
    jieba.set_dictionary('dict.txt.big')
    #jieba.load_userdict("userdict.txt")
    if len(sys.argv) == 3:
        inputfile = sys.argv[1]
        outputfile = sys.argv[2]
    else:
        print "Usage: python cut.py filetoCut.txt cuttedFile.txt"
        sys.exit()
    readNcut(inputfile,outputfile)
Project: zhNewsCrawler    Author: YCKung    | Project source | File source
def cut_main(inputfile,outputfile):
    jieba.set_dictionary('dict.txt.big')
    #-----user-defined dict-----
    #jieba.load_userdict("userdict.txt")
    readNcut(inputfile,outputfile)
Project: aibot    Author: Qiware    | Project source | File source
def load_userdict():
    """
    Load user dictionary
    """
    # person-name dictionaries (entertainment, sports, politics)
    jieba.load_userdict("./dict/name/amuse.txt");
    jieba.load_userdict("./dict/name/sporter.txt");
    jieba.load_userdict("./dict/name/politicians.txt");

    # sports vocabulary
    jieba.load_userdict("./dict/sport.txt");

    # general dictionary
    jieba.load_userdict("./dict/dict.txt");
Project: aibot    Author: Qiware    | Project source | File source
def load_userdict():
    # person-name dictionaries (entertainment, sports, politics)
    jieba.load_userdict("./dict/name/amuse.txt");
    jieba.load_userdict("./dict/name/sporter.txt");
    jieba.load_userdict("./dict/name/politicians.txt");

    # sports vocabulary
    jieba.load_userdict("./dict/sport.txt");

    # general dictionary
    jieba.load_userdict("./dict/dict.txt");
Project: CloudMusic-Crawler    Author: GreatV    | Project source | File source
def words_split(corpus_path):

    with open(corpus_path, 'r') as f:
        content = f.read()

    jieba.load_userdict('data/userdict.txt')  # load the user dictionary
    jieba.enable_parallel(4)  # enable parallel segmentation


    seg_list = jieba.cut(content, cut_all=False)  # accurate-mode segmentation

    return seg_list


Project: seq2seq    Author: yanwii    | Project source | File source
def __init__(self):
        #self.encoderFile = "/home/yanwii/Python/NLP/seq2seq/seq2seq_no_buckets/preprocessing/MySeq2seq/Data/alldata_ask.txt"
        #self.decoderFile = '/home/yanwii/Python/NLP/seq2seq/seq2seq_no_buckets/preprocessing/MySeq2seq/Data/alldata_answer.txt'
        #self.savePath = '/home/yanwii/Python/NLP/seq2seq/seq2seq_pytorch/data/'
        self.encoderFile = "./data/question.txt"
        self.decoderFile = "./data/answer.txt"
        self.savePath = './data/'

        jieba.load_userdict("./data/supplementvocab.txt")
Project: GeoNews    Author: chunlaw    | Project source | File source
def __init__(self, diction=None, content=None):
        self.diction = diction or "assets/location.dict"
        self.content = content or ""
        jieba.load_userdict(self.diction)
Project: FusionOfMultipleClassifers    Author: chaoming0625    | Project source | File source
def __init__(self):
        self.__root_filepath = "f_dict/"

        jieba.load_userdict("f_dict/user.dict")  # ??????

        # load the sentiment dictionaries
        self.__phrase_dict = self.__get_phrase_dict()
        self.__positive_dict = self.__get_dict(self.__root_filepath + "positive_dict.txt")
        self.__negative_dict = self.__get_dict(self.__root_filepath + "negative_dict.txt")
        self.__conjunction_dict = self.__get_dict(self.__root_filepath + "conjunction_dict.txt")
        self.__punctuation_dict = self.__get_dict(self.__root_filepath + "punctuation_dict.txt")
        self.__adverb_dict = self.__get_dict(self.__root_filepath + "adverb_dict.txt")
        self.__denial_dict = self.__get_dict(self.__root_filepath + "denial_dict.txt")
Project: KnowledgeGraph    Author: SilverHelmet    | Project source | File source
def gen_dataset_from_baike():
    doc_path = os.path.join(rel_ext_dir, 'sample_baike_doc.json')
    out_path = os.path.join(rel_ext_dir, 'data/raw_dataset.txt')

    name2fb_path = os.path.join(cache_dir, 'DatasetFinder.name2fb.cache')
    fb_ttls_path = os.path.join(cache_dir, 'DatasetFinder.fb_ttls.cache')
    finder = DatasetFinder.load_from_cache(name2fb_path, fb_ttls_path)


    Print('load userdict')
    jieba.load_userdict(os.path.join(rel_ext_dir, 'trimmed_baike_dict.txt'))

    Print('gen dataset from [%s]' %doc_path)
    outf = file(out_path, 'w')
    for line in tqdm(file(doc_path), total = nb_lines_of(doc_path)):
        p = line.split('\t')
        baike_url = p[0].decode('utf-8')
        paragraphs = json.loads(p[1])
        for paragraph in paragraphs:
            sentences = split_sentences(paragraph)
            for sentence in sentences:
                cases, words = gen_dataset(sentence, finder)
                if len(cases) > 0:
                    out_obj = {
                        'words': "#".join(words),
                        'cases': map(str, cases),
                    }
                    outf.write("%s\t%s\n" %(baike_url, json.dumps(out_obj, ensure_ascii = False)))
    outf.close()
Project: JiaYuan    Author: EclipseXuLu    | Project source | File source
def segment_text(text):
    # load user dict
    jieba.load_userdict(user_dict)
    # set stop words
    jieba.analyse.set_stop_words(stop_words)
    tags = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=())
    for tag in tags:
        print(str(tag[0]) + "\t" + str(tag[1]))
Project: WaiMaiOpinionMiner    Author: chaoming0625    | Project source | File source
def __init():
    user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
    jieba.load_userdict(user_dict_path)
    jieba.add_word("??", 10000)
    jieba.suggest_freq(("?", "??"))
    jieba.suggest_freq(("??", "??"))
    jieba.suggest_freq(("??", "??"))
    jieba.suggest_freq(("??", "?"))
Project: emotion_analyse_py    Author: jeffmxh    | Project source | File source
def __init__(self,n_core = 16):
        self.rootdir = os.getcwd()
        self.STOP_WORDS_LIST = self.load_txt(path.join(self.rootdir, 'resources', 'stopwords_utf8.txt'))
        self.STOP_WORDS_LIST = set([re.sub('\n', '', item) for item in self.STOP_WORDS_LIST])
        jieba.load_userdict(path.join(self.rootdir, 'resources', 'emotion_user_dict.txt'))
        self.n_CORE=n_core
        jieba.enable_parallel(self.n_CORE-1)
Project: classifier-in-action    Author: shibing624    | Project source | File source
def __init__(self):
        self.__root_path = "data/dict/"
        jieba.load_userdict("data/dict/user.dict")  # ???????

        # load the sentiment dictionaries
        self.__phrase_dict = self.__get_phrase_dict()
        self.__positive_dict = self.__get_dict(self.__root_path + "positive_dict.txt")
        self.__negative_dict = self.__get_dict(self.__root_path + "negative_dict.txt")
        self.__conjunction_dict = self.__get_dict(self.__root_path + "conjunction_dict.txt")
        self.__punctuation_dict = self.__get_dict(self.__root_path + "punctuation_dict.txt")
        self.__adverb_dict = self.__get_dict(self.__root_path + "adverb_dict.txt")
        self.__denial_dict = self.__get_dict(self.__root_path + "denial_dict.txt")
Project: T-SJTTR    Author: Wind-Ward    | Project source | File source
def read(self,file_name,timelength):

        #f = open("data/1993410.txt", "r")
        #timelength = 5640
        # f = open("data/5077534.txt", "r")
        # timelength = 4740
        f = open(file_name, "r")
        #timelength = 2582

        tempLine=[]
        #vocabulary=set()
        vocabulary = {}
        jieba.load_userdict("data/metadata/user_dict.txt")
        for lineNo,line in enumerate(f.readlines()):
            pattern=re.compile("^<d p=\"(.+)\">(.+)</d>")
            m=pattern.match(line)
            if m:
                temp={}
                temp={"time":int(float(m.group(1).split(',')[0])), \
                                   "text":[word  for word,flag in pseg.cut(m.group(2))  \
                                           if word not in self.stop_words and flag not in \
                                           ["m","w","g","c","o","p","z","q","un","e","r","x","d","t","h","k","y","u","s","uj","ul","r","eng"] ],
                                   "lineno":lineNo+1}

                if len(temp["text"])>3:
                    tempLine.append(temp)
                    for item in temp["text"]:
                        if item not in vocabulary:
                            vocabulary[item]=0
        #print(len(tempLine))
        lines=sorted(tempLine, key= lambda e:(e.__getitem__('time')))
        # print vocabulary
        # print  "vocabulary size: %d " % len(vocabulary)
        # print  "video comment size: %d " % len(lines)
        # print  lines[12]
        self.store(lines,timelength)
        return lines,timelength,vocabulary
Project: http_server    Author: chenguolin    | Project source | File source
def __init__(self, user_dict=None):
        """
        Init WordSegment Client

        @user_dict: user dict

        If a user dictionary path is given, it is loaded so that custom terms are segmented as expected.
        """
        self.user_dict = user_dict
        if self.user_dict is not None:
            jieba.load_userdict(self.user_dict)
Project: entity_words_identification    Author: actank    | Project source | File source
def clean():
    jieba.load_userdict("../data/segmention/unigram.txt")
    output = open("./train.data", "w")
    with open("../data/prepare_data", "r") as f:
        for line in f:
            line = unicode(line.strip())

            # convert to lowercase
            line = line.lower()

            # skip queries that are too short
            if len(line) <= 2:
                continue
            # skip queries that are 18-digit IDs
            if re.match('[0-9]{18}', line) != None:
                continue
            # skip queries containing no Chinese characters
            eng_flag = True
            for i in line:
                if i >= u'\u4e00' and i <= u'\u9fa5':
                    eng_flag = False
                    break
            if eng_flag == True:
                continue
            # segment the query with jieba
            ll = jieba.cut(line)
            line = []
            for i in ll:
                if i == u"\u2006" or i == u" " or i == " ":
                    continue
                line.append(i)
            # normalize words via the synonym dictionary
            for i in range(len(line)):
                if synonym_dict.has_key(line[i]):
                    line[i] = synonym_dict[line[i]]

            # skip duplicate queries
            if line in s_list:
                continue
            l = ",".join(line)
            s_list.append(line)
            output.write(l + "\n")
    output.close()
    return
Project: RecommendSystem    Author: dhjack    | Project source | File source
def __init__(self, itemInfos):

        lastTime = time.time()
        # itemInfos : dict[(pid, description)]
        # train model
        jieba.load_userdict('./dict.txt.big.txt')
        stopWords = set([line.strip().decode("gbk").lower() for line in open("./stopWords.txt")])
        stopWords.add('\n')
        stopWords.add(' ')
        stopWords.add(u'\u2022')
        stopWords.add(u'\xa9')
        texts = []
        self.name2id = {}
        self.id2name = []
        for k, v in itemInfos.iteritems():
            seg_list = [w.lower() for w in jieba.cut(v, cut_all=False) if w.lower() not in stopWords]
            texts.append(list(seg_list))
            self.name2id[k] = len(self.id2name)
            self.id2name.append(k)

        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1

        texts = [[token for token in text if frequency[token] > 1] for text in texts]

        print  "start cast :", (time.time() - lastTime)

        lastTime = time.time()
        dictionary = corpora.Dictionary(texts)
        print  "dictionary cast :", (time.time() - lastTime)

        lastTime = time.time()
        corpus = [dictionary.doc2bow(text) for text in texts]
        print  "doc2bow cast :", (time.time() - lastTime)

        lastTime = time.time()
        tfidf = models.TfidfModel(corpus)
        print  "tfid model cast :", (time.time() - lastTime)
        lastTime = time.time()

        lastTime = time.time()
        corpus_tfidf = tfidf[corpus]
        print  "tfidf corpus cast :", (time.time() - lastTime)

        lastTime = time.time()
        self.lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100) 
        print  "lsi model cast :", (time.time() - lastTime)
        lastTime = time.time()

        #corpus_lsi = lsi[corpus_tfidf] 
        self.index = similarities.MatrixSimilarity(self.lsi[corpus]) 
        self.corpus = corpus

        self.pidName = getPidName()
        print "init finish"