Python gensim.models module: Word2Vec() example source code

We extracted the following 40 code examples from open-source Python projects to illustrate how to use gensim.models.Word2Vec().
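
As a quick orientation before the project snippets, here is a minimal, self-contained sketch of the API these examples rely on (gensim 2.x/3.x style, using toy data rather than any project's corpus):

from gensim.models import Word2Vec

toy_sentences = [["human", "machine", "interface"],
                 ["graph", "of", "trees"],
                 ["trees", "and", "graph", "minors"]]
toy_model = Word2Vec(toy_sentences, size=50, window=5, min_count=1, workers=2)
toy_model.save("toy.model")                                  # full model, can be trained further after loading
toy_model.wv.save_word2vec_format("toy.vec", binary=False)   # plain-text vectors only
print(toy_model.wv.most_similar("graph", topn=2))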

Project: TextRankPlus    Author: zuoxiaolei    | project source | file source
def run():
    '''
    Train a word2vec model on the corpus and print the most similar words for each test word.
    '''
    reload(sys)
    sys.setdefaultencoding('utf8')
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp1 = r'wiki_model'
    outp2 = r'vector.txt'
    model = Word2Vec(sentences, size=400, window=5, min_count=5, workers=multiprocessing.cpu_count())
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)

    testData = ['??','??','??','??']
    for i in testData:
        temp = model.most_similar(i)
        for j in temp:
            print '%f %s'%(j[1],j[0])
        print ''
Project: MyCluster    Author: yinminggang    | project source | file source
def trainWord2Vector(sentence_count, vector_dimension, train_count):

    lines, model_out, vector_out = "sources/splited_words.txt", "result/word2vec.model", "result/pre_word2vec.vector"
    logging.info("start training word2vec")
    sentences = LineSentence(lines)
    # min_count drops words that appear fewer than min_count times (those words are not written to word2vec.vector)
    # workers sets the number of parallel training threads (default 3)
    # sg selects the training algorithm (sg=1 means skip-gram)
    model = Word2Vec(sentences, sg=1, size=vector_dimension, window=8,
                     min_count=0, workers=multiprocessing.cpu_count())
    # run several additional training passes to refine the vectors
    for i in range(train_count):
        model.train(sentences=sentences, total_examples=sentence_count, epochs=model.iter)

    # trim unneeded model memory = use(much) less RAM
    # model.init_sims(replace=True)
    model.save(model_out)
    model.wv.save_word2vec_format(vector_out)
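
A note on API drift: this snippet (like most on this page) targets gensim 2.x/3.x. On gensim >= 4.0 the constructor argument `size` became `vector_size`, `iter` became `epochs`, and the attribute `model.iter` is now `model.epochs`; the call above would therefore look roughly like this (a sketch of the newer API, not the project's code):

model = Word2Vec(sentences, sg=1, vector_size=vector_dimension, window=8,
                 min_count=0, workers=multiprocessing.cpu_count(), epochs=5)
model.train(sentences, total_examples=sentence_count, epochs=model.epochs)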
Project: MyCluster    Author: yinminggang    | project source | file source
def trainWord2Vector(sentence_count, vector_dimension, train_count):

    lines, model_out, vector_out = "com/com/test1/test1sources/splited_words.txt", \
                                   "com/com/test1/test1sources/word2vec.model", \
                                   "com/com/test1/test1sources/word2vec.vector"
    logging.info("start training word2vec")
    sentences = LineSentence(lines)
    # min_count drops words that appear fewer than min_count times (those words are not written to word2vec.vector)
    # workers sets the number of parallel training threads (default 3)
    model = Word2Vec(sentences, sg=1, size=vector_dimension, window=8,
                     min_count=0, workers=multiprocessing.cpu_count())
    # run several additional training passes to refine the vectors
    for i in range(train_count):
        model.train(sentences=sentences, total_examples=sentence_count, epochs=model.iter)

    # trim unneeded model memory = use(much) less RAM
    # model.init_sims(replace=True)
    model.save(model_out)
    model.wv.save_word2vec_format(vector_out)
Project: vec4ir    Author: lgalke    | project source | file source
def uptrain(corpus,
            model_path=None,
            binary=True,
            lockf=0.0,
            min_count=1,
            size=300,
            **word2vec_params):
    wv = Word2Vec(min_count=min_count, size=size, **word2vec_params)
    print("Building vocabulary...")
    wv.build_vocab(corpus)
    print("Found %d distinct words." % len(wv.index2word))
    if model_path is not None:
        print("Intersecting with", model_path, "...")
        wv.intersect_word2vec_format(model_path, binary=binary, lockf=lockf)
        print("Intersected vectors locked with", lockf)

    total_examples = len(corpus)
    print("Training on %d documents..." % total_examples)
    wv.train(corpus, total_examples=total_examples)

    return wv
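
A hedged usage sketch for uptrain, assuming the older gensim API that the function above targets; the toy corpus and the pre-trained vector file name are illustrative assumptions, not part of vec4ir:

toy_corpus = [["information", "retrieval", "is", "fun"],
              ["word", "embeddings", "help", "retrieval"]]
model = uptrain(toy_corpus, model_path="pretrained.bin",  # hypothetical word2vec-format file
                binary=True, lockf=1.0, min_count=1)
print(model.most_similar("retrieval", topn=2))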
Project: dutchembeddings    Author: clips    | project source | file source
def create(basedir, num_workers=12, size=320, threshold=5):
        """
        Creates a word2vec model using the Gensim word2vec implementation.

        :param basedir: the dir from which to get the documents.
        :param num_workers: the number of workers to use for training word2vec
        :param size: the size of the resulting vectors.
        :param threshold: the frequency threshold.
        :return: the model.
        """

        logging.basicConfig(level=logging.INFO)
        sentences = SentenceIter(root=basedir)

        model = Word2Vec(sentences=sentences,
                         sg=True,
                         size=size,
                         workers=num_workers,
                         min_count=threshold,
                         window=11,
                         negative=15)
        model.save_word2vec_format("{0}-{1}.wordvecs", "{0}-{1}.vocab")

        return model
Project: PyTorchText    Author: chenyuntc    | project source | file source
def train_save(self, list_csv):
        sentences = MySentences(list_csv)
        num_features = 256
        min_word_count = 1
        num_workers = 20
        context = 5
        epoch = 20
        sample = 1e-5
        model = Word2Vec(
            sentences,
            size=num_features,
            min_count=min_word_count,
            workers=num_workers,
            sample=sample,
            window=context,
            iter=epoch,
        )
        #model.save(model_fn)
        return model
Project: Word2VecStackoverflow    Author: ase-sharif    | project source | file source
def main(positive, negative, topn):
    """ This method train word2vec model, and return most similar tags

    Args:
        positive (list): list of positive tags
        negative (list): list of negative tags
        topn (int): number of top keywords in word2vec

    Returns:
        list: the most similar tags found by the word2vec model

    """
    with open('tags.txt') as f:
        content = f.readlines()
        sentences = [x.split() for x in content]

        model = Word2Vec(sentences, min_count=20)

        return model.most_similar(positive=positive, negative=negative, topn=topn)
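
For context, most_similar combines the positive and negative terms into a single analogy-style query over the trained vocabulary; a minimal call (the tag names are hypothetical and must exist in the vocabulary) looks like:

for tag, score in model.most_similar(positive=['python', 'flask'], negative=['java'], topn=5):
    print(tag, score)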
Project: ShallowLearn    Author: giacbrd    | project source | file source
def fit_embeddings(self, documents):
        """
        Train the word embeddings of the classification model with Gensim ``Word2Vec``, using the same parameter values as for classification.
        Similar to using a pre-trained model.
        :param documents:
        """
        params = self.get_params()
        del params['pre_trained']
        del params['bucket']
        # Word2Vec does not have a softmax loss
        if params['loss'] == 'softmax':
            params['loss'] = 'hs'
        LabeledWord2Vec.init_loss(LabeledWord2Vec(), params, params['loss'])
        del params['loss']
        w2v = Word2Vec(sentences=documents, **params)
        self._classifier = LabeledWord2Vec.load_from(w2v)
Project: entity2rec    Author: D2KLab    | project source | file source
def learn_embeddings(self, output):
        """
        Learn embeddings by optimizing the Skipgram objective using SGD.
        """

        self._simulate_walks()  # simulate random walks

        model = Word2Vec(self._walks, size=self.dimensions, window=self.window_size, min_count=0,
                         workers=self.workers, iter=self.iter, negative=25, sg=1)

        print("defined model using w2v")

        model.wv.save_word2vec_format(output, binary=True)

        print("saved model in word2vec binary format")

        return
Project: deeplearning    Author: fanfanfeng    | project source | file source
def training_word2vec():
    sentences = []

    read_dir_path = os.path.join(defaultPath.PROJECT_DIRECTORY,sogou_classfication.data_path_jieba)
    label_dir_list = os.listdir(read_dir_path)
    for label_dir in label_dir_list:
        label_dir_path = os.path.join(read_dir_path,label_dir)
        label_file_list = os.listdir(label_dir_path)
        for label_file in label_file_list:
            with open(os.path.join(label_dir_path,label_file),'rb') as reader:
                word_list = reader.read().decode('utf-8').replace('\n','').replace('\r','').strip()
                sentences.append(word_list.split())  # Word2Vec expects each sentence as a list of tokens, not one raw string

    model_path = os.path.join(defaultPath.PROJECT_DIRECTORY,sogou_classfication.word2Vect_path)
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    model_save_path = os.path.join(model_path,sogou_classfication.model_name)

    model = Word2Vec(sentences,max_vocab_size=None,window=8,size=256,min_count=5,workers=4,iter=20)
    model.save(model_save_path)
Project: Book_DeepLearning_Practice    Author: wac81    | project source | file source
def load_save_word2vec_model(line_words, model_filename):
    # parameter settings
    feature_size = 500
    content_window = 5
    freq_min_count = 3
    # threads_num = 4
    negative = 3   # negative > 0 switches from hierarchical softmax (often better for infrequent words) to negative sampling (often better for frequent words)
    iter = 20

    print("word2vec...")
    tic = time.time()
    if os.path.isfile(model_filename):
        model = models.Word2Vec.load(model_filename)
        print(model.vocab)
        print("Loaded word2vec model")
    else:
        bigram_transformer = models.Phrases(line_words)
        model = models.Word2Vec(bigram_transformer[line_words], size=feature_size, window=content_window, iter=iter, min_count=freq_min_count,negative=negative, workers=multiprocessing.cpu_count())
        toc = time.time()
        print("Word2vec completed! Elapsed time is %s." % (toc-tic))
        model.save(model_filename)
        # model.save_word2vec_format(save_model2, binary=False)
        print("Word2vec Saved!")
    return model
Project: entity2vec    Author: D2KLab    | project source | file source
def learn_embeddings(self, output, output_format='binary'):
        """
        Learn embeddings by optimizing the Skipgram objective using SGD.
        """

        self._simulate_walks()  # simulate random walks

        model = Word2Vec(self._walks, size=self.dimensions, window=self.window_size, min_count=0,
                         workers=self.workers, iter=self.iter, negative=25, sg=1)

        print("defined model using w2v")

        is_binary = output_format != 'text'
        model.wv.save_word2vec_format(output, binary=is_binary)

        actual_format = 'text' if output_format == 'text' else 'binary'
        print("saved model in word2vec %s format" % actual_format)

        return
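
The vectors written above in word2vec format can later be reloaded without the full model via gensim's KeyedVectors; a small sketch (the file name and entity id are placeholders for whatever was passed as output):

from gensim.models import KeyedVectors

entity_vectors = KeyedVectors.load_word2vec_format("entity_embeddings.bin", binary=True)
print(entity_vectors.most_similar("some_entity_uri", topn=10))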
Project: spacy-dev-resources    Author: explosion    | project source | file source
def main(lang, in_dir, out_loc, negative=5, n_workers=4, window=5, size=128, min_count=10, nr_iter=2):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = Word2Vec(
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative
    )
    nlp = spacy.load(lang, parser=False, tagger=False, entity=False)
    corpus = Corpus(in_dir)
    total_words = 0
    total_sents = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with io.open(text_loc, 'r', encoding='utf8') as file_:
            text = file_.read()
        total_sents += text.count('\n')
        doc = nlp(text)
        total_words += corpus.count_doc(doc)  
        logger.info("PROGRESS: at batch #%i, processed %i words, keeping %i word types",
                    text_no, total_words, len(corpus.strings))
    model.corpus_count = total_sents
    model.raw_vocab = defaultdict(int)
    for orth, freq in corpus.counts:
        if freq >= min_count:
            model.raw_vocab[nlp.vocab.strings[orth]] = freq
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)
    model.save(out_loc)
Project: blstm-cws    Author: chantera    | project source | file source
def gen_embeddings(in_file, out_file, size=100):
    corpus = LineSentence(in_file)
    model = Word2Vec(
        sentences=corpus, size=size, alpha=0.025, window=5, min_count=5,
        max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
        sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
        trim_rule=None, sorted_vocab=1
    )
    model.save_word2vec_format(out_file, binary=False)
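
Note that Word2Vec.save_word2vec_format, called directly on the model above, was removed from the model class in later gensim releases; on a recent gensim the equivalent call goes through the keyed vectors (a sketch, not the project's code):

model.wv.save_word2vec_format(out_file, binary=False)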
Project: pandora    Author: mikekestemont    | project source | file source
def fit(self, tokens):
        # get most frequent items for plotting:
        tokens = [t.lower() for t in tokens]
        self.mfi = [t for t,_ in Counter(tokens).most_common(self.nb_mfi)]
        self.sentence_iterator = SentenceIterator(tokens=tokens)
        # train embeddings:
        self.w2v_model = Word2Vec(self.sentence_iterator,
                             window=self.window,
                             min_count=self.minimum_count,
                             size=self.size,
                             workers=self.nb_workers,
                             negative=self.nb_negative)
        self.plot_mfi()
        self.most_similar()

        # build an index of the train tokens
        # which occur at least min_count times:
        self.token_idx = {'<UNK>': 0}
        for k, v in Counter(tokens).items():
            if v >= self.minimum_count:
                self.token_idx[k] = len(self.token_idx)

        # create an ordered vocab:
        self.train_token_vocab = [k for k, v in sorted(self.token_idx.items(),\
                        key=itemgetter(1))]
        self.pretrained_embeddings = self.get_weights(self.train_token_vocab)

        return self
Project: ChineseSA    Author: cwlseu    | project source | file source
def zhword2vec(ifname, fmodel):
    '''Train the word2vec model.
    More: http://radimrehurek.com/gensim/models/word2vec.html

    '''
    model = Word2Vec(LineSentence(ifname), size = 400, window = 5,
                    min_count = 2, workers = multiprocessing.cpu_count(),negative = 5)
    model.save(fmodel)
    # model.save_word2vec_format(fword2vec, binary=False)
Project: ChineseSA    Author: cwlseu    | project source | file source
def train_model(self, ofmodel, space = ' '):
        if self.traincorpusfname == None or not os.path.exists(self.traincorpusfname):
            ifname = self.__pretrain_model(space)
        else:
            ifname = self.traincorpusfname
        self.logger.info('+++++++++++++++Train Model Start+++++++++++++++++\n')
        #
        # Call the Gensim library to train the word2vec model
        # more: http://radimrehurek.com/gensim/models/word2vec.html
        model = Word2Vec(LineSentence(ifname), size = 400, window = 5,
                    min_count = 2, workers = multiprocessing.cpu_count(),negative = 5)
        self.logger.info('+++++++++++++++Train Model Finished+++++++++++++++++\n')
        model.save(ofmodel)
        return (model, ofmodel)

# if __name__=='__main__':
#     if len(sys.argv) < 3:
#         print(globals()['__doc__'] %locals())
#         sys.exit(1)

#     inp, outp =sys.argv[1:3]
#     #inp = '../../data/zhwiki-latest-pages-articles.xml.bz2','r'
#     #outp = '../../model/word2vec.model'

#     wiki = tWikiCorpus(inp, _lemmatize=False, _dictionary={})
#     print 'wiki'
#     wiki.getTexts(outp, space=' ')
Project: ChineseSA    Author: cwlseu    | project source | file source
def train_model(self, ofmodel, space = ' '):
        if self.traincorpusfname == None or not os.path.exists(self.traincorpusfname):
            ifname = self.pretrain_model(space)
        else:
            ifname = self.traincorpusfname
        self.logger.info('+++++++++++++++Train Model Start+++++++++++++++++\n')
        #
        # Call the Gensim library to train the word2vec model
        # more: http://radimrehurek.com/gensim/models/word2vec.html
        model = Word2Vec(LineSentence(ifname), size = 400, window = 5,
                    min_count = 2, workers = multiprocessing.cpu_count(),negative = 5)
        self.logger.info('+++++++++++++++Train Model Finished+++++++++++++++++\n')
        model.save(ofmodel)
        return (model, ofmodel)
Project: DeepNews    Author: kabrapratik28    | project source | file source
def train_word_2_vec(self,model_save_file_name='../../temp_results/word2vec_hindi.txt'):
        model = Word2Vec(LineSentence(self.raw_file_name), size=300,workers=multiprocessing.cpu_count())
        model.wv.save_word2vec_format(model_save_file_name, binary=False)
Project: w2vec-similarity    Author: jayantj    | project source | file source
def train_and_save(sents, output_file, options = {}):
  print "Training model..."
  model = Word2Vec(sents, **options)
  model.save(output_file)
Project: ShallowLearn    Author: giacbrd    | project source | file source
def __init__(self, loss='softmax', bucket=0, **kwargs):
        """
        Exactly as the parent class `Word2Vec <https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec>`_.
        Some parameter values are overwritten (e.g. sg=0 because we never use skip-gram here); see the code for details.
        Argument names must be explicit!

        `loss` = one value in {ns, hs, softmax}. If "ns" is selected, negative sampling will be used
        as the loss function, together with the parameter `negative`. With "hs" hierarchical softmax will be used,
        while with "softmax" (default) the standard softmax function is used (the other two are "approximations").
        The `hs` argument does not exist anymore.

        `bucket` is the maximum number of hashed words, i.e., we limit the feature space to this number
        by applying the hashing trick to the word vocabulary. Defaults to 0 (no hashing trick).

        It basically builds two vocabularies, one for the sample words and one for the labels,
        so that the input layer is only made of words, while the output layer is only made of labels.
        **Parent class methods that are not overridden here are not tested and not safe to use**.
        """
        self.lvocab = {}  # Vocabulary of labels only
        self.index2label = []
        kwargs['sg'] = 0
        kwargs['window'] = sys.maxsize
        kwargs['sentences'] = None
        kwargs['hashfxn'] = custom_hash  # Force a consistent function across different Python versions
        self.softmax = self.init_loss(kwargs, loss)
        self.bucket = bucket
        super(LabeledWord2Vec, self).__init__(**kwargs)
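
A hedged construction sketch based only on the signature and docstring above; the parameter values are assumptions, and elsewhere on this page the instance is built via load_from or inside fit_embeddings:

clf = LabeledWord2Vec(loss='ns', negative=5, bucket=0, size=100, min_count=2)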
Project: ShallowLearn    Author: giacbrd    | project source | file source
def train(self, sentences, total_words=None, word_count=0,
              total_examples=None, queue_factor=2, report_delay=1.0):
        """
        Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.)

        To support linear learning-rate decay from (initial) alpha to min_alpha, either total_examples
        (count of sentences) or total_words (count of raw words in sentences) should be provided, unless the
        sentences are the same as those that were used to initially build the vocabulary.

        """
        if self.bucket > 0:
            sentences = HashIter(sentences, self.bucket, with_labels=True)
        if (self.model_trimmed_post_training):
            raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method")
        if FAST_VERSION < 0:
            import warnings
            warnings.warn("C extension not loaded for Word2Vec, training will be slow. "
                          "Install a C compiler and reinstall gensim for fast training.")
            self.neg_labels = []
            if self.negative > 0:
                # precompute negative labels optimization for pure-python training
                self.neg_labels = zeros(self.negative + 1)
                self.neg_labels[0] = 1.
        return super(LabeledWord2Vec, self).train(sentences, total_words, word_count,
                                                  total_examples, queue_factor, report_delay)
Project: ShallowLearn    Author: giacbrd    | project source | file source
def load_from(cls, other_model):
        """
        Import data and parameter values from another model.
        :param other_model: A ``LabeledWord2Vec`` object, or a ``Word2Vec`` or ``KeyedVectors`` object of Gensim
        """
        softmax = getattr(other_model, 'softmax', False)
        if softmax:
            loss = 'softmax'
        elif not other_model.hs and other_model.negative:
            loss = 'ns'
        else:
            loss = 'hs'
        new_model = LabeledWord2Vec(
            loss=loss,
            negative=other_model.negative if loss == 'ns' else 0,
            size=other_model.vector_size,
            seed=other_model.seed
        )
        new_model.reset_from(other_model)
        for attr in vars(other_model):
            if hasattr(new_model, attr):
                if not isinstance(other_model, LabeledWord2Vec) and (attr == 'syn1' or attr == 'syn1neg'):
                    continue
                value = getattr(other_model, attr, getattr(new_model, attr))
                if isinstance(value, KeyedVectors):
                    new_model.wv.syn0 = value.syn0
                    new_model.wv.syn0norm = value.syn0norm
                else:
                    setattr(new_model, attr, value)
        return new_model
Project: YelpDataChallenge    Author: fujunswufe    | project source | file source
def load_w2v(corpus, dictionary):
    '''
    Return the trained Word2Vec model
    Train a model if the model doesn't exist yet
    :param corpus:
    :param dictionary:
    :return:
    '''
    if not os.path.isfile(W2V_MODEL_PATH):
        num_features = 300    # Word vector dimensionality
        min_word_count = 5    # Minimum word count
        num_workers = 5       # Number of threads to run in parallel
        window = 5          # Context window size
        downsampling = 1e-5   # Downsample setting for frequent words
        print("Training the word2vec model!")
        sents = get_review_sentences()
        # Initialize and train the model (this will take some time)
        model = models.Word2Vec(sents, workers=num_workers, \
                    size=num_features, min_count = min_word_count, \
                    window = window, sample = downsampling)

        # If you don't plan to train the model any further, calling
        # init_sims will make the model much more memory-efficient.
        model.init_sims(replace=True)

        # It can be helpful to create a meaningful model name and
        # save the model for later use. You can load it later using Word2Vec.load()
        model.save(W2V_MODEL_PATH)
        tfidf = models.Word2Vec(corpus)
        print('Word2vec model created!')

    print('Loading word2vec model')
    w2v = models.Word2Vec.load(W2V_MODEL_PATH)
    print('Loading word2vec model complished!')
    return w2v
Project: Recommendation-based-on-sequence-    Author: Bereket123    | project source | file source
def main():
    load_sequence('/home/beki/Documents/2nd Year/BD & DM Project/retail_dataset.csv')
    # split patterns to train_patterns and test_patterns

    train_patterns = np.random.choice(patterns, int(len(patterns) * 0.8))
    test_patterns = np.random.choice(patterns, int(len(patterns) * 0.2))

    # Word vector representation learning
    model = Word2Vec(train_patterns, size=15, window=3, min_count=1, workers=1, iter=3, sample=1e-4, negative=20)


    # Test
    test_size = float(len(test_patterns))
    hit = 0.0
    for current_pattern in test_patterns:
        if len(current_pattern) < 2:
            test_size -= 1.0
            continue
        # Reduce the current pattern in the test set by removing the last item
        last_item = current_pattern.pop()

        # Keep those items in the reduced current pattern, which are also in the models vocabulary
        items = [it for it in current_pattern if it in model.vocab]
        if len(items) <= 2:
            test_size -= 1.0
            continue

        # Predict the most similar items to items
        prediction = model.most_similar(positive=items)

        # Check if the item that we have removed from the test, last_item, is among
        # the predicted ones.
        for predicted_item, score in prediction:
            if predicted_item == last_item:
                hit += 1.0
                #print last_item
                #print prediction

    print 'Accuracy like measure: {}'.format(hit / test_size)
Project: struc2vec    Author: leoribeiro    | project source | file source
def learn_embeddings():
    '''
    Learn embeddings by optimizing the Skipgram objective using SGD.
    '''
    logging.info("Initializing creation of the representations...")
    walks = LineSentence('random_walks.txt')
    model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, hs=1, sg=1, workers=args.workers, iter=args.iter)
    model.wv.save_word2vec_format(args.output)
    logging.info("Representations created.")

    return
Project: deeplearning    Author: fanfanfeng    | project source | file source
def make_word2vec():
    data_path = tv_classfication.tv_data_path
    sentence = data_work(data_path)
    model = Word2Vec(sentence,size=256,workers=4,window=10,iter=30)
    model.save(tv_classfication.word2vec_path)
Project: GraphEmbeddingsRecommenderSystems    Author: himangshunits    | project source | file source
def process(args):
    # Create a graph from the training set
    nodedict = graph.records_to_graph()

    # Build the model using DeepWalk and Word2Vec
    G = graph.load_adjacencylist("out.adj", undirected=True)

    ####################################################################################################################################################################
    # Code Written for BI Project : Author : Himangshu Ranjan Borah(hborah)
    ####################################################################################################################################################################

    # call the build_deepwalk_corpus function
    # Take and populate the arguments from the command line.
    generated_walks = graph.build_deepwalk_corpus(G = G, num_paths = args.number_walks, path_length = args.walk_length, alpha=0, rand=random.Random(0))
    # Call word2vec to build the model.
    # print generated_walks
    # The structure Looks like ['32173', '32168'], ['124010', '22676'], ['17792', '72925'],
    model = Word2Vec(generated_walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)


    ####################################################################################################################################################################
    # Code Written for BI Project : Author : Himangshu Ranjan Borah(hborah)
    ####################################################################################################################################################################

    # Perform some evaluation of the model on the test dataset
    with open("./data/test_user_ratings.dat") as fin:
        fin.next()
        groundtruth = [line.strip().split("\t")[:3] for line in fin]    # (user, movie, rating)
    tr = [int(round(float(g[2]))) for g in groundtruth]
    pr = [predict_rating(model, nodedict, "u"+g[0], "m"+g[1]) for g in groundtruth]

    print "MSE = %f" % mean_squared_error(tr, pr)
    print "accuracy = %f" % accuracy_score(tr, pr)
    cm = confusion_matrix(tr, pr, labels=range(1,6))
    print cm
Project: wiki_zh_vec    Author: zhouhoo    | project source | file source
def word2vec_train(input_file, output_file):
    sentences = word2vec.LineSentence(input_file)
    model = Word2Vec(sentences, size=300, min_count=10, sg=0, workers=multiprocessing.cpu_count())
    model.save(output_file)
    model.save_word2vec_format(output_file + '.vector', binary=True)
Project: DocumentClassification    Author: liu-nlper    | project source | file source
def train():
    extract_sentece()

    in_path = './Data/corpus/sentence.txt'
    out_path = './Data/embedding/word2vec.bin'
    # train the word2vec model
    model = Word2Vec(
        sg=1, sentences=LineSentence(in_path),
        size=256, window=5, min_count=3, workers=4, iter=40)
    model.wv.save_word2vec_format(out_path, binary=True)
Project: soy    Author: lovit    | project source | file source
def train_word2vec(self, min_count=10, size=100, window=5, workers=3):
        self.word2vec_model = Word2Vec(Word2vecCorpus(self.corpus_file), min_count=min_count, size=size, window=window, workers=workers)
Project: Kaggle_HomeDepot    Author: ChenglongChen    | project source | file source
def __init__(self, df, columns, model_param):
        self.df = df
        self.columns = columns
        self.model_param = model_param
        self.model = Word2Vec(sg=self.model_param["sg"], 
                                hs=self.model_param["hs"], 
                                alpha=self.model_param["alpha"],
                                min_alpha=self.model_param["alpha"],
                                min_count=self.model_param["min_count"], 
                                size=self.model_param["size"], 
                                sample=self.model_param["sample"], 
                                window=self.model_param["window"], 
                                workers=self.model_param["workers"])
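
The constructor above expects model_param to be a plain dict keyed by the names it reads; a hypothetical example with commonly used values (the keys come from the code above, the values are assumptions):

model_param = {
    "sg": 1,           # skip-gram
    "hs": 0,           # negative sampling rather than hierarchical softmax
    "alpha": 0.025,
    "min_count": 3,
    "size": 100,
    "sample": 1e-3,
    "window": 5,
    "workers": 4,
}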
Project: deep-hashtagprediction    Author: jderiu    | project source | file source
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    #unsupervised data
    hashtag_tweets = 'tweets/hashtag_tweets.gz'
    files = [hashtag_tweets]
    sentences = MySentences(files=files)
    model = models.Word2Vec(sentences, size=100, window=5, min_count=15, workers=8,sg=1,sample=1e-5,hs=1)
    model.save_word2vec_format('embeddings/hashtag_tweets_embedding',binary=False)
Project: noungroups    Author: gushecht    | project source | file source
def main(in_dir, out_loc, task=1, size=128, window=5, min_count=10,
    n_workers=4, hs=1, nr_iter=5):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = Word2Vec(
        sg=task,
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        hs=1,
        iter=nr_iter
    )
    corpus = Corpus(in_dir)
    total_words = 0
    total_sents = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with io.open(text_loc, 'r', encoding='utf8') as file_:
            try:
                text = file_.read()
            except UnicodeDecodeError:
                print(text_loc)
        total_sents += text.count('\n')
        total_words += corpus.count_doc(text.split())  
        logger.info("PROGRESS: at batch #%i, processed %i words, keeping %i word types",
                    text_no, total_words, len(corpus.strings))
    model.corpus_count = total_sents
    model.raw_vocab = defaultdict(int)
    for key, string in corpus.strings.items():
        model.raw_vocab[string] = corpus.counts[key]
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)

    # Trims down model
    model.init_sims(replace=True)

    model.save(out_loc)
Project: QA    Author: KiddoZhu    | project source | file source
def train(self, **kargs) :
        self.config.update(kargs)
        self.model = _Word2Vec(list(self.database.sentences), **self.config)
        delattr(self, "database")
Project: rna_protein_binding    Author: wentaozhu    | project source | file source
def train_rnas(seq_file = 'utrs.fa', outfile= 'rnadocEmbedding25.pickle'):
    min_count = 5
    dim = 50
    window = 5

    print('dim: ' + str(dim) + ', window: ' + str(window))
    seq_dict = read_fasta_file(seq_file)

    #text = seq_dict.values()
    tris = get_6_trids()
    sentences = []
    for seq in seq_dict.values():
        seq = seq.replace('T', 'U')
        bag_sen = []
        bag_seqs = split_overlap_seq(seq)
        for new_seq in bag_seqs:
            trvec = get_4_nucleotide_composition(tris, new_seq)
            bag_sen.append(trvec)
        #for aa in range(len(text)):
        sentences.append(bag_sen)
    #pdb.set_trace()
    print(len(sentences))
    model = None
    docs = train_tag_doc(sentences)
    #model = Word2Vec(sentences, min_count=min_count, size=dim, window=window, sg=1, iter = 10, batch_words=100)
    #model =  gensim.models.doc2vec.Doc2Vec(docs, size = 50, window = 300, min_count = min_count, workers = 4)
    model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=min_count, iter=50)
    model.build_vocab(docs)
    model.train(docs)
    '''vocab = list(model.vocab.keys())
    print vocab
    fw = open('rna_doc_dict', 'w')
    for val in vocab:
        fw.write(val + '\n')
    fw.close()
    #print model.syn0
    #pdb.set_trace()
    embeddingWeights = np.empty([len(vocab), dim])

    for i in range(len(vocab)):
        embeddingWeights[i,:] = model[vocab[i]]  

    allWeights.append(embeddingWeights)

    '''
    #model.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])
    #with open(outfile, 'w') as f:
    #    pickle.dump(model, f)
    # store the model to mmap-able files
    pdb.set_trace()
    model.save(outfile)
    # load the model back
    #model_loaded = Doc2Vec.load(outfile)
Project: CMU-MultimodalDataSDK    Author: A2Zadeh    | project source | file source
def load_w2v(self):
        """
        Load Word2Vec embeddings from P2FA files and pre-trained Word2Vec 
        KeyedVectors text file and store them in the 
        directory path mentioned in self.embedding_dir.
        :returns segment wise feature dictionary for embeddings
        :Note: Do not provide KeyedVector file in binary format
        """
        from gensim.models.keyedvectors import KeyedVectors
        from gensim.models import Word2Vec

        is_binary = True if self.embed_model_type == "binary" else False
        model = KeyedVectors.load_word2vec_format(self.embed_model_path, 
                                                  binary = is_binary )
        print "Word2Vec model Loaded"
        self.embed_model = model
        self.embed_length = model.vector_size
        if not self.word_dict:
            self.load_words()

        features = {}
        system("mkdir -p "+self.embedding_dir)
        for video_id, video_word_data in self.word_dict.iteritems():
            video_feats = {}
            for segment_id, segment_word_data in video_word_data.iteritems():
                video_feats[segment_id] = []
                for word_feat in segment_word_data:
                    start, end, word = word_feat
                    try:
                        embed = self.embed_model[word]
                    except:
                        embed = np.zeros(self.embed_length)
                    video_feats[segment_id].append((start, end, embed))

                fname = video_id+"_"+segment_id+".csv"
                fpath = join(self.embedding_dir, fname)
                with open(fpath,"wb") as fh:
                    # Writing each feature in csv file for segment
                    for f in video_feats[segment_id]:
                        f_start = str(f[0])
                        f_end = str(f[1])
                        f_val = [str(val) for val in f[2].tolist()]
                        str2write = ",".join([f_start, f_end] + f_val)
                        str2write += "\n"
                        fh.write(str2write)
            features[video_id] = video_feats
        return features
Project: Quora-Kaggle    Author: PPshrimpGo    | project source | file source
def makeFeature(df_features):
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    print ('get sentence vector')
    model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
    # model = KeyedVectors.load_word2vec_format('glove.6B.300d.txt', binary=False)
    # model = Word2Vec(brown.sents())
    df_features['vec1'] = df_features.q1_expand.map(lambda x: getVec(x, model))
    df_features['vec2'] = df_features.q2_expand.map(lambda x: getVec(x, model))

    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    print ('get six kinds of coefficient about vector')
    df_features['f_cosine'] = df_features.apply(lambda x: Cosine(x['vec1'], x['vec2']), axis=1)
    df_features['f_manhatton'] = df_features.apply(lambda x: Manhatton(x['vec1'], x['vec2']), axis=1)
    df_features['f_euclidean'] = df_features.apply(lambda x: Euclidean(x['vec1'], x['vec2']), axis=1)
    df_features['f_pearson'] = df_features.apply(lambda x: PearsonSimilar(x['vec1'], x['vec2']), axis=1)
    df_features['f_spearman'] = df_features.apply(lambda x: SpearmanSimilar(x['vec1'], x['vec2']), axis=1)
    df_features['f_kendall'] = df_features.apply(lambda x: KendallSimilar(x['vec1'], x['vec2']), axis=1)

    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    print ('get 3 kinds of coefficient about from w2c 2 document')
    df_features['f_cosine_w2v'] = df_features.apply(lambda x: getfromw2v(x['q1_expand'], x['q2_expand'],Cosine, model), axis=1)
    df_features['f_euclidean_w2v'] = df_features.apply(lambda x: getfromw2v(x['q1_expand'], x['q2_expand'],Euclidean, model), axis=1)
    df_features['f_manhatton_w2v'] = df_features.apply(lambda x: getfromw2v(x['q1_expand'], x['q2_expand'],Manhatton, model), axis=1)

    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    print ('get three kinds of coefficient about nouns, verb, adj')
    df_features['f_raw_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['question1'], x['question2']), axis=1)
    df_features['f_raw_dice'] = df_features.apply(lambda x: Dice(x['question1'], x['question2']),axis=1)
    df_features['f_raw_ochiai'] = df_features.apply(lambda x: Ochiai(x['question1'], x['question2']), axis=1)
    df_features['f_expand_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['q1_expand'], x['q2_expand']), axis=1)
    df_features['f_expand_dice'] = df_features.apply(lambda x: Dice(x['q1_expand'], x['q2_expand']),axis=1)
    df_features['f_expand_ochiai'] = df_features.apply(lambda x: Ochiai(x['q1_expand'], x['q2_expand']), axis=1)
    df_features['f_nouns_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['question1_nouns'], x['question2_nouns']), axis=1)
    df_features['f_nouns_dice'] = df_features.apply(lambda x: Dice(x['question1_nouns'], x['question2_nouns']),axis=1)
    df_features['f_nouns_ochiai'] = df_features.apply(lambda x: Ochiai(x['question1_nouns'], x['question2_nouns']), axis=1)
    df_features['f_verbs_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['question1_verbs'], x['question2_verbs']), axis=1)
    df_features['f_verbs_dice'] = df_features.apply(lambda x: Dice(x['question1_verbs'], x['question2_verbs']),axis=1)
    df_features['f_verbs_ochiai'] = df_features.apply(lambda x: Ochiai(x['question1_verbs'], x['question2_verbs']), axis=1)
    df_features['f_adjs_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['question1_adjs'], x['question2_adjs']), axis=1)
    df_features['f_adjs_dice'] = df_features.apply(lambda x: Dice(x['question1_adjs'], x['question2_adjs']),axis=1)
    df_features['f_adjs_ochiai'] = df_features.apply(lambda x: Ochiai(x['question1_adjs'], x['question2_adjs']), axis=1)

    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    print ('get weighted overlap about expand')
    weights = word_weights(df_features)
    df_features['f_weighted_overlap'] = df_features.apply(lambda x: weighted_Overlap(x['q1_expand'], x['q2_expand'], weights), axis=1)

    print('all done')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    df_features = df_features.fillna(0.0)  # fillna returns a new frame, so reassign it before returning
    return df_features
Project: liveqa2017    Author: codekansas    | project source | file source
def get_word_embeddings(num_dimensions=500,
                        cache_loc=EMBEDDINGS_FILE):
    """Generates word embeddings.

    Args:
        num_dimensions: int, number of embedding dimensions.
        cache_loc: str, where to cache the word embeddings.

    Returns:
        numpy array representing the embeddings, with shape (NUM_TOKENS,
            num_dimensions).
    """

    if os.path.exists(cache_loc):
        embeddings = np.load(cache_loc)
    else:
        class SentenceGenerator(object):
            def __iter__(self):
                iterable = itertools.islice(iterate_qa_pairs(), 1000000)
                for i, (question, answer) in enumerate(iterable, 1):
                    q, a, _, _ = tokenize(question=question, answer=answer,
                                          use_pad=False, include_rev=False)
                    yield [str(w) for w in q]
                    yield [str(w) for w in a]

                    del q, a, w

                    if i % 1000 == 0:
                        sys.stderr.write('\rprocessed %d' % i)
                        sys.stderr.flush()

                sys.stderr.write('\rprocessed %d\n' % i)
                sys.stderr.flush()

        # The default embeddings.
        embeddings = np.random.normal(size=(NUM_TOKENS, num_dimensions))

        sentences = SentenceGenerator()
        model = models.Word2Vec(sentences, size=num_dimensions)

        word_vectors = model.wv
        del model

        # Puts the Word2Vec weights into the right order.
        weights = word_vectors.syn0
        vocab = word_vectors.vocab
        for k, v in vocab.items():
            embeddings[int(k)] = weights[v.index]

        with open(cache_loc, 'wb') as f:
            np.save(f, embeddings)
            pass

    assert embeddings.shape == (NUM_TOKENS, num_dimensions)
    return embeddings
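
The syn0 and vocab attributes used above belong to the pre-4.0 gensim API; on gensim >= 4.0 the same reordering step would look roughly like this (a sketch, not part of liveqa2017):

weights = word_vectors.vectors                            # formerly word_vectors.syn0
for token, index in word_vectors.key_to_index.items():    # formerly word_vectors.vocab
    embeddings[int(token)] = weights[index]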
Project: diversity_based_attention    Author: PrekshaNema25    | project source | file source
def get_global_embeddings(self, filenames, embedding_size, embedding_dir):
        """ Construct the Embedding Matrix for the sentences in filenames.

            Args:
                filenames: File names of the training files, from which
                the vocab will be built. This is used when no pretrained
                embeddings are present; then, instead of using random
                embeddings, the Word2Vec algorithm is used to train
                embeddings on the available dataset.
                embedding_size: Dimensions for the embedding to be used.

            Returns
                Embedding matrix.
        """
        sentences = []

        if (os.path.exists(embedding_dir + 'vocab_len.pkl')):
                vocab_len_stored = pickle.load(open(embedding_dir + "vocab_len.pkl"))
        else:
                vocab_len_stored = 0

        if (vocab_len_stored == self.len_vocab and os.path.exists(embedding_dir + "embeddings.pkl")):
                print ("Load file")
                self.embeddings = pickle.load(open(embedding_dir +  "embeddings.pkl"))
                return None

        if (os.path.exists(embedding_dir + 'embeddings') == True):
            model = KeyedVectors.load_word2vec_format(embedding_dir + 'embeddings', binary = False)
            print ("Loading pretriained embeddings")

        else:
            for file in filenames:
                with open(file, 'rb') as f:
                    for lines in f:
                        words = [lines.split()]
                        sentences.extend(words)

            model = Word2Vec(sentences, size=embedding_size, min_count=0)
            model.save(embedding_dir + 'embeddings')

        self.embeddings_model = model
        return model