Python gensim.models module: word2vec usage examples (source code)

The following 8 code examples, extracted from open-source Python projects, illustrate how to use the gensim.models.word2vec module.
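Before the project snippets, a minimal usage sketch (toy sentences are made up; pre-4.0 gensim signatures such as `size=` are used to match the model-level `vocab` attribute the examples below rely on):

from gensim.models import word2vec

# Train a small model on toy data, save it, and query it.
sentences = [["human", "interface", "computer"],
             ["survey", "user", "computer", "system", "response", "time"]]
model = word2vec.Word2Vec(sentences, size=100, window=5, min_count=1)
model.save("word2vec")  # reload later with word2vec.Word2Vec.load("word2vec")
print(model.most_similar("computer", topn=3))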

Project: word2vec-keras-in-gensim    Author: niitsuma | Project source | File source
def train_batch_score_cbow_xy_generator(model, scored_word_sentences):
    for scored_word_sentence in scored_word_sentences:
        # Keep only in-vocabulary words that survive down-sampling; each item is [vocab_entry, score].
        scored_word_vocabs = [[model.vocab[w], s] for w, s in scored_word_sentence
                              if w in model.vocab and model.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, scored_word in enumerate(scored_word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(scored_word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [scored_word2[0].index for pos2, scored_word2 in window_pos
                             if scored_word2 is not None and scored_word2[0] is not None and pos2 != pos]
            xy_gen = train_cbow_pair(model, scored_word[0], word2_indices, None, None)
            for xy in xy_gen:
                if xy is not None:
                    # Append the word's score to the (x0, x1, x2) training pair.
                    yield [xy[0], xy[1], xy[2], [scored_word[1]]]
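A hedged sketch of draining this generator, assuming `model` is the project's Keras-patched gensim model (with `train_cbow_pair` in scope) and each sentence is a list of [word, score] pairs; every yielded item bundles the `train_cbow_pair` output with the word's score:

scored_sentences = [[["this", 0.9], ["movie", 0.9], ["rocks", 0.9]]]  # made-up data
for x0, x1, x2, score in train_batch_score_cbow_xy_generator(model, scored_sentences):
    print(score)  # e.g. [0.9]: the score attached to this training pair
    break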

Project: korean_restaurant_reservation    Author: JudeLee19 | Project source | File source
def __init__(self, fname='data/korean_word2vec', dim=300):
        self.dim = dim
        try:
            # Load a previously trained and saved Korean word2vec model from disk.
            print('Loading korean word2vec model')
            self.model = word2vec.Word2Vec.load(fname)
        except Exception:
            # Avoid a bare `except:`, which would also swallow KeyboardInterrupt.
            print(':: There is no word2vec model')
Project: 100knock2016    Author: tmu-nlp | Project source | File source
def extract_countries():
    countries_vec = {}
    vec = word2vec.Word2Vec.load("word2vec")
    with open("../chapter09/countries.txt", "r") as f:
        for line in f:
            # Multi-word country names are stored with underscores in the model.
            country = line.strip().replace(" ", "_")
            if country in vec.vocab:
                countries_vec[country] = vec[country]

    return countries_vec
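A hedged follow-up: with the returned name-to-vector dict, country similarity reduces to cosine similarity between entries (whether "Japan" and "France" are actually present depends on countries.txt and the trained model):

import numpy as np

countries = extract_countries()
a, b = countries["Japan"], countries["France"]  # hypothetical keys
print(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))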
Project: dstc6_dialogue_breakdown_task    Author: JudeLee19 | Project source | File source
def __init__(self, file_name, dim=300):
        self.dim = dim
        try:
            # Load a previously saved English word2vec model from disk.
            print('Loading english word2vec model')
            self.word2vec_model = word2vec.Word2Vec.load(file_name)
        except Exception:
            # Avoid a bare `except:`, which would also swallow KeyboardInterrupt.
            print('Error while loading word2vec model')
Project: YahooAnswer-LSTM    Author: rgtjf | Project source | File source
import pickle
from functools import reduce

import numpy as np
from gensim.models import word2vec


def load_embedding(data, embedding_file, binary=True, prefix=None, file_name='embedding.pkl'):
    """Build (or reload) an embedding matrix for the vocabulary found in `data`.

    :param data: iterable of (sentence, label) pairs; each sentence is an iterable of tokens
    :param embedding_file: path to a pretrained word2vec file, e.g. GoogleNews-vectors-negative300.bin
    :param binary: whether `embedding_file` is in binary word2vec format
    :param prefix: if None, build the matrix and pickle it to `file_name`; otherwise load the pickle at `prefix`
    :param file_name: where to write the pickled (embedding, vocab_size, word_idx)
    :return: (vocab_size, word_idx, embedding)
    """
    if prefix is None:
        vocab = sorted(reduce(lambda x, y: x | y, (set(sentence) for sentence, _ in data)))
        word_idx = dict((c, i) for i, c in enumerate(vocab))
        vocab_size = len(word_idx) + 1  # +1 for nil word (no row is appended for it below)

        # e.g. "/home/junfeng/word2vec/GoogleNews-vectors-negative300.bin"
        model = word2vec.Word2Vec.load_word2vec_format(embedding_file, binary=binary)

        embedding = []
        for c in vocab:  # vocab order matches the indices assigned in word_idx
            if c in model:
                embedding.append(model[c])
            else:
                # Random init for out-of-vocabulary words; the original
                # uniform(0.1, 0.1, 300) produced a constant-0.1 vector by mistake.
                embedding.append(np.random.uniform(-0.1, 0.1, 300))
        embedding = np.array(embedding, dtype=np.float32)
        with open(file_name, 'wb') as f:
            pickle.dump(embedding, f)
            pickle.dump(vocab_size, f)
            pickle.dump(word_idx, f)
    else:
        with open(prefix, 'rb') as f:
            embedding = pickle.load(f)
            vocab_size = pickle.load(f)
            word_idx = pickle.load(f)

    return vocab_size, word_idx, embedding
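A hedged usage sketch: the first call builds the matrix from a pretrained binary and caches it to embedding.pkl; later calls can point `prefix` at the cache and skip loading the large binary entirely (paths and `data` are assumptions):

# First run: build from pretrained vectors and cache the result.
vocab_size, word_idx, embedding = load_embedding(
    data, "GoogleNews-vectors-negative300.bin", binary=True)

# Later runs: reload the pickle; embedding_file is ignored here.
vocab_size, word_idx, embedding = load_embedding(
    data, None, prefix='embedding.pkl')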
Project: word2vec-keras-in-gensim    Author: niitsuma | Project source | File source
def train_batch_sg(model, sentences, alpha=None, work=None, sub_batch_size=256, batch_size=256):

    batch_count = 0
    sub_batch_count = 0
    train_x0 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_x1 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_y = np.zeros((batch_size, sub_batch_size), dtype='int8')

    # Infinite generator: cycles over `sentences` forever, yielding fixed-size
    # batches of skip-gram training pairs.
    while True:
        for sentence in sentences:
            # Keep only in-vocabulary words that survive down-sampling.
            word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and
                           model.vocab[w].sample_int > model.random.rand() * 2**32]
            for pos, word in enumerate(word_vocabs):
                reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code

                # now go over all words from the (reduced) window, predicting each one in turn
                start = max(0, pos - model.window + reduced_window)
                for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                    # don't train on the `word` itself
                    if pos2 != pos:
                        xy_gen = train_sg_pair(model, model.index2word[word.index], word2.index)
                        for xy in xy_gen:
                            if xy is not None:
                                x0, x1, y = xy
                                train_x0[batch_count][sub_batch_count] = x0
                                train_x1[batch_count][sub_batch_count] = x1
                                train_y[batch_count][sub_batch_count] = y
                                sub_batch_count += 1
                                if sub_batch_count >= sub_batch_size:
                                    batch_count += 1
                                    sub_batch_count = 0
                                if batch_count >= batch_size:
                                    yield {'index': train_x0, 'point': train_x1, 'code': train_y}
                                    batch_count = 0
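A hedged sketch of consuming the batches: the generator never terminates, so it pairs naturally with fit_generator-style Keras training loops; `model` is assumed to be the project's patched gensim model with `train_sg_pair` in scope:

batches = train_batch_sg(model, sentences, sub_batch_size=256, batch_size=256)
batch = next(batches)  # one dict of three (256, 256) arrays
print(batch['index'].shape, batch['point'].shape, batch['code'].shape)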
Project: word2vec-keras-in-gensim    Author: niitsuma | Project source | File source
def train_batch_cbow_xy_generator(model, sentences):
    for sentence in sentences:
        # Keep only in-vocabulary words that survive down-sampling.
        word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and
                       model.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [word2.index for pos2, word2 in window_pos if word2 is not None and pos2 != pos]
            xy_gen = train_cbow_pair(model, word, word2_indices, None, None)
            for xy in xy_gen:
                if xy is not None:
                    yield xy
Project: word2vec-keras-in-gensim    Author: niitsuma | Project source | File source
def train_batch_score_sg(model, scored_word_sentences,
                         score_vector_size,
                         alpha=None, work=None,
                         sub_batch_size=256,
                         batch_size=256):

    batch_count = 0
    sub_batch_count = 0
    train_x0 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_x1 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_y0 = np.zeros((batch_size, sub_batch_size), dtype='int8')
    train_y1 = np.zeros((batch_size, sub_batch_size, score_vector_size), dtype='float32')

    # Infinite generator over (word, score) sentences: emits skip-gram batches
    # plus a parallel tensor of scores.
    while True:
        for scored_word_sentence in scored_word_sentences:
            # Keep only in-vocabulary words that survive down-sampling; each item is [vocab_entry, score].
            word_vocabs = [[model.vocab[w], s] for w, s in scored_word_sentence if w in model.vocab and
                           model.vocab[w].sample_int > model.random.rand() * 2**32]
            for pos, scored_word in enumerate(word_vocabs):
                reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
                word = scored_word2word(scored_word)
                # now go over all words from the (reduced) window, predicting each one in turn
                start = max(0, pos - model.window + reduced_window)
                for pos2, scored_word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                    word2 = scored_word2word(scored_word2)
                    # don't train on the `word` itself
                    if pos2 != pos:
                        xy_gen = train_sg_pair(model, model.index2word[word.index], word2.index)  # , alpha)
                        for xy in xy_gen:
                            if xy is not None:
                                x0, x1, y0 = xy
                                y1 = scored_word2score(scored_word)
                                train_x0[batch_count][sub_batch_count] = x0
                                train_x1[batch_count][sub_batch_count] = x1
                                train_y0[batch_count][sub_batch_count] = y0
                                train_y1[batch_count][sub_batch_count] = y1
                                sub_batch_count += 1
                                if sub_batch_count >= sub_batch_size:
                                    batch_count += 1
                                    sub_batch_count = 0
                                if batch_count >= batch_size:
                                    yield {'index': train_x0, 'point': train_x1, 'code': train_y0, 'score': train_y1}
                                    batch_count = 0
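The same consumption pattern with scores, as a hedged sketch; `scored_sentences` items are assumed to be lists of [word, score] pairs, and the extra 'score' tensor has shape (batch_size, sub_batch_size, score_vector_size):

gen = train_batch_score_sg(model, scored_sentences, score_vector_size=1)
batch = next(gen)
print(batch['score'].shape)  # (256, 256, 1)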
