Python keras.preprocessing.sequence module: pad_sequences() example source code

We have extracted the following 50 code examples from open-source Python projects to illustrate how to use keras.preprocessing.sequence.pad_sequences().

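For orientation before the project examples, here is a minimal sketch of the call itself; the sample sequences and maxlen=5 are arbitrary illustrative values, not taken from any of the projects below.

from keras.preprocessing.sequence import pad_sequences

sequences = [[1, 2, 3], [4, 5], [6, 7, 8, 9, 10, 11]]

# Default behaviour: pad and truncate on the left ('pre') with value 0.
print(pad_sequences(sequences, maxlen=5))
# [[ 0  0  1  2  3]
#  [ 0  0  0  4  5]
#  [ 7  8  9 10 11]]

# 'post' padding/truncating keeps the beginning of each sequence instead.
print(pad_sequences(sequences, maxlen=5, padding='post', truncating='post'))
# [[ 1  2  3  0  0]
#  [ 4  5  0  0  0]
#  [ 6  7  8  9 10]]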
Project: SNLI-Keras | Author: adamzjk | Project source | File source
def prep_data(self):
    # 1, Read raw Training, Validation and Test data
    self.train,self.validation,self.test = self.load_data()

    # 2, Prep Word Indexer: assign each word a number
    self.indexer = Tokenizer(lower=False, filters='')
    self.indexer.fit_on_texts(self.train[0] + self.train[1]) # todo remove test
    self.Vocab = len(self.indexer.word_counts) + 1

    # 3, Convert each word in sent to num and zero pad
    def padding(x, MaxLen):
      return pad_sequences(sequences=self.indexer.texts_to_sequences(x), maxlen=MaxLen)
    def pad_data(x):
      return padding(x[0], self.SentMaxLen), padding(x[1], self.SentMaxLen), x[2]

    self.train = pad_data(self.train)
    self.validation = pad_data(self.validation)
    self.test = pad_data(self.test)
Project: hyperas | Author: maxpumperla | Project source | File source
def data():
    maxlen = 100
    max_features = 20000

    print('Loading data...')
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')

    print("Pad sequences (samples x time)")
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    return X_train, X_test, y_train, y_test, max_features, maxlen
Project: Deep-Learning-with-Keras | Author: PacktPublishing | Project source | File source
def build_tensor(filename, numrecs, word2index, maxlen, 
                 make_categorical=False):
    data = np.empty((numrecs, ), dtype=list)
    fin = open(filename, "rb")
    i = 0
    for line in fin:
        wids = []
        for word in line.strip().split():
            if word in word2index:
                wids.append(word2index[word])
            else:
                wids.append(word2index["UNK"])
        if make_categorical:
            data[i] = np_utils.to_categorical(
                wids, num_classes=len(word2index))
        else:
            data[i] = wids
        i += 1
    fin.close()
    pdata = sequence.pad_sequences(data, maxlen=maxlen)
    return pdata
Project: Deep-Learning-with-Keras | Author: PacktPublishing | Project source | File source
def generate_batch(s_sents, s_word2index, t_sents, t_word2index, 
                   batch_size, maxlen):
    while True:
        # shuffle the input
        indices = np.random.permutation(np.arange(len(s_sents)))
        ss_sents = [s_sents[ix] for ix in indices]
        ts_sents = [t_sents[ix] for ix in indices]
        # convert to word indices
        si_sents = [[get_or_else(s_word2index, word, s_word2index["UNK"]) 
                    for word in sent] 
                    for sent in ss_sents]
        ti_sents = [[t_word2index[word] for word in sent]
                    for sent in ts_sents]
        # inner loop should run for an epoch
        num_batches = len(s_sents) // batch_size
        for i in range(num_batches):
            s_batch = si_sents[i * batch_size : (i + 1) * batch_size]
            t_batch = ti_sents[i * batch_size : (i + 1) * batch_size]
            sp_batch = sequence.pad_sequences(s_batch, maxlen=maxlen)
            tp_batch = sequence.pad_sequences(t_batch, maxlen=maxlen)
            tpc_batch = np_utils.to_categorical(tp_batch.reshape(-1, 1), 
                num_classes=len(t_word2index)).reshape(batch_size, 
                -1, len(t_word2index))
            yield sp_batch, tpc_batch
Project: SNLI-Keras | Author: adamzjk | Project source | File source
def label_test_file(self):
    outfile = open("pred_vld.txt","w")
    prep_alfa = lambda X: pad_sequences(sequences=self.indexer.texts_to_sequences(X),
                                        maxlen=self.SentMaxLen)
    vld = json.loads(open('validation.json', 'r').read())
    for prem, hypo, label in zip(vld[0], vld[1], vld[2]):
      prem_pad, hypo_pad = prep_alfa([prem]), prep_alfa([hypo])
      ans = np.reshape(self.model.predict(x=[prem_pad, hypo_pad], batch_size = 1), -1)  # PREDICTION
      if np.argmax(ans) != label:
        outfile.write(prem + "\n" + hypo + "\n")
        outfile.write("Truth: " + self.rLabels[label] + "\n")
        outfile.write('Contradiction \t{:.1f}%\n'.format(float(ans[0]) * 100) +
                      'Neutral \t\t{:.1f}%\n'.format(float(ans[1]) * 100) +
                      'Entailment \t{:.1f}%\n'.format(float(ans[2]) * 100))
        outfile.write("-"*15 + "\n")
    outfile.close()
Project: recurrent-attention-for-QA-SQUAD-based-on-keras | Author: wentaozhu | Project source | File source
def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective indices and pad contexts to the max context length and questions to the max question length.
       Answer vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]
        # map the first and last words of answer span to one-hot representations
        y_Begin =  np.zeros(len(xContext[i]))
        y_Begin[xAnswerBeing[i]] = 1
        y_End = np.zeros(len(xContext[i]))
        y_End[xAnswerEnd[i]] = 1
        X.append(x)
        Xq.append(xq)
        YBegin.append(y_Begin)
        YEnd.append(y_End)
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post'), pad_sequences(YBegin, maxlen=context_maxlen, padding='post'), pad_sequences(YEnd, maxlen=context_maxlen, padding='post')

# for validation dataset
Project: recurrent-attention-for-QA-SQUAD-based-on-keras | Author: wentaozhu | Project source | File source
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective indices and pad contexts to the max context length and questions to the max question length.
       Answer vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]

        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')
Project: recurrent-attention-for-QA-SQUAD-based-on-keras | Author: wentaozhu | Project source | File source
def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective indices and pad contexts to the max context length and questions to the max question length.
       Answer vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]
        # map the first and last words of answer span to one-hot representations
        y_Begin =  np.zeros(len(xContext[i]))
        y_Begin[xAnswerBeing[i]] = 1
        y_End = np.zeros(len(xContext[i]))
        y_End[xAnswerEnd[i]] = 1
        X.append(x)
        Xq.append(xq)
        YBegin.append(y_Begin)
        YEnd.append(y_End)
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post'), pad_sequences(YBegin, maxlen=context_maxlen, padding='post'), pad_sequences(YEnd, maxlen=context_maxlen, padding='post')

# for validation dataset
Project: recurrent-attention-for-QA-SQUAD-based-on-keras | Author: wentaozhu | Project source | File source
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective indices and pad contexts to the max context length and questions to the max question length.
       Answer vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]

        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')
Project: recurrent-attention-for-QA-SQUAD-based-on-keras | Author: wentaozhu | Project source | File source
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective indices and pad contexts to the max context length and questions to the max question length.
       Answer vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]

        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')
Project: recurrent-attention-for-QA-SQUAD-based-on-keras | Author: wentaozhu | Project source | File source
def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective indices and pad contexts to the max context length and questions to the max question length.
       Answer vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]
        # map the first and last words of answer span to one-hot representations
        y_Begin =  np.zeros(len(xContext[i]))
        y_Begin[xAnswerBeing[i]] = 1
        y_End = np.zeros(len(xContext[i]))
        y_End[xAnswerEnd[i]] = 1
        X.append(x)
        Xq.append(xq)
        YBegin.append(y_Begin)
        YEnd.append(y_End)
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post'), pad_sequences(YBegin, maxlen=context_maxlen, padding='post'), pad_sequences(YEnd, maxlen=context_maxlen, padding='post')

# for validation dataset
Project: recurrent-attention-for-QA-SQUAD-based-on-keras | Author: wentaozhu | Project source | File source
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective indices and pad contexts to the max context length and questions to the max question length.
       Answer vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]

        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')
Project: recurrent-attention-for-QA-SQUAD-based-on-keras | Author: wentaozhu | Project source | File source
def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective indices and pad contexts to the max context length and questions to the max question length.
       Answer vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]
        # map the first and last words of answer span to one-hot representations
        y_Begin =  np.zeros(len(xContext[i]))
        y_Begin[xAnswerBeing[i]] = 1
        y_End = np.zeros(len(xContext[i]))
        y_End[xAnswerEnd[i]] = 1
        X.append(x)
        Xq.append(xq)
        YBegin.append(y_Begin)
        YEnd.append(y_End)
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post'), pad_sequences(YBegin, maxlen=context_maxlen, padding='post'), pad_sequences(YEnd, maxlen=context_maxlen, padding='post')

# for validation dataset
Project: recurrent-attention-for-QA-SQUAD-based-on-keras | Author: wentaozhu | Project source | File source
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective indices and pad contexts to the max context length and questions to the max question length.
       Answer vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]

        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')
Project: recurrent-attention-for-QA-SQUAD-based-on-keras | Author: wentaozhu | Project source | File source
def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective indices and pad contexts to the max context length and questions to the max question length.
       Answer vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]
        # map the first and last words of answer span to one-hot representations
        y_Begin =  np.zeros(len(xContext[i]))
        y_Begin[xAnswerBeing[i]] = 1
        y_End = np.zeros(len(xContext[i]))
        y_End[xAnswerEnd[i]] = 1
        X.append(x)
        Xq.append(xq)
        YBegin.append(y_Begin)
        YEnd.append(y_End)
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post'), pad_sequences(YBegin, maxlen=context_maxlen, padding='post'), pad_sequences(YEnd, maxlen=context_maxlen, padding='post')

# Note: Need to download and unzip the GloVe pre-trained model files into the same directory as this script
Project: recurrent-attention-for-QA-SQUAD-based-on-keras | Author: wentaozhu | Project source | File source
def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective indices and pad contexts to the max context length and questions to the max question length.
       Answer vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]
        # map the first and last words of answer span to one-hot representations
        y_Begin =  np.zeros(len(xContext[i]))
        y_Begin[xAnswerBeing[i]] = 1
        y_End = np.zeros(len(xContext[i]))
        y_End[xAnswerEnd[i]] = 1
        X.append(x)
        Xq.append(xq)
        YBegin.append(y_Begin)
        YEnd.append(y_End)
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post'), pad_sequences(YBegin, maxlen=context_maxlen, padding='post'), pad_sequences(YEnd, maxlen=context_maxlen, padding='post')

# for validation dataset
Project: recurrent-attention-for-QA-SQUAD-based-on-keras | Author: wentaozhu | Project source | File source
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective indices and pad contexts to the max context length and questions to the max question length.
       Answer vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]

        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')
Project: recurrent-attention-for-QA-SQUAD-based-on-keras | Author: wentaozhu | Project source | File source
def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective indices and pad contexts to the max context length and questions to the max question length.
       Answer vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]
        # map the first and last words of answer span to one-hot representations
        y_Begin =  np.zeros(len(xContext[i]))
        y_Begin[xAnswerBeing[i]] = 1
        y_End = np.zeros(len(xContext[i]))
        y_End[xAnswerEnd[i]] = 1
        X.append(x)
        Xq.append(xq)
        YBegin.append(y_Begin)
        YEnd.append(y_End)
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post'), pad_sequences(YBegin, maxlen=context_maxlen, padding='post'), pad_sequences(YEnd, maxlen=context_maxlen, padding='post')

# for validation dataset
Project: recurrent-attention-for-QA-SQUAD-based-on-keras | Author: wentaozhu | Project source | File source
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective indices and pad contexts to the max context length and questions to the max question length.
       Answer vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]

        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')
Project: recurrent-attention-for-QA-SQUAD-based-on-keras | Author: wentaozhu | Project source | File source
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    '''Vectorize the words to their respective indices and pad contexts to the max context length and questions to the max question length.
       Answer vectors are padded to the max context length as well.
    '''
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in xrange(len(xContext)):
        x = [word_index[w] for w in xContext[i]]
        xq = [word_index[w] for w in xQuestion[i]]

        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')
Project: BiMPM_keras | Author: ijinmao | Project source | File source
def get_word_seq(train_ori1, train_ori2, test_ori1, test_ori2):
    # fit tokenizer
    tk = Tokenizer(num_words=TrainConfig.MAX_NB_WORDS)
    tk.fit_on_texts(train_ori1 + train_ori2 + test_ori1 + test_ori2)
    word_index = tk.word_index

    # q1, q2 training text sequence
    # (sentence_len, MAX_SEQUENCE_LENGTH)
    train_x1 = tk.texts_to_sequences(train_ori1)
    train_x1 = pad_sequences(train_x1, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)
    train_x2 = tk.texts_to_sequences(train_ori2)
    train_x2 = pad_sequences(train_x2, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)

    # q1, q2 testing text sequence
    test_x1 = tk.texts_to_sequences(test_ori1)
    test_x1 = pad_sequences(test_x1, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)
    test_x2 = tk.texts_to_sequences(test_ori2)
    test_x2 = pad_sequences(test_x2, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)

    np.save(open(DirConfig.Q1_CACHE_TRAIN, 'wb'), train_x1)
    np.save(open(DirConfig.Q2_CACHE_TRAIN, 'wb'), train_x2)
    np.save(open(DirConfig.Q1_CACHE_TEST, 'wb'), test_x1)
    np.save(open(DirConfig.Q2_CACHE_TEST, 'wb'), test_x2)
    np.save(open(DirConfig.WORD_INDEX_CACHE, 'wb'), word_index)
    return train_x1, train_x2, test_x1, test_x2, word_index
Project: BiMPM_keras | Author: ijinmao | Project source | File source
def words_to_char_sequence(words_list, tk):
    """Convert words list to chars sequence

    # Arguments
        words: word list, (sentence_len, word_len)

    # Output shape
        (sentence_len, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)
    """
    c_seqs = np.zeros((len(words_list),
                       TrainConfig.MAX_SEQUENCE_LENGTH,
                       TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
    for w_i in xrange(len(words_list)):
        words = words_list[w_i]
        fixed_ws = np.zeros((TrainConfig.MAX_SEQUENCE_LENGTH,
                             TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
        ws = tk.texts_to_sequences(words)
        ws = pad_sequences(ws, maxlen=TrainConfig.MAX_CHAR_PER_WORD)
        if TrainConfig.MAX_SEQUENCE_LENGTH < len(words):
            max_word_len = TrainConfig.MAX_SEQUENCE_LENGTH
        else:
            max_word_len = len(words)
        fixed_ws[:max_word_len, :] = ws[:max_word_len, :]
        c_seqs[w_i] = fixed_ws
    return c_seqs
Project: keras-image-captioning | Author: danieljl | Project source | File source
def preprocess_batch(self, captions_label_encoded):
        captions = keras_seq.pad_sequences(captions_label_encoded,
                                           padding='post')
        # The model produces maxlen(captions) + 1 timesteps/words,
        # because the first "word" is the image.
        captions_extended1 = keras_seq.pad_sequences(captions,
                                                maxlen=captions.shape[-1] + 1,
                                                padding='post')
        captions_one_hot = map(self._tokenizer.sequences_to_matrix,
                               np.expand_dims(captions_extended1, -1))
        captions_one_hot = np.array(captions_one_hot, dtype='int')

        # Decrease/shift word index by 1.
        # Shifting `captions_one_hot` makes the padding word
        # (index=0, encoded=[1, 0, ...]) encoded all zeros ([0, 0, ...]),
        # so its cross entropy loss will be zero.
        captions_decreased = captions.copy()
        captions_decreased[captions_decreased > 0] -= 1
        captions_one_hot_shifted = captions_one_hot[:, :, 1:]

        captions_input = captions_decreased
        captions_output = captions_one_hot_shifted
        return captions_input, captions_output
Project: genre-erkennung-pipeline | Author: amirothman | Project source | File source
def build_vectors(keyword="",data_label="",lower_limit=None,upper_limit=None,folder_path="dataset"):
    # training
    training_vector,labels,maxlen_training = create_dataset(dataset_path = folder_path+"/train",keyword=keyword,lower_limit=lower_limit,upper_limit=upper_limit)

    # validation
    evaluation_training_vector,evaluation_labels,maxlen_evaluation = create_dataset(dataset_path = "{0}/test".format(folder_path),keyword=keyword,lower_limit=lower_limit,upper_limit=upper_limit)

    # # X_training
    training_vector = sequence.pad_sequences(training_vector, maxlen=np.max([maxlen_training,maxlen_evaluation]),dtype='float32')
    pickle.dump(training_vector,open("pickled_vectors/{1}{0}_training_vector.pickle".format(keyword,data_label),"wb"))
    #
    # # y
    #
    pickle.dump(labels,open("pickled_vectors/{1}{0}_label.pickle".format(keyword,data_label),"wb"))
    #
    #
    # # evaluation
    evaluation_training_vector = sequence.pad_sequences(evaluation_training_vector, maxlen=np.max([maxlen_training,maxlen_evaluation]),dtype='float32')
    pickle.dump(evaluation_training_vector,open("pickled_vectors/{1}{0}_evaluation_training_vector.pickle".format(keyword,data_label),"wb"))
    #
    # # evaluation
    pickle.dump(evaluation_labels,open("pickled_vectors/{1}{0}_evaluation_label.pickle".format(keyword,data_label),"wb"))
    with(open("maxlen_{0}".format(keyword),"w")) as _f:
        _f.write(str(np.max([maxlen_training,maxlen_evaluation])))
Project: keras | Author: GeekLiB | Project source | File source
def test_pad_sequences():
    a = [[1], [1, 2], [1, 2, 3]]

    # test padding
    b = pad_sequences(a, maxlen=3, padding='pre')
    assert_allclose(b, [[0, 0, 1], [0, 1, 2], [1, 2, 3]])
    b = pad_sequences(a, maxlen=3, padding='post')
    assert_allclose(b, [[1, 0, 0], [1, 2, 0], [1, 2, 3]])

    # test truncating
    b = pad_sequences(a, maxlen=2, truncating='pre')
    assert_allclose(b, [[0, 1], [1, 2], [2, 3]])
    b = pad_sequences(a, maxlen=2, truncating='post')
    assert_allclose(b, [[0, 1], [1, 2], [1, 2]])

    # test value
    b = pad_sequences(a, maxlen=3, value=1)
    assert_allclose(b, [[1, 1, 1], [1, 1, 2], [1, 2, 3]])
Project: text-classification-with-convnets | Author: osmanbaskaya | Project source | File source
def testset_read(fn, word_idx, maxlen):
    total_num_of_unk = 0
    tokenizer = TreebankWordTokenizer()
    try:
        lines = codecs.open(fn, encoding='utf8').read().splitlines()
    except UnicodeDecodeError:
        lines = codecs.open(fn).read().splitlines()
    X = []
    sentences = []
    for line in lines:
        s = []
        for token in tokenizer.tokenize(line):
            idx = word_idx.get(token, 1)  # 1 is UNKNOWN word id
            if idx == 1:
                total_num_of_unk += 1
            s.append(idx)
        X.append(s)
        sentences.append(line)

    X = sequence.pad_sequences(X, maxlen=maxlen)

    print >> sys.stderr, "Total number of UNK={}, Avg. {}".format(total_num_of_unk, total_num_of_unk / float(len(sentences)))
    return X, sentences
Project: keras-contrib | Author: farizrahman4u | Project source | File source
def _process_data(data, vocab, pos_tags, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]  # set to <unk> (index 1) if not in vocab

    y_pos = [[pos_tags.index(w[1]) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[2]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding

    y_pos = pad_sequences(y_pos, maxlen, value=-1)  # left padded with -1. Indeed, any integer works as it will be masked
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        y_pos = numpy.eye(len(pos_tags), dtype='float32')[y_pos]
        y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        y_pos = numpy.expand_dims(y_pos, 2)
        y_chunk = numpy.expand_dims(y_chunk, 2)
    return x, y_pos, y_chunk
Project: quora_duplicate | Author: ijinmao | Project source | File source
def get_word_seq(train_ori1, train_ori2, test_ori1, test_ori2):
    # fit tokenizer
    tk = Tokenizer(num_words=TrainConfig.MAX_NB_WORDS)
    tk.fit_on_texts(train_ori1 + train_ori2 + test_ori1 + test_ori2)
    word_index = tk.word_index

    # q1, q2 training text sequence
    # (sentence_len, MAX_SEQUENCE_LENGTH)
    train_x1 = tk.texts_to_sequences(train_ori1)
    train_x1 = pad_sequences(train_x1, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)
    train_x2 = tk.texts_to_sequences(train_ori2)
    train_x2 = pad_sequences(train_x2, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)

    # q1, q2 testing text sequence
    test_x1 = tk.texts_to_sequences(test_ori1)
    test_x1 = pad_sequences(test_x1, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)
    test_x2 = tk.texts_to_sequences(test_ori2)
    test_x2 = pad_sequences(test_x2, maxlen=TrainConfig.MAX_SEQUENCE_LENGTH)

    np.save(open(DirConfig.Q1_CACHE_TRAIN, 'wb'), train_x1)
    np.save(open(DirConfig.Q2_CACHE_TRAIN, 'wb'), train_x2)
    np.save(open(DirConfig.Q1_CACHE_TEST, 'wb'), test_x1)
    np.save(open(DirConfig.Q2_CACHE_TEST, 'wb'), test_x2)
    np.save(open(DirConfig.WORD_INDEX_CACHE, 'wb'), word_index)
    return train_x1, train_x2, test_x1, test_x2, word_index
Project: quora_duplicate | Author: ijinmao | Project source | File source
def words_to_char_sequence(words_list, tk):
    """Convert words list to chars sequence

    # Arguments
        words: word list, (sentence_len, word_len)

    # Output shape
        (sentence_len, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)
    """
    c_seqs = np.zeros((len(words_list),
                       TrainConfig.MAX_SEQUENCE_LENGTH,
                       TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
    for w_i in xrange(len(words_list)):
        words = words_list[w_i]
        fixed_ws = np.zeros((TrainConfig.MAX_SEQUENCE_LENGTH,
                             TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
        ws = tk.texts_to_sequences(words)
        ws = pad_sequences(ws, maxlen=TrainConfig.MAX_CHAR_PER_WORD)
        if TrainConfig.MAX_SEQUENCE_LENGTH < len(words):
            max_word_len = TrainConfig.MAX_SEQUENCE_LENGTH
        else:
            max_word_len = len(words)
        fixed_ws[:max_word_len, :] = ws[:max_word_len, :]
        c_seqs[w_i] = fixed_ws
    return c_seqs
Project: attention-sum-reader | Author: cairoHy | Project source | File source
def preprocess_input_sequences(self, data, shuffle=True):
        """
        Preprocess the input sequences:
        optionally shuffle the data,
        PAD/TRUNC the documents, questions and candidates to fixed lengths,
        and build y_true as a one-hot vector of length self.A_len whose index 0 marks the correct answer.
        """
        documents, questions, answer, candidates = self.union_shuffle(data) if shuffle else data
        d_lens = [len(i) for i in documents]

        questions_ok = pad_sequences(questions, maxlen=self.q_len, dtype="int32", padding="post", truncating="post")
        documents_ok = pad_sequences(documents, maxlen=self.d_len, dtype="int32", padding="post", truncating="post")
        context_mask = K.eval(tf.sequence_mask(d_lens, self.d_len, dtype=tf.float32))
        candidates_ok = pad_sequences(candidates, maxlen=self.A_len, dtype="int32", padding="post", truncating="post")
        y_true = np.zeros_like(candidates_ok)
        y_true[:, 0] = 1
        return questions_ok, documents_ok, context_mask, candidates_ok, y_true
Project: attention-sum-reader | Author: cairoHy | Project source | File source
def preprocess_input_sequences(self, data, shuffle=True):
        """
        Preprocess the input sequences:
        optionally shuffle the data,
        PAD/TRUNC the documents, questions and candidates to fixed lengths,
        and build y_true as a one-hot vector of length self.A_len whose index 0 marks the correct answer.
        """
        documents, questions, answer, candidates = self.union_shuffle(data) if shuffle else data
        d_lens = [len(i) for i in documents]

        questions_ok = pad_sequences(questions, maxlen=self.q_len, dtype="int32", padding="post", truncating="post")
        documents_ok = pad_sequences(documents, maxlen=self.d_len, dtype="int32", padding="post", truncating="post")
        context_mask = K.eval(tf.sequence_mask(d_lens, self.d_len, dtype=tf.float32))
        candidates_ok = pad_sequences(candidates, maxlen=self.A_len, dtype="int32", padding="post", truncating="post")
        y_true = np.zeros_like(candidates_ok)
        y_true[:, 0] = 1
        return questions_ok, documents_ok, context_mask, candidates_ok, y_true
Project: nli_generation | Author: jstarc | Project source | File source
def prepare_split_vec_dataset(dataset, word_index, padding = True, prem_len = None, hypo_len = None):
    P = []
    H = []
    y = []
    for example in dataset:
        if example[2] == '-':
            continue

        P.append(load_word_indices(example[0], word_index))   
        H.append(load_word_indices(example[1], word_index))
        y.append(LABEL_LIST.index(example[2]))

    one_hot_y = np.zeros((len(y), len(LABEL_LIST)))
    one_hot_y[np.arange(len(y)), y] = 1
    if padding:
        P = pad_sequences(P, prem_len, padding='pre')
        H = pad_sequences(H, hypo_len, padding='post')
    return np.array(P), np.array(H), one_hot_y
Project: Neural-Chatbot | Author: saurabhmathur96 | Project source | File source
def next_batch(self):
        inverse_vocabulary = self.inverse_vocabulary
        if self.stream:
            q = [[inverse_vocabulary[word] for word in next(self.questions).strip().split() ] for i in range(self.batch_size)]
            a = [[inverse_vocabulary[word] for word in next(self.answers).strip().split() ] for i in range(self.batch_size)]
        else:
            n_example = len(self.answers)
            indices = random.randint(0, n_example, size=(self.batch_size))
            q = [[inverse_vocabulary[word] for word in self.questions[i].split()] for i in indices]
            a = [[inverse_vocabulary[word] for word in self.answers[i].split()] for i in indices]

        X = pad_sequences(q, maxlen=self.sequence_length)
        y = pad_sequences(a, maxlen=self.sequence_length)

        if self.one_hot_target:
            return (X, self.to_one_hot(y))
        else:
            return (X, y)
Project: reuters-docsim | Author: sujitpal | Project source | File source
def generate_sentence_batch(sents, word2id, max_seqlen, batch_size):
    while True:
        # loop once per epoch
        # shuffle the input
        indices = np.random.permutation(np.arange(len(sents)))
        shuffled_sents = [sents[ix] for ix in indices]
        # convert to list of list of word id        
        sent_wids = [[word2id[word] for word in sent.split()]
                                    for sent in shuffled_sents]
        num_batches = len(shuffled_sents) // batch_size
        for bid in range(num_batches):
            # loop once per batch
            sents_batch = sent_wids[bid * batch_size : (bid + 1) * batch_size]
            sents_batch_padded = sequence.pad_sequences(sents_batch, max_seqlen)
            yield sents_batch_padded, sents_batch_padded            


############################ main ###############################
Project: keras_text_classifier | Author: cdj0311 | Project source | File source
def test(self, sentence, model, words):
        """
        test only a sentence
        :param sentence: a sentence, if ischar==False, the sentence should be segmented
        :param model: cnn model
        :param words: words list
        :return:
        """
        if self.ischar is True:
            sentence = list(sentence)
        else:
            sentence = sentence.split()
        x_test = [[words[w] for w in sentence if w in words]]
        x_test = sequence.pad_sequences(x_test, maxlen=self.maxlen)
        pred_y = model.predict(x_test)
        return pred_y
Project: keras-customized | Author: ambrite | Project source | File source
def test_pad_sequences():
    a = [[1], [1, 2], [1, 2, 3]]

    # test padding
    b = pad_sequences(a, maxlen=3, padding='pre')
    assert_allclose(b, [[0, 0, 1], [0, 1, 2], [1, 2, 3]])
    b = pad_sequences(a, maxlen=3, padding='post')
    assert_allclose(b, [[1, 0, 0], [1, 2, 0], [1, 2, 3]])

    # test truncating
    b = pad_sequences(a, maxlen=2, truncating='pre')
    assert_allclose(b, [[0, 1], [1, 2], [2, 3]])
    b = pad_sequences(a, maxlen=2, truncating='post')
    assert_allclose(b, [[0, 1], [1, 2], [1, 2]])

    # test value
    b = pad_sequences(a, maxlen=3, value=1)
    assert_allclose(b, [[1, 1, 1], [1, 1, 2], [1, 2, 3]])
Project: ContextMF | Author: lzheng21 | Project source | File source
def train(self, X_train, V, seed):
        X_train = sequence.pad_sequences(X_train, maxlen=self.max_len)
        np.random.seed(seed)
        X_train = np.random.permutation(X_train)
        np.random.seed(seed)
        V = np.random.permutation(V)

        print("Train...CNN module")
        #history = self.model.fit({'input': X_train, 'output': V},
        #                         verbose=0, batch_size=self.batch_size, nb_epoch=self.nb_epoch, shuffle=True, validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=0)])
        history = self.model.fit(X_train,y=V,batch_size=self.batch_size,nb_epoch=self.nb_epoch, shuffle=True, validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=0)])

        cnn_loss_his = history.history['loss']
        cmp_cnn_loss = sorted(cnn_loss_his)[::-1]
        if cnn_loss_his != cmp_cnn_loss:
            self.nb_epoch = 1
        return history
Project: AI-Chatbot | Author: anujdutt9 | Project source | File source
def vectorize_ques(data, word_id, test_max_length, ques_max_length):
    X = []
    Xq = []
    for subtext, question in data:
        x = [word_id[w] for w in subtext]
        xq = [word_id[w] for w in question]
        # let's not forget that index 0 is reserved
        X.append(x)
        Xq.append(xq)
    return (pad_sequences(X, maxlen=test_max_length),
            pad_sequences(Xq, maxlen=ques_max_length))


# Vectorize the text
# Convert Subtext, Questions, Answers to Vector Form
# Y: array of zeros with a "1" at the position of the word representing the correct answer
Project: AI-Chatbot | Author: anujdutt9 | Project source | File source
def vectorize_text(data, word_id, text_max_length, ques_max_length):
    X = []
    Xq = []
    Y = []
    for subtext, question, answer in data:
        x = [word_id[w] for w in subtext]
        # Save the ID of Questions using SubText
        xq = [word_id[w] for w in question]
        # Save the answers for the Questions in "Y" as "1"
        y = np.zeros(len(word_id) + 1)
        y[word_id[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    return (pad_sequences(X, maxlen=text_max_length),
            pad_sequences(Xq, maxlen=ques_max_length),
            np.array(Y))


# Read the text files
Project: rnn_agreement | Author: TalLinzen | Project source | File source
def create_train_and_test(self, examples):
        d = [[], []]
        for i, s, dep in examples:
            d[i].append((i, s, dep))
        random.seed(1)
        random.shuffle(d[0])
        random.shuffle(d[1])
        if self.equalize_classes:
            l = min(len(d[0]), len(d[1]))
            examples = d[0][:l] + d[1][:l]
        else:
            examples = d[0] + d[1]
        random.shuffle(examples)

        Y, X, deps = zip(*examples)
        Y = np.asarray(Y)
        X = sequence.pad_sequences(X, maxlen=self.maxlen)
        n_train = int(self.prop_train * len(X))
        self.X_train, self.Y_train = X[:n_train], Y[:n_train]
        self.X_test, self.Y_test = X[n_train:], Y[n_train:]
        self.deps_train = deps[:n_train]
        self.deps_test = deps[n_train:]
Project: dsr16_nlp | Author: honnibal | Project source | File source
def __init__(self, widths, vocab_size=5000):
        from keras.models import Sequential
        from keras.layers import Embedding, Dense, TimeDistributedMerge
        from keras.layers.advanced_activations import ELU
        from keras.preprocessing.sequence import pad_sequences
        from keras.optimizers import SGD
        self.n_classes = widths[-1]
        self.vocab_size = vocab_size
        self.word_to_int = {}
        self.int_to_word = np.ndarray(shape=(vocab_size+1,), dtype='int64')
        self.model = Sequential()
        self.model.add(Embedding(vocab_size, widths[0]))
        self.model.add(TimeDistributedMerge(mode='ave'))
        for width in widths[1:-1]:
            layer = Dense(output_dim=width, init='he_normal', activation=ELU(1.0))
            self.model.add(layer)
        self.model.add(
            Dense(
                self.n_classes,
                init='zero',
                activation='softmax'))
        sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
        self.model.compile(loss='categorical_crossentropy', optimizer=sgd)
Project: mtl | Author: zhenhongChen | Project source | File source
def subj_run(index_embedding, dataset, num_words=5000, embedding_len=100, max_len=50):

    (x_train, y_train), (x_test, y_test) = ds.load_data(dataset, num_words)
    x_train = sequence.pad_sequences(x_train, maxlen=max_len)
    x_test = sequence.pad_sequences(x_test, maxlen=max_len)

    model = Sequential()
    model.add(Embedding(num_words, embedding_len, input_length=max_len, weights=[index_embedding]))
    model.add(LSTM(max_len, dropout=0.5, recurrent_dropout=0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])

    print(model.summary())
    model.fit(x_train, y_train, epochs=4, batch_size=50, verbose=2)
    score, acc = model.evaluate(x_test, y_test, verbose=0)

    print('Test score:', score)
    print('Test accuracy:', acc)
Project: mtl | Author: zhenhongChen | Project source | File source
def imdb_run(index_embedding, dataset, num_words=5000, embedding_len=100, max_len=500):

    (x_train, y_train), (x_test, y_test) = ds.load_data(dataset, num_words)
    x_train = sequence.pad_sequences(x_train, maxlen=max_len)
    x_test = sequence.pad_sequences(x_test, maxlen=max_len)

    model = Sequential()
    model.add(Embedding(num_words, embedding_len, input_length=max_len, weights=[index_embedding]))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])

    print(model.summary())
    model.fit(x_train, y_train, epochs=3, batch_size=64, verbose=2)
    score, acc = model.evaluate(x_test, y_test, verbose=0)

    print('Test score:', score)
    print('Test accuracy:', acc)
Project: NN_sentiment | Author: hx364 | Project source | File source
def fit(self, X_train, y_train, X_test, y_test,
            batch_size=100, nb_epoch=3, show_accuracy=True):
        """

        :param X_train: each instance is a list of word indices
        :param y_train:
        :return:
        """
        print(len(X_train), 'train sequences')
        print(len(X_test), 'test sequences')
        print("Pad sequences (samples x time)")
        X_train = sequence.pad_sequences(X_train, maxlen=self.maxlen)
        X_test = sequence.pad_sequences(X_test, maxlen=self.maxlen)
        print('X_train shape:', X_train.shape)
        print('X_test shape:', X_test.shape)
        y_train = expand_label(y_train)
        y_test = expand_label(y_test)

        self.model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch,
          show_accuracy=True, validation_data=(X_test, y_test))
Project: NN_sentiment | Author: hx364 | Project source | File source
def fit(self, X_train, y_train, X_test, y_test,
            batch_size=50, nb_epoch=3):
        """

        :param X_train: each instance is a list of word indices
        :param y_train:
        :return:
        """
        print(len(X_train), 'train sequences')
        print(len(X_test), 'test sequences')
        print("Pad sequences (samples x time)")
        X_train = sequence.pad_sequences(X_train, maxlen=self.maxlen)
        X_test = sequence.pad_sequences(X_test, maxlen=self.maxlen)
        print('X_train shape:', X_train.shape)
        print('X_test shape:', X_test.shape)
        y_train = expand_label(y_train)
        y_test = expand_label(y_test)

        #early stopping
        early_stop = EarlyStopping(monitor='val_loss', patience=2)

        self.model.fit({'input': X_train, 'output': y_train}, batch_size=batch_size, nb_epoch=nb_epoch,
          verbose=1, validation_data=({'input': X_test, 'output': y_test}), callbacks=[early_stop])
Project: NeuralNetwork-ImageQA | Author: ayushoriginal | Project source | File source
def get_questions_matrix(split):
    if split == 'train':
        data_path = 'data/train_qa'
    elif split == 'val':
        data_path = 'data/val_qa'
    else:
        print('Invalid split!')
        sys.exit()

    df = pd.read_pickle(data_path)
    questions = df[['question']].values.tolist()
    word_idx = ebd.load_idx()
    seq_list = []

    for question in questions:
        words = word_tokenize(question[0])
        seq = []
        for word in words:
            seq.append(word_idx.get(word,0))
        seq_list.append(seq)
    question_matrix = pad_sequences(seq_list)

    return question_matrix
Project: keras | Author: NVIDIA | Project source | File source
def test_pad_sequences():
    a = [[1], [1, 2], [1, 2, 3]]

    # test padding
    b = pad_sequences(a, maxlen=3, padding='pre')
    assert_allclose(b, [[0, 0, 1], [0, 1, 2], [1, 2, 3]])
    b = pad_sequences(a, maxlen=3, padding='post')
    assert_allclose(b, [[1, 0, 0], [1, 2, 0], [1, 2, 3]])

    # test truncating
    b = pad_sequences(a, maxlen=2, truncating='pre')
    assert_allclose(b, [[0, 1], [1, 2], [2, 3]])
    b = pad_sequences(a, maxlen=2, truncating='post')
    assert_allclose(b, [[0, 1], [1, 2], [1, 2]])

    # test value
    b = pad_sequences(a, maxlen=3, value=1)
    assert_allclose(b, [[1, 1, 1], [1, 1, 2], [1, 2, 3]])
Project: Question-Answering-NNs | Author: nbogdan | Project source | File source
def loadTestData(folderName):
    data_train = pd.read_csv(folderName + 'data/test_datum.txt', sep='\t', error_bad_lines=False)
    labels = []
    for idx in range(data_train.question.shape[0]):
        labels.append(data_train.value[idx])
    texts_c3 = pickle.load(open(folderName + 'test_lemmas_c', 'rb'))
    texts_q3 = pickle.load(open(folderName + 'test_lemmas_q', 'rb'))
    texts_a3 = pickle.load(open(folderName + 'test_lemmas_a', 'rb'))
    tokenizer = pickle.load(open(folderName + 'structures/tokenizer', 'rb'))
    sequences_q = tokenizer.texts_to_sequences(texts_q3)
    sequences_a = tokenizer.texts_to_sequences(texts_a3)
    sequences_c = tokenizer.texts_to_sequences(texts_c3)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data_q = pad_sequences(sequences_q, maxlen=MAX_SEQUENCE_LENGTH_Q)
    data_a = pad_sequences(sequences_a, maxlen=MAX_SEQUENCE_LENGTH_A)
    data_c = pad_sequences(sequences_c, maxlen=MAX_SEQUENCE_LENGTH_C)

    labels = to_categorical(np.asarray(labels))
    print('Shape of label tensor:', labels.shape)

    return [data_c, data_q, data_a, labels, data_train]
Project: deeplearning_keras | Author: gazzola | Project source | File source
def build_tensor(filename, numrecs, word2index, maxlen, 
                 make_categorical=False):
    data = np.empty((numrecs, ), dtype=list)
    fin = open(filename, "rb")
    i = 0
    for line in fin:
        wids = []
        for word in line.strip().split():
            if word in word2index:
                wids.append(word2index[word])
            else:
                wids.append(word2index["UNK"])
        if make_categorical:
            data[i] = np_utils.to_categorical(
                wids, num_classes=len(word2index))
        else:
            data[i] = wids
        i += 1
    fin.close()
    pdata = sequence.pad_sequences(data, maxlen=maxlen)
    return pdata
Project: deeplearning_keras | Author: gazzola | Project source | File source
def generate_batch(s_sents, s_word2index, t_sents, t_word2index, 
                   batch_size, maxlen):
    while True:
        # shuffle the input
        indices = np.random.permutation(np.arange(len(s_sents)))
        ss_sents = [s_sents[ix] for ix in indices]
        ts_sents = [t_sents[ix] for ix in indices]
        # convert to word indices
        si_sents = [[get_or_else(s_word2index, word, s_word2index["UNK"]) 
                    for word in sent] 
                    for sent in ss_sents]
        ti_sents = [[t_word2index[word] for word in sent]
                    for sent in ts_sents]
        # inner loop should run for an epoch
        num_batches = len(s_sents) // batch_size
        for i in range(num_batches):
            s_batch = si_sents[i * batch_size : (i + 1) * batch_size]
            t_batch = ti_sents[i * batch_size : (i + 1) * batch_size]
            sp_batch = sequence.pad_sequences(s_batch, maxlen=maxlen)
            tp_batch = sequence.pad_sequences(t_batch, maxlen=maxlen)
            tpc_batch = np_utils.to_categorical(tp_batch.reshape(-1, 1), 
                num_classes=len(t_word2index)).reshape(batch_size, 
                -1, len(t_word2index))
            yield sp_batch, tpc_batch