Python nltk.tokenize module: word_tokenize() example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use nltk.tokenize.word_tokenize().
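
Before the project examples, here is a minimal standalone sketch of basic word_tokenize() usage for orientation. The nltk.download('punkt') call and the sample sentence are illustrative assumptions, not taken from any of the projects below; on a fresh NLTK install the Punkt models need to be downloaded once before tokenizing.

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# One-time download of the Punkt tokenizer models
# (assumed to be missing on a fresh install).
nltk.download('punkt')

text = "Good muffins cost $3.88 in New York. Please buy me two of them."
for sentence in sent_tokenize(text):
    print(word_tokenize(sentence))
# ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']
# ['Please', 'buy', 'me', 'two', 'of', 'them', '.']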

Project: QProb    Author: quant-trade
def keyword_extractor(data):
    try:
        #np_extractor = NPExtractor(words_wo_stopwords(strip_tags(data)))
        #result = np_extractor.extract()
        text = words_wo_stopwords(strip_tags(data))

        # TODO: this duplicates work already done above; should be improved
        words = word_tokenize(strip_tags(text))
        tagged = pos_tag(words)
        cleaned = filter_insignificant(tagged)
        text = " ".join(cleaned)
        wc = WordCloudMod().generate(text)
        result = list(wc.keys())[:10]
    except Exception as err:
        print(colored.red("At keywords extraction {}".format(err)))
        result = []

    return result


# TODO: this could definitely be better if we knew where the content is
Project: deeppavlov    Author: deepmipt
def create_batch(self, sentence_li):
        """Create a batch for a list of sentences."""

        embeddings_batch = []
        for sen in sentence_li:
            embeddings = []
            sent_toks = sent_tokenize(sen)
            word_toks = [word_tokenize(el) for el in sent_toks]
            tokens = [val for sublist in word_toks for val in sublist]
            tokens = [el for el in tokens if el != '']
            for tok in tokens:
                embeddings.append(self.embdict.tok2emb.get(tok))
            if len(tokens) < self.max_sequence_length:
                pads = [np.zeros(self.embedding_dim) for _ in range(self.max_sequence_length - len(tokens))]
                embeddings = pads + embeddings
            else:
                embeddings = embeddings[-self.max_sequence_length:]
            embeddings = np.asarray(embeddings)
            embeddings_batch.append(embeddings)
        embeddings_batch = np.asarray(embeddings_batch)
        return embeddings_batch
Project: Python-Scripts-Repo-on-Data-Science    Author: qalhata
def extractFeatures(self, article, n, customStopWords=None):
        # pass in article as a tuple ( text, title)
        text = article[0]
        # extract the text
        title = article[1]
        # extract the title
        sentences = sent_tokenize(text)
        # split text into sentences
        word_sent = [word_tokenize(a.lower()) for a in sentences]
        # split sentences into words
        self._freq = self._compute_frequencies(word_sent, customStopWords)
        # calculate word freq using member func created above
        if n < 0:
            # how many features (words) to return - a negative number means
            # no feature (word) selection, just return all features
            return nlargest(len(self._freq),
                            self._freq, key=self._freq.get)
        else:
            # if the calling function has asked for a subset,
            # then return only the 'n' largest features, i.e. the
            # most important words (important == frequent, minus stopwords)
            return nlargest(n, self._freq, key=self._freq.get)
Project: Python-Scripts-Repo-on-Data-Science    Author: qalhata
def summarize(self, article, n):
        text = article[0]
        title = article[1]
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i, sentence in enumerate(word_sent):
            for word in sentence:
                if word in self._freq:
                    ranking[i] += self._freq[word]
        sentences_index = nlargest(n, ranking, key=ranking.get)
        return [sentences[j] for j in sentences_index]

##############################################################################
# TEST
Project: stock-eagle    Author: mtusman
def similarity(c1, c2):
    '''Stop words are words like "it" and "the" that carry little
    meaning and have no real impact on the sentence.'''
    stop_words = list(stopwords.words("english"))
    # Removes stop words in both sentences
    c1_cleaned = [x for x in word_tokenize(c1) if x not in stop_words]
    c2_cleaned = [x for x in word_tokenize(c2) if x not in stop_words]
    c1_words = Counter(dedupe(c1_cleaned))
    c2_words = Counter(dedupe(c2_cleaned))
    total_words = c1_words + c2_words
    similarity_between_words = 0
    for key, val in total_words.items():
        ''' Looks at whether the two articles share a word'''
        if total_words[key] > 1:
            similarity_between_words += 1

    return similarity_between_words / (log(len(c1_words)) + log(len(c2_words)))
Project: punctuator2    Author: ottokart
def process_line(line):

    tokens = word_tokenize(line)
    output_tokens = []

    for token in tokens:

        if token in INS_PUNCTS:
            output_tokens.append(INS_PUNCTS[token])
        elif token in EOS_PUNCTS:
            output_tokens.append(EOS_PUNCTS[token])
        elif is_number(token):
            output_tokens.append(NUM)
        else:
            output_tokens.append(token.lower())

    return untokenize(" ".join(output_tokens) + " ")
Project: MachineComprehension    Author: sa-j
def check_sent(s):
    count = 0
    for r in s:
        # words = word_tokenize(r)
        # for w in words:
        for w in r:
            if type(w) != str:
                print(w)
                count += 1
                continue
            if w in inv_words or w in oov_words_in_train:
                continue
            if w not in word2vec:
                count += 1
                oov_words_in_train.add(w)
            else:
                inv_words[w] = word2vec.vocab[w].index
    return count
Project: vqa.pytorch    Author: Cadene
def preprocess_questions(examples, nlp='nltk'):
    if nlp == 'nltk':
        from nltk.tokenize import word_tokenize
    print('Example of generated tokens after preprocessing some questions:')
    for i, ex in enumerate(examples):
        s = ex['question']
        if nlp == 'nltk':
            ex['question_words'] = word_tokenize(str(s).lower())
        elif nlp == 'mcb':
            ex['question_words'] = tokenize_mcb(s)
        else:
            ex['question_words'] = tokenize(s)
        if i < 10:
            print(ex['question_words'])
        if i % 1000 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done)   \r" %  (i, len(examples), i*100.0/len(examples)) )
            sys.stdout.flush() 
    return examples
Project: delbot    Author: shaildeliwala
def summarize(self, text, n):
        """
          Return a list of n sentences
          which represent the summary of text.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents)
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i,sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)
        return [sents[j] for j in sents_idx]
Project: Hotpot    Author: Liang-Qiu
def load_jacana(fname, regexen):
    samples = []
    with open(fname, 'rt') as inp:
        for line in inp:
            line = line.strip()
            if line.startswith('<Q> '):
                qorig = line[len('<Q> '):]
                q = word_tokenize(qorig)
            else:
                l = line.split(' ')
                label = int(l[0])
                kwweight = float(l[1])
                aboutkwweight = float(l[2])
                text = word_tokenize(' '.join(l[3:]))
                toklabels = regex_overlap(text, regexen[qorig])
                samples.append({'qtext': ' '.join(q), 'label': label,
                                'atext': ' '.join(text),
                                'kwweight': kwweight, 'aboutkwweight': aboutkwweight,
                                'toklabels': ' '.join([str(0+tl) for tl in toklabels])})
    return samples
Project: Hotpot    Author: Liang-Qiu
def load_sts(dsfile, skip_unlabeled=True):
    """ load a dataset in the sts tsv format """
    s0 = []
    s1 = []
    labels = []
    with codecs.open(dsfile, encoding='utf8') as f:
        for line in f:
            line = line.rstrip()
            label, s0x, s1x = line.split('\t')
            if label == '':
                if skip_unlabeled:
                    continue
                else:
                    labels.append(-1.)
            else:
                labels.append(float(label))
            s0.append(word_tokenize(s0x))
            s1.append(word_tokenize(s1x))
    return (s0, s1, np.array(labels))
Project: Hotpot    Author: Liang-Qiu
def load_quora(dsfile):
    """ load a dataset in the quora csv format """
    s0 = []
    s1 = []
    labels = []
    with open(dsfile, encoding = 'utf8') as csvfile:
        f = csv.reader(csvfile)
        firstline = True
        for line in f:
            if firstline:
                firstline = False
                continue
            s0x = line[3]
            s1x = line[4]
            label = line[5]
            labels.append(float(label))
            s0.append(word_tokenize(s0x))
            s1.append(word_tokenize(s1x))
    return (s0, s1, np.array(labels))
Project: banking-class    Author: eli-goodfriend
def make_word_feature(df,embeddings):
    # use embeddings to vectorize merchant description
    # currently using averaging to combine words in merchant
    # there are other options: http://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence
    merchants = df.merchant.tolist()
    veclen = len(embeddings['food'])
    word_feature = np.zeros((len(merchants),veclen))
    for idx, merchant in enumerate(merchants):
        num_known = 0
        try:
            words = tokenize.word_tokenize(merchant)
            words = [word.lower() for word in words]
            for word in words:
                wordvec = embeddings[word]
                word_feature[idx,:] += wordvec
                num_known += 1
        except:
            pass
        word_feature[idx,:] = word_feature[idx,:] / float(max(num_known,1))

    return word_feature
Project: review-classification    Author: vishnupriyam
def predict(testSet,PP,PN,positive_probabilities,negative_probabilities,unseen_pos_prob,unseen_neg_prob):
    predicted_class = []
    for review in testSet:
        negative_probab = math.log10(PN)
        positive_probab = math.log10(PP)
        review_words = word_tokenize(review)
        for w in review_words:
            if w in negative_probabilities:
                negative_probab = negative_probab + math.log10(negative_probabilities[w])
            else:
                negative_probab = negative_probab + math.log10(unseen_neg_prob)
            if w in positive_probabilities:
                positive_probab = positive_probab + math.log10(positive_probabilities[w])
            else:
                positive_probab = positive_probab + math.log10(unseen_pos_prob)
        if(negative_probab > positive_probab):
            result = '-'
        else:
            result = '+'
        predicted_class.append(result)
    return predicted_class
Project: act-rte-inference    Author: DeNeutoy
def create_vocab(self,dataset_path, vocab_path ,max_vocab_size):

        print("generating vocab from dataset at {}".format(dataset_path))
        all_words = []
        for dataset in ["snli_1.0_train.jsonl","snli_1.0_dev.jsonl","snli_1.0_test.jsonl"]:
            for line in open(os.path.join(dataset_path, dataset),"r").readlines():
                data = json.loads(line)
                all_words += word_tokenize(data["sentence1"].lower())
                all_words += word_tokenize(data["sentence2"].lower())


        counter = Counter(all_words)
        count_pairs = sorted(counter.items(), key=lambda x : (-x[1], x[0]))

        words, _ = list(zip(*count_pairs))
        words = ["PAD"] + ["UNK"] + list(words)
        word_to_id = dict(zip(words[:max_vocab_size], range(max_vocab_size)))

        with open(vocab_path, "w") as file:
            for word, id in word_to_id.items():
                file.write("{}\t{}\n".format(word,id))

        print("vocab of size {} written to {}, with PAD token == 0, UNK token == 1".format(max_vocab_size,vocab_path))
Project: pyTextClassification    Author: tyiannak
def getFreqWords(directoryPath):    
    files = getListOfFilesInDir(directoryPath, "*")                # get list of files in directory
    allWords = []
    count = 0
    if MAX_FILES_PER_CLASS > 0 and MAX_FILES_PER_CLASS < len(files):
        files = random.sample(files, MAX_FILES_PER_CLASS)        
    for ifile, fi in enumerate(files):                                          # for each file in current class:
        with open(fi) as f:
            content = f.read() 
            words = word_tokenize(content.decode('utf-8'))
            words = [w.lower() for w in words if w.lower() not in stop]                    
            words = list(set(words))
            allWords += words                
            count += 1
    #print allWords
    C = Counter(allWords)
    C = sorted(C.items(), key=itemgetter(1),reverse=True)        
    for c in C:
        if c[1] > 0.05 * float(count):
            print c[0], c[1] / float(count)
Project: mxnet-vqa    Author: shiyangdaisy23
def prepro_question(imgs, params):

    # preprocess all the questions
    print 'example processed tokens:'
    for i,img in enumerate(imgs):
        s = img['question']
        if params['token_method'] == 'nltk':
            txt = word_tokenize(str(s).lower())
        else:
            txt = tokenize(s)
        img['processed_tokens'] = txt
        if i < 10: print txt
        if i % 1000 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done)   \r" %  (i, len(imgs), i*100.0/len(imgs)) )
            sys.stdout.flush()   
    return imgs
Project: tensorflow-neural-networks    Author: vipul-sharma20
def create_lexicon(pos, neg):
    lexicon = []
    for fi in [pos, neg]:
        with open (fi, 'r') as f:
            contents = f.readlines()
            for l in contents[:hm_lines]:
                all_words = word_tokenize(l)
                lexicon += list(all_words)

    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)

    """
    This is done in the tutorial.
    Seems like a brute force method of removing stopwords.
    TODO: Use NLTK stopwords to remove stop words ?
    """
    l2 = []
    for w in w_counts:
        if 1000 > w_counts[w] > 50:
            l2.append(w)

    return l2
Project: tensorflow-neural-networks    Author: vipul-sharma20
def sample_handling(sample, lexicon, classification):
    featureset = []

    with open(sample, 'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])

    return featureset
Project: VecShare    Author: JaredFern
def _avgrank_corp(inp_dir,hdv_vocab, num = 5000):
    cnt, vocab = Counter(), []
    # Counter for all words in the corpus
    for (root, dirs, files) in os.walk(inp_dir):
        files = [f for f in files if not f[0] == '.']
        for f in files:
            filepath = os.path.join(root,f)
            with codecs.open(filepath,'r', encoding="utf-8") as f:
                tok_txt = word_tokenize(f.read())
                for word in tok_txt: cnt[word] += 1
    for word in hdv_vocab:
        if word in cnt.keys(): del cnt[word]
    for word in cnt.most_common(num):
        try:    vocab.append(str(word[0]))
        except: continue
    return vocab
Project: VQA-tensorflow    Author: JamesChuanggg
def prepro_question(imgs, params):

    # preprocess all the questions
    print 'example processed tokens:'
    for i,img in enumerate(imgs):
        s = img['question']
        if params['token_method'] == 'nltk':
            txt = word_tokenize(str(s).lower())
        else:
            txt = tokenize(s)
        img['processed_tokens'] = txt
        if i < 10: print txt
        if i % 1000 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done)   \r" %  (i, len(imgs), i*100.0/len(imgs)) )
            sys.stdout.flush()   
    return imgs
Project: kaggle-youtube-8m    Author: liufuyang
def create_lexicon(pos, neg):
    lexicon = []
    for fi in [pos, neg]:
        with io.open(fi, 'r', encoding='utf-8') as f:
            contents = f.readlines()
            for l in contents[:hm_lines]:
                all_words = word_tokenize(l.lower())
                lexicon += list(all_words)
    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)

    l2 = []
    for w in w_counts:
        if 1000 > w_counts[w] > 50:
            l2.append(w)
    return l2
Project: kaggle-youtube-8m    Author: liufuyang
def sample_handling(sample, lexicon, classification):
    featureset = []

    with io.open(sample, 'r', encoding='utf-8') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))

            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
    return featureset
Project: TextClassification    Author: AlgorTroy
def custom_tokenizer(sentence, delimiters=['|', ','], remove_puncs=True, get_unique=False):
    # tokens = re.split('(\W)', sentence)
    for delimiter in delimiters:
        sentence = re.sub(re.escape(delimiter), " "+delimiter+" ", sentence)

    tokens = word_tokenize(sentence)

    # Remove duplicates
    if get_unique:
        tokens = list(set(tokens))

    if remove_puncs:
        tokens = [token for token in tokens if
                  not ((len(token.strip()) == 1) and bool(re.search("[^a-zA-Z0-9]", token)))]

    tokens = [token for token in tokens if (not bool(re.search("\s", token)) and token != '')]

    # Remove duplicates
    if get_unique:
        tokens = list(set(tokens))

    return tokens
Project: semeval2017-scienceie    Author: UKPLab
def offset_tokenize(text):
    tail = text
    accum = 0
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    info_tokens = []
    for tok in tokens:
        escaped_tok = re.escape(tok)
        m = re.search(escaped_tok, tail)
        start, end = m.span()
        # global offsets
        gs = accum + start
        ge = accum + end
        accum += end
        # keep searching in the rest
        tail = tail[end:]
        info_tokens.append((tok, (gs, ge)))
    return info_tokens
Project: Dual-Attention-Network    Author: changywtw
def prepro_question(imgs, params):

    # preprocess all the questions
    print 'example processed tokens:'
    for i,img in enumerate(imgs):
        s = img['question']
        if params['token_method'] == 'nltk':
            txt = word_tokenize(str(s).lower())
        else:
            txt = tokenize(s)
        img['processed_tokens'] = txt
        if i < 10: print txt
        if i % 100 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done)   \r" %  (i, len(imgs), i*100.0/len(imgs)) )
            sys.stdout.flush()   
    return imgs
Project: repeat-aft    Author: ripeta
def extract_chunks(sent, chunkGram = r"""Chunk: {<JJ|NN.*>*<NNP>+<JJ|NN.*|IN>*<NN.*>}"""):
    try:
        tagged = pos_tag(word_tokenize(sent))
        #Maybe actually better if possessives aren't included.
        #At least one Proper Noun (NNP) should be included in the noun chunk. Also a single NNP is
        #probably not enough information to identify a data source
        chunkParser = RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        chunks = []
        for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
            chunk = ""
            for leave in subtree.leaves():
                chunk += leave[0] + ' '
            chunks.append(chunk.strip())
        return chunked, chunks
    except Exception as e:
        print(str(e))
Project: chatbot    Author: minggli
def train_model(documents, labels, sample_size=.3, verbose=True):

    if verbose:
        print('starting to generate training data...', end='', flush=True)

    labeled_feature_set = list()
    for n, doc in enumerate(documents):
        feature = word_tokenize(' '.join(doc))
        label = labels[n]
        resampled = resample(feature, label, sample_size)
        labeled_feature_set += resampled

    if verbose:
        print('done', flush=True)
        print('training model...this may take a few minutes.',
              flush=True, end='')

    trained_model = NaiveBayesClassifier.train(iter(labeled_feature_set))

    if verbose:
        print('done', flush=True)
    return trained_model
Project: NLTK_SentimentAnalysis_TensorFlow    Author: rachit-mishra
def create_lexicon(fin):
    lexicon = []
    with open(fin, 'r', buffering=100000, encoding ='latin-1') as f:
        try:
            counter = 1
            content = ''
            for line in f:
                counter+=1
                if(counter/2500.0).is_integer():
                    tweet=line.split(':::')[1]

                    content+= ' '+tweet
                    words = word_tokenize(content)
                    words = [lemmatizer.lemmatize(i) for i in words]
                    lexicon = list(set(lexicon + words))
                    print(counter, len(lexicon))
        except Exception as e:
            print(str(e))
    with open('lexicon.pickle', 'wb') as f:
        pickle.dump(lexicon, f)
Project: NLTK_SentimentAnalysis_TensorFlow    Author: rachit-mishra
def convert_to_vec(fin, fout, lexicon_pickle):
    with open(lexicon_pickle, 'rb') as f:
        lexicon = pickle.load(f)
    outfile = open(fout, 'a')
    with open(fin, buffering= 20000, encoding = 'latin-1') as f:
        counter = 0
        for line in f:
            counter +=1
            label = line.split(':::')[0]
            tweet = line.split(':::')[1]
            current_words = word_tokenize(tweet.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]

            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] +=1
            features = list(features)
            outline = str(features)+'::'+str(label)+ '\n'
            outfile.write(outline)
        print(counter)
Project: NLTK_SentimentAnalysis_TensorFlow    Author: rachit-mishra
def sample_handling(sample, lexicon, classification):
    featureset = []  # [1 0] pos sentiment [0 1] negative sentiment
    with open(sample, 'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            #print(features)
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    # like the example discussed earlier
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
            #print(featureset)

    return featureset
Project: acl2017-interactive_summarizer    Author: UKPLab
def runprops_data(self, docs):
        new_docs = []
        for doc_name, doc in docs:
            print 'Processing:', doc_name
            doc_new = []
            doc = self.props_exception(doc_name, doc)

            for index, sent in enumerate(doc):
                doc_new.append(' '.join(word_tokenize(sent)))
                print index+1, doc_new[index]

            triples = []
            for i, sent in enumerate(doc_new):
                try:
                    tmp_triples = self.props_parser.extract_triples([sent])
                    triples.append(tmp_triples)
                except:
                    print('Error: failed for line %s' % (sent))
                    continue
            parse_sents = create_trees(triples, doc_new)
            sents = []
            new_docs.append((doc_name, parse_sents))
        return new_docs
Project: NLP_question_answering_system_project    Author: Roshrini
def wordMatch(question, line, storyPOS_dict):
    wordsInAQuestion = word_tokenize(question)
    rootsInAQuestion = set()
    for word in wordsInAQuestion:
        root = lancaster_stemmer.stem(word)
        rootsInAQuestion.add(root)

    if line in storyPOS_dict:
        verbmatch_score = 0
        rootmatch_score = 0
        scoreOfALine = {}
        for (word,tag) in storyPOS_dict[line]:
            if 'V' in tag:
                verb_root = lancaster_stemmer.stem(word)
                if verb_root in rootsInAQuestion:
                    verbmatch_score = verbmatch_score + 6
            else:
                word_root = lancaster_stemmer.stem(word)
                if word_root in rootsInAQuestion:
                    rootmatch_score = rootmatch_score + 3
        scoreOfALine[line] = rootmatch_score + verbmatch_score
        return rootmatch_score + verbmatch_score
Project: scientific-paper-summarisation    Author: EdCo95
def preprocess_sentence(sentence):
    """
    Preprocesses a sentence, turning it all to lowercase and tokenizing it into words.
    :param sentence: the sentence to pre-process.
    :return: the sentence, as a list of words, all in lowercase
    """
    sentence = sentence.lower()
    return word_tokenize(sentence)
Project: scientific-paper-summarisation    Author: EdCo95
def create_paper_dictionaries(filename="", readin=True, paper=None):
    """
    Creates the metadata data structures for a specific paper required to compute the extra features which are
    appended to the sentence vector.
    :param filename: the filename only, not the path, for the paper to create dictionaries for.
    :return: a tuple of the metadata data structures for the paper.
    """

    if readin and filename != "":
        # Read the paper in as a dictionary, keys are sections and values are the section text
        paper = read_in_paper(filename)

    # Extract paper keyphrases
    keyphrases = set(filter(None, " ".join(paper["KEYPHRASES"].lower().split("\n")).split(" ")))

    # Get the paper's vocab
    full_paper = " ".join([val for _, val in paper.iteritems()]).lower()
    paper_words = word_tokenize(full_paper)
    vocab = set(paper_words)

    # Create a bag of words for the paper
    paper_bag_of_words = defaultdict(int)
    for word in paper_words:
        paper_bag_of_words[word] += 1

    # Get the title words
    title_words = set([x.lower() for x in word_tokenize(paper["MAIN-TITLE"]) if x not in STOPWORDS])

    return keyphrases, vocab, paper_bag_of_words, title_words
Project: TAC-GAN    Author: dashayushman
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
Project: how_to_convert_text_to_images    Author: llSourcell
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
Project: chatterbot-weather    Author: gunthercox
def get_latitude(self, user_input):
        """
        Returns the latitude extracted from the input.
        """
        from nltk import tokenize

        for token in tokenize.word_tokenize(user_input):
            if 'latitude=' in token:
                return re.sub('latitude=', '', token)

        return ''
Project: chatterbot-weather    Author: gunthercox
def get_longitude(self, user_input):
        """
        Returns the longitude extracted from the input.
        """
        from nltk import tokenize

        for token in tokenize.word_tokenize(user_input):
            if 'longitude=' in token:
                return re.sub('longitude=', '', token)

        return ''
Project: Flavor-Network    Author: lingcheng99
def split_ingr(x):
    wnl=WordNetLemmatizer()
    cleanlist=[]
    lst = x.strip('[]').split(',')
    cleanlist=[' '.join(wnl.lemmatize(word.lower()) for word in word_tokenize(re.sub('[^a-zA-Z]',' ',item))) for item in lst]
    return cleanlist

#remove low-information words from ingredients, could use more
Project: deeppavlov    Author: deepmipt
def add_items(self, sentence_li):
        """Add new items to the tok2emb dictionary from a given text."""

        for sen in sentence_li:
            sent_toks = sent_tokenize(sen)
            word_toks = [word_tokenize(el) for el in sent_toks]
            tokens = [val for sublist in word_toks for val in sublist]
            tokens = [el for el in tokens if el != '']
            for tok in tokens:
                if self.tok2emb.get(tok) is None:
                    self.tok2emb[tok] = self.fasttext_model[tok]
Project: Django-Basic-Sentiment    Author: enriksabalvaro
def sentiment(request):


    open_file = open("wordfeature5k.pickle","rb")
    word_features = pickle.load(open_file)
    open_file.close()


    def find_features(document):
        words = word_tokenize(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)
        return features

    open_file = open("naivebayesclassifier.pickle","rb")
    classifier = pickle.load(open_file)
    open_file.close()

    sentence = request.POST['sentence']

    result = classifier.classify(find_features(sentence))

    if result == "positive":
        return render(request, "home/index.html",{"sentence":sentence, "positive":"positive"})
    elif result == "negative":
        return render(request, "home/index.html",{"sentence":sentence, "negative":"negative"})
Project: MatchZoo    Author: faneshion
def word_seg_en(docs):
        docs = [word_tokenize(sent) for sent in tqdm(docs)]
        # show the progress of word segmentation with tqdm
        '''docs_seg = []
        print('docs size', len(docs))
        for i in tqdm(range(len(docs))):
            docs_seg.append(word_tokenize(docs[i]))'''
        return docs
Project: SentEval    Author: facebookresearch
def get_word_dict(self, sentences, tokenize=True):
        # create vocab of words
        word_dict = {}
        if tokenize:
            from nltk.tokenize import word_tokenize
        sentences = [s.split() if not tokenize else word_tokenize(s)
                     for s in sentences]
        for sent in sentences:
            for word in sent:
                if word not in word_dict:
                    word_dict[word] = ''
        word_dict['<s>'] = ''
        word_dict['</s>'] = ''
        return word_dict
Project: SentEval    Author: facebookresearch
def visualize(self, sent, tokenize=True):
        if tokenize:
            from nltk.tokenize import word_tokenize

        sent = sent.split() if not tokenize else word_tokenize(sent)
        sent = [['<s>'] + [word for word in sent if word in self.word_vec] +
                ['</s>']]

        if ' '.join(sent[0]) == '<s> </s>':
            import warnings
            warnings.warn('No words in "{0}" have glove vectors. \
                Replacing by "<s> </s>"..'.format(sent))
        batch = Variable(self.get_batch(sent), volatile=True)

        if self.use_cuda:
            batch = batch.cuda()
        output = self.enc_lstm(batch)[0]
        output, idxs = torch.max(output, 0)
        # output, idxs = output.squeeze(), idxs.squeeze()
        idxs = idxs.data.cpu().numpy()
        argmaxs = [np.sum((idxs == k)) for k in range(len(sent[0]))]

        # visualize model
        import matplotlib.pyplot as plt
        x = range(len(sent[0]))
        y = [100.0*n/np.sum(argmaxs) for n in argmaxs]

        plt.xticks(x, sent[0], rotation=45)
        plt.bar(x, y)
        plt.ylabel('%')
        plt.title('Visualisation of words importance')
        plt.show()

        return output, idxs
Project: Python-Scripts-Repo-on-Data-Science    Author: qalhata
def extractRawFrequencies(self, article):
        # this method is similar to the one above but returns
        # the raw frequencies (all word counts)
        text = article[0]
        title = article[1]
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        return freq
Project: allennlp    Author: allenai
def split_words(self, sentence: str) -> List[Token]:
        # Import is here because it's slow, and by default unnecessary.
        from nltk.tokenize import word_tokenize
        return [Token(t) for t in word_tokenize(sentence.lower())]