Python nltk module: FreqDist() code examples

We extracted the following 49 code examples from open source Python projects to illustrate how to use nltk.FreqDist().
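
Before the project examples, here is a minimal, self-contained sketch of the core FreqDist API that these snippets rely on (the token list is made up purely for illustration):

import nltk

# a tiny made-up token list, just for illustration
tokens = ["the", "cat", "sat", "on", "the", "mat", "the", "cat"]

fd = nltk.FreqDist(tokens)   # count how often each token occurs
print(fd["the"])             # absolute count of one token -> 3
print(fd.N())                # total number of tokens counted -> 8
print(fd.freq("cat"))        # relative frequency -> 0.25
print(fd.most_common(2))     # most frequent tokens -> [('the', 3), ('cat', 2)]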

Project: facebook-message-analysis    Author: szheng17
def get_user_to_word_proportion(user_to_text, word):
    """
    Maps each user to the proportion of his words that consist of a specified
    word.
    """
    user_to_word_proportion = {}
    for user in user_to_text:
        lm = LanuageModel(user_to_text[user])
        n_tokens = len(lm.lowercase_tokens)
        if n_tokens > 0:
            fd = nltk.FreqDist(lm.lowercase_tokens)
            user_to_word_proportion[user] = fd[word] / float(n_tokens)
        else:
            user_to_word_proportion[user] = 0.0
        print 'Finished user {}'.format(user.encode('utf-8'))
    return user_to_word_proportion
Project: deep_throat    Author: wdbm
def most_frequent_Brown_Corpus_words():
    import nltk
    import nltk.corpus
    words = []
    for word in nltk.corpus.brown.words():
        if word not in [
            ",",
            ".",
            "``",
            "''",
            ";",
            "?",
            "--",
            ")",
            "(",
            ":",
            "!"
        ]:
            words.append(word.lower())
    frequencies_words = nltk.FreqDist(words).most_common()
    words_most_frequent = [word[0] for word in frequencies_words]
    return words_most_frequent
Project: resume-optimizer    Author: mhbuehler
def _calculate_word_scores(self, phrase_list):
        """Scores words according to frequency and tendency to appear in multi-word key phrases"""
        word_freq = nltk.FreqDist()
        word_multiplier = nltk.FreqDist()
        for phrase in phrase_list:
            # Give a higher score if word appears in multi-word candidates
            multi_word = min(2, len(filter(lambda x: not is_numeric(x), phrase)))
            for word in phrase:
                # Normalize by taking the stem
                word_freq[stem(word)] += 1
                word_multiplier[stem(word)] += multi_word
        for word in word_freq.keys():
            word_multiplier[word] = word_multiplier[word] / float(word_freq[word])  # Take average
        word_scores = {}
        for word in word_freq.keys():
            word_scores[word] = word_freq[word] * word_multiplier[word]

        return word_scores
Project: Deep-Learning-with-Theano    Author: PacktPublishing
def build_dictionary(words, max_df=5):

    word_freq = [[unkown_token, -1], [pad_token, 0]]
    word_freq.extend(nltk.FreqDist(itertools.chain(words)).most_common())
    word_freq = OrderedDict(word_freq)
    word2idx = {unkown_token: 0, pad_token: 1}
    idx2word = {0: unkown_token, 1: pad_token}
    idx = 2
    for w in word_freq:
      f = word_freq[w]
      if f >= max_df:
        word2idx[w] = idx
        idx2word[idx] = w
        idx += 1
      else:
        word2idx[w] = 0 # map the rare word to the unknown token
        word_freq[unkown_token] += 1 # increment the number of unknown tokens

    return word2idx, idx2word, word_freq
Project: PolBotCheck    Author: codeforfrankfurt
def get_corpus_of_most_active_users(n_users=5):
    tweets = []
    texts = []
    with open(DATASET_PATH) as f:
        for line in f:
            tweets.append(json.loads(line)['user']['screen_name'])
            texts.append((json.loads(line)['user']['screen_name'], json.loads(line)['text']))

    users = nltk.FreqDist(tweets).most_common(n_users)

    dict = {}
    for user, tweet in texts:
        if user in dict:
            dict[user] = " ".join([dict[user],tweet])
        else:
            dict[user] = tweet

    corpus = [dict[name] for name, _ in users]
    user_names = [name for name, _ in users]
    return  corpus, user_names
Project: Price-Comparator    Author: Thejas-1
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint
Project: PyTrafficCar    Author: liyuming1978
def load_data():
    global N, words, labels

    posts = corpus.xml_posts()[:10000]
    freqs = [ FreqDist(post.text) for post in posts ] 
    words = list(set(word 
                    for dist in freqs 
                    for word in dist.keys()
                    if word not in ENGLISH_STOP_WORDS and
                    word not in punctuation))

    labels = list(set([ post.get('class') for post in posts ]))

    data = []
    N = len(words)
    for post, dist in zip(posts, freqs):
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, labels.index(post.get('class'))))

    return data
Project: PyTrafficCar    Author: liyuming1978
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word 
                    for dist in freqs 
                    for word in dist.keys()
                    if word not in ENGLISH_STOP_WORDS and
                    word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        x = volumize(dist)
        data.append((x, x.w))

    return data
Project: PyTrafficCar    Author: liyuming1978
def test(): 
    gt = GetTweets()
    documents = gt.get_hashtag('ferguson', count=20)
    documents += gt.get_hashtag('police', count=21)
    print 'Query:', documents[-1]

    tokenizer = RegexpTokenizer('\w+')
    vols = []
    for doc in documents:
        samples = []
        for token in tokenizer.tokenize(doc):
            word = token.lower()
            if word not in ENGLISH_STOP_WORDS and word not in punctuation:
                samples.append(word)
        vols.append(volumize(FreqDist(samples)))

    vectors = [ doc_code(v) for v in vols[:-1] ]
    query_vec = doc_code(vols[-1])

    sims = [ cos(v, query_vec) for v in vectors ]
    m = max(sims)
    print m, documents[sims.index(m)]
Project: PyTrafficCar    Author: liyuming1978
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word 
                    for dist in freqs 
                    for word in dist.keys()
                    if word not in ENGLISH_STOP_WORDS and
                    word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, V.w))

    return data
Project: poetic-inner-join    Author: emdaniels
def tokenize_sentences(self):
        # tokenize the sentences into words and count the word frequencies
        # get most common words, build index_to_word and word_to_index vectors
        self.tokenized_sentences = [nltk.word_tokenize(sent) for sent in
                                    self.sentences]
        word_freq = nltk.FreqDist(itertools.chain(*self.tokenized_sentences))
        print("Found %d unique word tokens." % len(word_freq.items()))

        vocab = word_freq.most_common(self.vocabulary_size - 1)
        self.index_to_word = [x[0] for x in vocab]
        self.index_to_word.append(self.unknown_token)
        self.word_to_index = dict(
            [(w, i) for i, w in enumerate(self.index_to_word)])

        print("Using vocabulary size %d." % self.vocabulary_size)
        print(
            "The least frequent word is '%s' appearing %d times." % (
            vocab[-1][0], vocab[-1][1]))

        # replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(self.tokenized_sentences):
            self.tokenized_sentences[i] = [
                w if w in self.word_to_index else self.unknown_token for w in
                sent]
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint
Project: neighborhood_mood_aws    Author: jarrellmark
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint
Project: hate-to-hugs    Author: sdoran35
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint
Project: ip6words    Author: lstn
def load_words(num_words):

    words = get_words_from_nltk()
    fdist = nltk.FreqDist(words)

    fdistmc = fdist.most_common()

    nd = OrderedDict()
    nda = []

    occurences = set([wt[1] for wt in fdistmc])
    occurences = sorted(occurences, key=int, reverse=True)

    for idx in occurences:
        nd[idx] = sorted([wt[0] for wt in fdistmc if wt[1] == idx])

    for key, val in nd.items():
        nda += val

    words = nda[:num_words]
    return words
Project: OpinionMining728    Author: stasi009
def statistics_by_aspect():
    filename = "aspects_train.csv"
    words_dist = nltk.ConditionalFreqDist()
    sample_sizes = nltk.FreqDist()

    samples_stream = get_samples_stream(filename)
    for aspect,words in samples_stream:
        sample_sizes[aspect] += 1
        for word in words:
            words_dist[aspect][word] += 1

    for category,dist in words_dist.iteritems():
        print "\n------- Category: {}".format(category)
        print dist.most_common(20)

    total_samples = sample_sizes.N()
    print "\ntotally {} samples".format(total_samples)
    for aspect, count in sample_sizes.iteritems():
        print "aspect[{}] has {} samples, {:.2f}%".format(aspect,count, count*100.0/total_samples)
Project: OpinionMining728    Author: stasi009
def save_topics(model,filename):
    with open(filename,"wt") as outf:
        # ---------- write each topic and words' contribution
        topics = model.show_topics(num_topics=-1, log=False, formatted=True)
        for topic in topics:
            # topic[0]: topic number
            # topic[1]: topic description
            outf.write("\n############# TOPIC {} #############\n".format(topic[0]))
            outf.write(topic[1]+"\n")

        # ---------- words statistics in all topics
        outf.write("\n\n\n****************** KEY WORDS ******************\n")
        topics = model.show_topics(num_topics=-1, log=False, formatted=False)
        keywords = (word for (_,words) in topics for (word,score) in words)

        fdist = nltk.FreqDist(keywords)
        for index,(w,c) in enumerate( fdist.most_common(100) ):
            outf.write("{}-th keyword: <{},{}>\n".format(index+1,w,c))
Project: FancyWord    Author: EastonLee
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint
Project: nlpSentiment    Author: ClimbsRocks
def createPopularWords(combined, lowerBound, upperBound):
    allWords = []
    for message in combined:
        for word in message[0]:
            allWords.append(word)

    allWords = nltk.FreqDist(allWords)


    # grab the top several thousand words, ignoring the lowerBound most popular
    # grabbing more words leads to more accurate predictions, at the cost of both memory and compute time
    # ignoring the x most popular words is an easy method for handling stop words that are specific to this dataset, rather than just the English language overall
    popularWords = []
    wordsToUse = allWords.most_common(upperBound)[lowerBound:upperBound]
    for pair in wordsToUse:
        popularWords.append(pair[0])

    return popularWords


# extract features from a single document in a consistent manner for all documents in a corpus
# simply returns whether a given word in popularWords is included in the document
Project: beepboop    Author: nicolehe
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint
Project: kind2anki    Author: prz3m
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint
Project: but_sentiment    Author: MixedEmotions
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint
Project: GitHub-Recommender    Author: himangshunits
def get_word_counts(input_str, limit = 100):
        input_str = PreprocessManager.remove_non_ascii(input_str)
        wordnet_lemmatizer = WordNetLemmatizer()
        snowball_stemmer = EnglishStemmer()
        tokenized_text = CountVectorizer().build_tokenizer()(input_str.lower())
        tokenized_text = [word for word in tokenized_text if len(word) > 1]  # Filter some small words
        #tokenized_text = [word for word in tokenized_text if not word.isnumeric()]
        filtered_words = [word for word in tokenized_text if word not in stopwords.words('english')]
        stemmed_list = [wordnet_lemmatizer.lemmatize(w) for w in filtered_words]
        # Calculate frequency distribution
        frequency_dist = nltk.FreqDist(stemmed_list)

        # Output top 50 words
        result = dict()
        for word, frequency in frequency_dist.most_common(limit):
            # print(u'{};{}'.format(word, frequency))
            result[word] = frequency
        return result



    # This function just splits the words and gives the words that's all!
Project: AirbnbReviewAnalyzer    Author: mrsata
def analysis(reviews_collection_text):
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        raw_data = f.read()
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        comments = f.readlines()
    data = raw_data.replace('\n', ' ')
    data_lower = data.lower()
    tokens_with_punc = word_tokenize(data_lower)
    tokens = RegexpTokenizer(r'\w+').tokenize(data_lower)
    print("--- Most frequent tokens ---\n",
        FreqDist(tokens_with_punc).most_common(15))
    print("--- Tokens without punctuation ---\n",
        FreqDist(tokens).most_common(15))
    stop = set(stopwords.words('english'))
    words = [word for word in tokens if word not in stop]
    print("--- Most frequent words ---\n", FreqDist(words).most_common(15))
    tagged = pos_tag(words)
    nouns = [word for word, pos in tagged if (pos == 'NN')]
    print("--- Most frequent nouns ---\n", FreqDist(nouns).most_common(15))
    adjts = [word for word, pos in tagged if (pos == 'JJ')]
    print("--- Most frequent adjective ---\n", FreqDist(adjts).most_common(15))
    tokns = [RegexpTokenizer(r'\w+').tokenize(comment) for comment in comments]
    lxdst = [lexical_density(token) for token in tokns if len(token) > 0]
    avgld = sum(lxdst) / len(comments)
    print("--- Average lexical density ---\n", avgld)
Project: facebook-message-analysis    Author: szheng17
def plot_common_tokens(self, n_tokens):
        # Remove common stopwords
        fd = nltk.FreqDist(w for w in self.alpha_tokens if w not in s)
        fd.plot(n_tokens)
Project: facebook-message-analysis    Author: szheng17
def get_user_to_word_count(user_to_text, word):
    user_to_word_count = {}
    for user in user_to_text:
        lm = LanuageModel(user_to_text[user])
        fd = nltk.FreqDist(lm.lowercase_tokens)
        user_to_word_count[user] = fd[word]
    return user_to_word_count
Project: NLP-JD    Author: ZexinYan
def sentence2vec(self, sentence):
        if len(self.features) == 0:
            self.load_feature_model()
        seg_list = jieba.cut(sentence, False)
        freq_dist = nltk.FreqDist(seg_list)
        local_list = []
        for each in self.features:
            local_list.append(freq_dist[each])
        return local_list
Project: NLP-JD    Author: ZexinYan
def get_freq_dist(self, seg_list):
        freq_dist = []
        for each in seg_list:
            freq_dist.append(nltk.FreqDist(each))
        return freq_dist
Project: vanilla-neural-nets    Author: cavaunpeu
def _remove_uncommon_words(cls, tokenized_corpus, vocabulary_size):
        word_count = nltk.FreqDist( itertools.chain(*tokenized_corpus) )
        word_count = [cls.WORD_COUNT_ITEM(word=word, count=count) for word, count in word_count.items()]
        word_count = sorted(word_count, key=lambda item: (item.count, item.word), reverse=True)
        most_common_words = [word_count_item.word for word_count_item in word_count[:vocabulary_size - \
            cls.NUMBER_OF_WORDS_TO_ADD_IN_MANUALLY + 1]]

        tokenized_corpus = [
            [word if word in most_common_words else cls.UNKNOWN_TOKEN for word in sentence]\
            for sentence in tokenized_corpus
        ]
        return tokenized_corpus
Project: Deep-Learning-with-Theano    Author: PacktPublishing
def parse_text(filename, vocabulary_size=9000, type="word"):
    with open(filename, 'rb') as f:
        txt = f.read()
        if type == "word":
            sentences = nltk.sent_tokenize(txt.decode('utf-8').lower().replace('\n', ' '))
            # sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
            tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
            word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
            print("Found %d unique words tokens." % len(word_freq.items()))
            vocab = word_freq.most_common(vocabulary_size-1)
            index = [sentence_start_token, sentence_end_token, unknown_token] + [x[0] for x in vocab]
            word_to_index = dict([(w,i) for i,w in enumerate(index)])
            print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))
            for i, sent in enumerate(tokenized_sentences):
                tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
            X_train = np.asarray([ [0]+[word_to_index[w] for w in sent] for sent in tokenized_sentences])
            y_train = np.asarray([ [word_to_index[w] for w in sent]+[1] for sent in tokenized_sentences])
            # X_train, y_train = [], []
            # for sent in tokenized_sentences:
            #     l = len(sent) - 1
            #     X_train.append(coo_matrix((np.ones( (l) ), ( range(l), [word_to_index[w] for w in sent[:-1]] )), shape=(l, vocabulary_size )).toarray())
            #     y_train.append( [word_to_index[w] for w in sent[1:] ] )
        else:
            sentences = nltk.sent_tokenize(txt.decode('utf-8').lower().replace('\n', ' '))
            index = ['^','$'] + list(set(txt))
            char_to_index = dict([(w,i) for i,w in enumerate(index)])
            X_train = np.asarray([ [0]+[ char_to_index[w] for w in sent]  for sent in sentences])
            y_train = np.asarray([ [ char_to_index[w] for w in sent]+[1] for sent in sentences])

    return X_train, y_train, index
Project: minke    Author: DistrictDataLabs
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and returns a dictionary with a
        variety of metrics concerning the state of the corpus.
        """
        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()
        started = time.time()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in self._sent_tokenizer.tokenize(para):
                counts['sents'] += 1

                for word in self._word_tokenizer.tokenize(sent):
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        n_fileids = len(self._resolve(fileids, categories) or self.fileids())
        n_topics  = len(self.categories(self._resolve(fileids, categories)))

        # Return data structure with information
        return {
            'files':  n_fileids,
            'topics': n_topics,
            'paras':  counts['paras'],
            'sents':  counts['sents'],
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
            'ppdoc':  float(counts['paras']) / float(n_fileids),
            'sppar':  float(counts['sents']) / float(counts['paras']),
            'secs':   time.time() - started,
        }
Project: minke    Author: DistrictDataLabs
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and returns a dictionary with a
        variety of metrics concerning the state of the corpus.
        """
        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()
        started = time.time()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in para:
                counts['sents'] += 1

                for word, tag in sent:
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        n_fileids = len(self._resolve(fileids, categories) or self.fileids())
        n_topics  = len(self.categories(self._resolve(fileids, categories)))

        # Return data structure with information
        return {
            'files':  n_fileids,
            'topics': n_topics,
            'paras':  counts['paras'],
            'sents':  counts['sents'],
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
            'ppdoc':  float(counts['paras']) / float(n_fileids),
            'sppar':  float(counts['sents']) / float(counts['paras']),
            'secs':   time.time() - started,
        }
Project: QProb    Author: quant-trade
def scoreFunction(wholetext):
    """Get text, find most common words and compare with known
    stopwords. Return dictionary of values"""

    dictiolist = {}
    scorelist = {}
    # These are the available languages with stopwords from NLTK
    NLTKlanguages=["dutch","finnish","german","italian", "portuguese",
        "spanish","turkish","danish","english", "french","hungarian",
        "norwegian","russian","swedish"]

    FREElanguages = [""]
    languages=NLTKlanguages + FREElanguages

    # Fill the dictionary of languages, to avoid  unnecessary function calls
    for lang in NLTKlanguages:
        dictiolist[lang] = stopwords.words(lang)

    # Split all the text in tokens and convert to lowercase. In a
    # decent version of this, I'd also clean the unicode
    tokens = word_tokenize(wholetext)
    tokens = [t.lower() for t in tokens]

    # Determine the frequency distribution of words, looking for the
    # most common words
    freq_dist = FreqDist(tokens)

    # This is the only interesting piece, and not by much. Pick a
    # language, and check if each of the 20 most common words is in
    # the language stopwords. If it's there, add 1 to this language
    # for each word matched. So the maximal score is 20. Why 20? No
    # specific reason, looks like a good number of words.
    for lang in languages:
        scorelist[lang]=0
        for word in freq_dist.keys()[0:20]:
            if word in dictiolist[lang]:
                scorelist[lang]+=1
    return scorelist
Project: PolBotCheck    Author: codeforfrankfurt
def calc_frequencies(words, words_n=50, lang='german'):
    words = [word for word in words if len(word) > 1]
    words = [word for word in words if not word.isnumeric()]
    words = [word.lower() for word in words]
    # words = [word for word in words if word not in all_stopwords]
    # Stemming words seems to make matters worse, disabled
    # stemmer = nltk.stem.snowball.SnowballStemmer(lang)
    # words = [stemmer.stem(word) for word in words]

    fdist = nltk.FreqDist(words)
    return fdist.most_common(words_n)
Project: religion_project    Author: 000384832
def occurencecount():

    # Ask the user to input a word
    word = raw_input("Enter a word : ")

    # Create a list of file which we will be looking into for matches
    fileList = ['Text1.txt', 'Text2.txt', 'Text3.txt', 'Text4.txt']

    # Open the files one by one, read them and find the occurance count inside each file
    for filename in fileList:

        # Openthe file
        fp_text = codecs.open(filename, 'r', 'utf-8')

        # Read all the words inside the file
        words_text = word_tokenize(fp_text.read())

        # Find the number of occurances of each word using built in method from NLTK
        fd_text = FreqDist(words_text)

        # Print out the number of occurances for that specific word
        print("Number of occurences in " + filename + " :  " + str(fd_text[word]))
Project: may142016    Author: ftrain
def get_words(tweets):
    """Given a set of tweets, return the most frequently-used words."""
    tweets = filter(lambda x: not(x.is_rt), tweets)
    tokenized = [nltk.word_tokenize(handle_strip(t.tweet_text))
                 for t in tweets]
    words = [item for sublist in tokenized for item in sublist]
    longwords = filter(lambda x: len(x) > 6, words)
    lcwords = map(lambda x: x.lower(), longwords)
    fdist = nltk.FreqDist(lcwords)
    common = fdist.most_common(100)
    common = filter(lambda x: x[1] > 4, common)
    common = map(lambda x: [x[0], 6 + int(x[1]/3)], common)
    return common
Project: tweet_analyzer    Author: atandy
def make_data(file_name):
    '''Returns Tuple of dataframes used in analysis:
    core_tweet_df, tweets_list, pos_df, adj_df, word_frequency_df, hash_df'''
    #realDonaldTrump_master_tweet_list.json

    #TODO: fix so strings aren't written to file and we can just load it as json.
    with open(file_name) as tfile:
        lines = tfile.readlines()
    raw_tweets_data =  [eval(t) for t in lines]

    analyzer = TextAnalyzer(raw_tweets_data)
    english_stopwords = stopwords.words("english")

    core_tweet_df = analyzer.make_tweet_df(
        with_pos_tags=False,
        columns_to_filter=['id', 'created_at', 'text', 'retweet_count', 'favorite_count'])

    # get list of tweets as text
    tweets_list = core_tweet_df.text.tolist()
    pos_df = analyzer.make_pos_df(tweets_list, make_csv=False)
    adj_df = pos_df[pos_df.pos_tag=='JJ']
    adj_df = analyzer.make_word_frequency_df(adj_df, 'word', make_csv=False)

    # calculate word frequencies among other words in data set. can't merge with pos
    # because certain words have many parts of speech. 
    word_frequency_df = analyzer.make_word_frequency_df(pos_df, 'word', make_csv=False)


    #Most common hashtags and total unique hashtags.
    all_hashtags = []
    for i in raw_tweets_data:
        all_hashtags.extend([d['text'] for d in i['entities']['hashtags']])
    fd = FreqDist(all_hashtags)

    hash_df = pd.DataFrame([{'hashtag':x,'abs_frequency': y, 'rel_frequency_pct': float(y)/len(all_hashtags)*100} for x,y in fd.most_common()])

    return core_tweet_df, tweets_list, pos_df, adj_df, word_frequency_df, hash_df
Project: GLaDOS2    Author: TheComet
def zipf(self, message, users):
        source_user = message.author.name
        source_user = source_user.strip('@').split('#')[0]

        target_users = [user.strip('@').split('#')[0] for user in users.split()]
        if len(users) == 0:
            target_users = [source_user]

        if users == '*':
            if message.server is not None:
                target_users = [member.name for member in message.server.members]
        target_users = [user for user in target_users if self.check_nickname_valid(user.lower()) is None]

        image_file_name = self.quotes_file_name(source_user.lower())[:-4] + '.png'
        pylab.title('Word frequencies')
        for user in target_users:
            quotes_file = codecs.open(self.quotes_file_name(user.lower()), 'r', encoding='utf-8')
            lines = quotes_file.readlines()
            quotes_file.close()

            if len(lines) < 20:
                continue

            tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
            tokens = self.filter_to_english_words(tokenizer.tokenize(str(lines)))
            if len(tokens) < 200:
                continue
            freq = nltk.FreqDist(tokens)
            self.plot_word_frequencies(freq, user)

        pylab.legend()
        pylab.savefig(image_file_name)
        pylab.gcf().clear()

        await self.client.send_file(message.channel, image_file_name)
Project: augmented_seq2seq    Author: suriyadeepan
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # index2word
    index2word = ['_'] + [UNK] + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([(w,i) for i,w in enumerate(index2word)] )
    return index2word, word2index, freq_dist
Project: augmented_seq2seq    Author: suriyadeepan
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    vocab = [ item for item in vocab if item[1] > 1 ]
    # index2word
    index2word = ['_'] + ['UNK'] + list(POS_TAGS.keys()) + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([(w,i) for i,w in enumerate(index2word)] )
    return index2word, word2index, freq_dist
Project: PyTrafficCar    Author: liyuming1978
def test():
    global N, words, network

    print 'In testing.'

    gettysburg = """Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth."""
    tokenizer = RegexpTokenizer('\w+')
    gettysburg_tokens = tokenizer.tokenize(gettysburg) 

    samples = []
    for token in gettysburg_tokens:
        word = token.lower()
        if word not in ENGLISH_STOP_WORDS and word not in punctuation:
            samples.append(word)

    dist = FreqDist(samples)
    V = Vol(1, 1, N, 0.0)
    for i, word in enumerate(words):
        V.w[i] = dist.freq(word)

    pred = network.forward(V).w
    topics = []
    while len(topics) != 5:
        max_act = max(pred)
        topic_idx = pred.index(max_act)
        topic = words[topic_idx]

        if topic in gettysburg_tokens:
            topics.append(topic)

        del pred[topic_idx]

    print 'Topics of the Gettysburg Address:'
    print topics
Project: codenn    Author: sriniiyer
def getFeat(self, line):
        listItem = [0]*self.noFeat
        fileFreqDist = nltk.FreqDist(SVM.tokenize(line))

        i = 0
        for key in self.trainKeys:
            if fileFreqDist.has_key(key):
                listItem[i] = fileFreqDist.get(key)
            i = i + 1
        return listItem
Project: adversarial-squad    Author: robinjia
def main():
  freq_dist = FreqDist(w.lower() for w in brown.words() if w not in PUNCTUATION)
  vocab = [x[0] for x in freq_dist.most_common()[:OPTS.size]]
  for w in vocab:
    print w
Project: DeepBot    Author: IgorWang
def take_some_analysis(file_dir):
    context_length = []
    utterance_length = []

    dist = nltk.FreqDist()

    for c, u in utterance_generator(file_dir):
        c_tokens = nltk.word_tokenize(c)
        u_tokens = nltk.word_tokenize(u)
        # record the token lengths of the context and the utterance
        context_length.append(len(c_tokens))
        utterance_length.append(len(u_tokens))

        dist.update(c_tokens + u_tokens)

    cl_array = np.array(context_length)
    ul_array = np.array(utterance_length)

    print("most length of context is %d" % cl_array.max())
    print("most length of utterance is %d" % ul_array.max())
    print("mean length of context is %f" % cl_array.mean())
    print("mean length of utterance is %f" % ul_array.mean())

    sub_abs = np.abs(cl_array - ul_array)
    print("max,min,mean of abs(context_length -utterance_length) is %f,%f,%f" % (
        np.max(sub_abs), np.min(sub_abs), np.mean(sub_abs)))

    print("most common words :")
    print(dist.most_common(10))
Project: RNNPythonTutorial    Author: eublefar
def preprocess_data(self):
        # Read the data and append SENTENCE_START and SENTENCE_END tokens
        print "Reading CSV file..."
        with open('data/reddit-comments-2015-08.csv', 'rb') as f:
            reader = csv.reader(f, skipinitialspace=True)
            reader.next()
            # Split full comments into sentences
            sentences = itertools.chain(*[nltk.sent_tokenize(x[0].decode('utf-8').lower()) for x in reader])
            # Append SENTENCE_START and SENTENCE_END
            sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
        print "Parsed %d sentences." % (len(sentences))

        # Tokenize the sentences into words
        tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

        # Count the word frequencies
        word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
        print "Found %d unique words tokens." % len(word_freq.items())

        # Get the most common words and build index_to_word and word_to_index vectors
        vocab = word_freq.most_common(self.vocabulary_size-1)
        self.index_to_word = [x[0] for x in vocab]
        self.index_to_word.append(unknown_token)
        self.word_to_index = dict([(w,i) for i,w in enumerate(self.index_to_word)])

        print "Using vocabulary size %d." % self.vocabulary_size
        print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1])

        # Replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(tokenized_sentences):
            tokenized_sentences[i] = [w if w in self.word_to_index else unknown_token for w in sent]

        print "\nExample sentence: '%s'" % sentences[0]
        print "\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0]

        # Create the training data
        #tokenized_words = [item for sublist in tokenized_sentences for item in sublist]
        #self.X_train = np.asarray([self.word_to_index[w] for w in tokenized_words[:-1]])
        #self.Y_train = np.asarray([self.word_to_index[w] for w in tokenized_words[1:]])
        self.X_train = np.asarray([[self.word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
        self.Y_train = np.asarray([[self.word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
Project: truecaser    Author: nreimers
def checkSentenceSanity(sentence):
    """ Checks the sanity of the sentence. If the sentence is for example all uppercase, it is recjected"""
    caseDist = nltk.FreqDist()

    for token in sentence:
        caseDist[getCasing(token)] += 1

    if caseDist.most_common(1)[0][0] != 'allLower':        
        return False

    return True
Project: RottenCrawler    Author: kevin940726
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features
Project: text-analytics-with-python    Author: dipanjanS
def get_top_ngrams(corpus, ngram_val=1, limit=5):

    corpus = flatten_corpus(corpus)
    tokens = nltk.word_tokenize(corpus)

    ngrams = compute_ngrams(tokens, ngram_val)
    ngrams_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngrams_freq_dist.items(), 
                              key=itemgetter(1), reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq) 
                     for text, freq in sorted_ngrams]

    return sorted_ngrams
Project: OpinionMining728    Author: stasi009
def sample_split(dbname,num_train,num_test):
    client = MongoClient()
    db = client[dbname]
    sentisent_collection = db.sentiment_sentences

    ################## load and count
    aspect_dist = nltk.FreqDist()
    sentiment_dist = nltk.FreqDist()

    all_samples = []
    cursor = sentisent_collection.aggregate([ { '$sample': { 'size': num_train  + num_test } } ])
    for index,d in enumerate(cursor):
        sent = Sentence.from_dict(d)
        all_samples.append( (sent.words,sent.sentiment) )

        aspect_dist[sent.aspect] +=1
        sentiment_dist[int(sent.sentiment)] +=1
    client.close()

    ################## show statistics
    for k in aspect_dist:
        print '[{}]: {}'.format(k,aspect_dist.freq(k))

    for k in sentiment_dist:
        print '[{}]: {}'.format(k,sentiment_dist.freq(k))

    ################## shuffle
    random.shuffle(all_samples)

    ################## split
    def __dump(filename,data):
        with open(filename,"wb") as outf:
            cPickle.dump(data,outf)

    __dump("sentidata_train_raw.pkl",all_samples[:num_train])
    __dump("sentidata_test_raw.pkl",all_samples[num_train:])