Python nltk.corpus.stopwords module: words() example source code

We have extracted the following 50 code examples from open-source Python projects to illustrate how to use nltk.corpus.stopwords.words().
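Before the project excerpts, here is a minimal, self-contained sketch of the basic call itself (the nltk.download('stopwords') step is only needed the first time the corpus is used; the exact size of the list depends on your NLTK version):

import nltk
from nltk.corpus import stopwords

# Fetch the stopword corpus once (a no-op if it is already installed).
nltk.download('stopwords')

# words('english') returns the English stopword list.
english_stops = set(stopwords.words('english'))
print(len(english_stops))        # on the order of 150-200 entries, version dependent
print('the' in english_stops)    # True

# Called with no arguments, words() concatenates the lists of every bundled language.
all_stops = stopwords.words()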

Project: customer-service-chatbot    Author: xploiter-projects
def SpeechToText():
        r = sr.Recognizer()   # speech recognizer
        with sr.Microphone() as source:
            print("Say something!")
            audio = r.listen(source)
        try:
            message = r.recognize_google(audio)
            print("User: " + message)
        except sr.UnknownValueError:
            message = ""
            print("Google Speech Recognition could not understand audio")
        except sr.RequestError as e:
            message = ""
            print("Could not request results from Google Speech Recognition service; {0}".format(e))
        return message

#function to compute the importance of words, used to deduce which thing is being asked about most
Project: Natural-Language-Processing-Python-and-NLTK    Author: PacktPublishing
def preprocessing(text):
    text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]

    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]

    # lower capitalization
    tokens = [word.lower() for word in tokens]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text
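A hedged usage sketch for preprocessing() above, assuming the module-level imports the excerpt relies on (nltk, stopwords, WordNetLemmatizer) and passing a byte string because the function starts by calling .decode("utf8"):

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Resources used by the helper (newer NLTK releases may also want 'punkt_tab').
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

sample = b"The cats were sitting near the windows, watching the birds."
print(preprocessing(sample))
# -> something like "the cat sitting near window watching bird" (output varies by NLTK version)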
Project: ml-projects    Author: saopayne
def collection_stats():
    # list of documents
    documents_stat = reuters.fileids()
    print(str(len(documents_stat)) + " documents")

    train_docs_stat = list(filter(lambda doc: doc.startswith("train"), documents_stat))
    print(str(len(train_docs_stat)) + " total training documents")

    test_docs_stat = list(filter(lambda doc: doc.startswith("test"), documents_stat))
    print(str(len(test_docs_stat)) + " total test documents")

    # list of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    # get the documents in a category
    category_docs = reuters.fileids("acq")

    # words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print(document_words)

    # print the raw document
    print(reuters.raw(document_id))
Project: Price-Comparator    Author: Thejas-1
def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
            self._num = num
            self._window_size = window_size

            #print("Building collocations list")
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))
Project: facebook-message-analysis    Author: szheng17
def get_user_to_word_proportion(user_to_text, word):
    """
    Maps each user to the proportion of their words that consist of a specified
    word.
    """
    user_to_word_proportion = {}
    for user in user_to_text:
        lm = LanuageModel(user_to_text[user])
        n_tokens = len(lm.lowercase_tokens)
        if n_tokens > 0:
            fd = nltk.FreqDist(lm.lowercase_tokens)
            user_to_word_proportion[user] = fd[word] / float(n_tokens)
        else:
            user_to_word_proportion[user] = 0.0
        print 'Finished user {}'.format(user.encode('utf-8'))
    return user_to_word_proportion
Project: facebook-message-analysis    Author: szheng17
def generate(cfd, start_word, n):
        word = start_word
        words = []
        for i in range(n):
            words.append(word)
            # word = cfd[word].max()
            fd = cfd[word]
            n_next_words = sum(fd.values())
            if n_next_words > 0:
                probabilities = [fd[w]/float(n_next_words) for w in sorted(fd.keys())]
                word = choice(sorted(fd.keys()), p=probabilities)
            else:
                # Pick random word
                old_word = word
                # TODO: use unigram probabilities later
                word = choice(cfd.keys())
        words.append(word)
        sentence = ' '.join(words)
        # TODO: modify above for punctuation
        return sentence
Project: kaggle-review    Author: daxiongshu
def rm_stop_words(data, mode="nltk",silent=1):
    """
    Input:
        data is a list, dict ({}) or Counter
    """
    if silent==0:
        print("remove stop words ...")
    if mode == "nltk":
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words('english'))
    else:
        print("unknown mode",mode)
        assert 0

    if isinstance(data,list):   
        data = [i for i in data if i.lower() not in stop_words]
        return data
    else:
        for word in stop_words:
            if word in data:
                del data[word]
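A quick usage sketch for rm_stop_words() above, showing both supported input shapes: a list is filtered and returned, while a dict or Counter is pruned in place:

from collections import Counter

tokens = ["This", "is", "a", "simple", "Example"]
print(rm_stop_words(tokens))           # -> ['simple', 'Example']

counts = Counter({"the": 10, "model": 3, "and": 7})
rm_stop_words(counts)                  # prunes in place, returns None
print(counts)                          # -> Counter({'model': 3})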
Project: BiMPM_keras    Author: ijinmao
def words_to_char_sequence(words_list, tk):
    """Convert words list to chars sequence

    # Arguments
        words: word list, (sentence_len, word_len)

    # Output shape
        (sentence_len, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)
    """
    c_seqs = np.zeros((len(words_list),
                       TrainConfig.MAX_SEQUENCE_LENGTH,
                       TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
    for w_i in xrange(len(words_list)):
        words = words_list[w_i]
        fixed_ws = np.zeros((TrainConfig.MAX_SEQUENCE_LENGTH,
                             TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
        ws = tk.texts_to_sequences(words)
        ws = pad_sequences(ws, maxlen=TrainConfig.MAX_CHAR_PER_WORD)
        if TrainConfig.MAX_SEQUENCE_LENGTH < len(words):
            max_word_len = TrainConfig.MAX_SEQUENCE_LENGTH
        else:
            max_word_len = len(words)
        fixed_ws[:max_word_len, :] = ws[:max_word_len, :]
        c_seqs[w_i] = fixed_ws
    return c_seqs
Project: KATE    Author: hugochan
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ', \
            text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and not token in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception as e:
                    w = token
            else:
                w = token
            words.append(w)

    return words

    # return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
    #                     re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
    #                     not token.isdigit() and not token in stop_words]
Project: KATE    Author: hugochan
def build_vocab(word_freq, threshold=5, topn=None, start_idx=0):
    """
    threshold only takes effect when topn is None.
    words are indexed by overall frequency in the dataset.
    """
    word_freq = sorted(word_freq.iteritems(), key=lambda d:d[1], reverse=True)
    if topn:
        word_freq = zip(*word_freq[:topn])[0]
        vocab_dict = dict(zip(word_freq, range(start_idx, len(word_freq) + start_idx)))
    else:
        idx = start_idx
        vocab_dict = {}
        for word, freq in word_freq:
            if freq < threshold:
                return vocab_dict
            vocab_dict[word] = idx
            idx += 1
    return vocab_dict
Project: MatchZoo    Author: faneshion
def bigrams(words, join_string, skip=0):
        """
           Input: a list of words, e.g., ["I", "am", "Denny"]
           Output: a list of bigram, e.g., ["I_am", "am_Denny"]
        """
        assert type(words) == list
        L = len(words)
        if L > 1:
            lst = []
            for i in range(L - 1):
                for k in range(1, skip + 2):
                    if i + k < L:
                        lst.append(join_string.join([words[i], words[i + k]]))
        else:
            # set it as unigram
            lst = NgramUtil.unigrams(words)
        return lst
Project: MatchZoo    Author: faneshion
def trigrams(words, join_string, skip=0):
        """
           Input: a list of words, e.g., ["I", "am", "Denny"]
           Output: a list of trigram, e.g., ["I_am_Denny"]
        """
        assert type(words) == list
        L = len(words)
        if L > 2:
            lst = []
            for i in range(L - 2):
                for k1 in range(1, skip + 2):
                    for k2 in range(1, skip + 2):
                        if i + k1 < L and i + k1 + k2 < L:
                            lst.append(join_string.join([words[i], words[i + k1], words[i + k1 + k2]]))
        else:
            # set it as bigram
            lst = NgramUtil.bigrams(words, join_string, skip)
        return lst
Project: MatchZoo    Author: faneshion
def biterms(words, join_string):
        """
            Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
            Output: a list of biterm, e.g., ["I_am", "I_Denny", "I_boy", "am_Denny", "am_boy", "Denny_boy"]
        """
        assert type(words) == list
        L = len(words)
        if L > 1:
            lst = []
            for i in range(L - 1):
                for j in range(i + 1, L):
                    lst.append(join_string.join([words[i], words[j]]))
        else:
            # set it as uniterm
            lst = NgramUtil.uniterms(words)
        return lst
Project: MatchZoo    Author: faneshion
def triterms(words, join_string):
        """
            Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
            Output: a list of triterm, e.g., ["I_am_Denny", "I_am_boy", "I_Denny_boy", "am_Denny_boy"]
        """
        assert type(words) == list
        L = len(words)
        if L > 2:
            lst = []
            for i in xrange(L - 2):
                for j in xrange(i + 1, L - 1):
                    for k in xrange(j + 1, L):
                        lst.append(join_string.join([words[i], words[j], words[k]]))
        else:
            # set it as biterm
            lst = NgramUtil.biterms(words, join_string)
        return lst
Project: MatchZoo    Author: faneshion
def fourterms(words, join_string):
        """
            Input: a list of words, e.g., ["I", "am", "Denny", "boy", "ha"]
            Output: a list of fourterm, e.g., ["I_am_Denny_boy", "I_am_Denny_ha", "I_am_boy_ha", "I_Denny_boy_ha", "am_Denny_boy_ha"]
        """
        assert type(words) == list
        L = len(words)
        if L > 3:
            lst = []
            for i in xrange(L - 3):
                for j in xrange(i + 1, L - 2):
                    for k in xrange(j + 1, L - 1):
                        for l in xrange(k + 1, L):
                            lst.append(join_string.join([words[i], words[j], words[k], words[l]]))
        else:
            # set it as triterm
            lst = NgramUtil.triterms(words, join_string)
        return lst
Project: MatchZoo    Author: faneshion
def ngrams(words, ngram, join_string=" "):
        """
        wrapper for ngram
        """
        if ngram == 1:
            return NgramUtil.unigrams(words)
        elif ngram == 2:
            return NgramUtil.bigrams(words, join_string)
        elif ngram == 3:
            return NgramUtil.trigrams(words, join_string)
        elif ngram == 4:
            return NgramUtil.fourgrams(words, join_string)
        elif ngram == 12:
            unigram = NgramUtil.unigrams(words)
            bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
            return unigram + bigram
        elif ngram == 123:
            unigram = NgramUtil.unigrams(words)
            bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
            trigram = [x for x in NgramUtil.trigrams(words, join_string) if len(x.split(join_string)) == 3]
            return unigram + bigram + trigram
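Assuming these static methods live on an NgramUtil helper class (as the recursive calls suggest) and that unigrams(), which is not shown in the excerpt, simply returns the token list, a usage sketch of the ngrams() wrapper might look like this:

words = ["I", "am", "Denny"]

print(NgramUtil.ngrams(words, 1))        # -> ['I', 'am', 'Denny']
print(NgramUtil.ngrams(words, 2, "_"))   # -> ['I_am', 'am_Denny']
print(NgramUtil.ngrams(words, 3, "_"))   # -> ['I_am_Denny']
print(NgramUtil.ngrams(words, 12, "_"))  # unigrams followed by bigrams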
Project: MatchZoo    Author: faneshion
def bigrams(words, join_string, skip=0):
        """
           Input: a list of words, e.g., ["I", "am", "Denny"]
           Output: a list of bigram, e.g., ["I_am", "am_Denny"]
        """
        assert type(words) == list
        L = len(words)
        if L > 1:
            lst = []
            for i in range(L - 1):
                for k in range(1, skip + 2):
                    if i + k < L:
                        lst.append(join_string.join([words[i], words[i + k]]))
        else:
            # set it as unigram
            lst = NgramUtil.unigrams(words)
        return lst
Project: MatchZoo    Author: faneshion
def biterms(words, join_string):
        """
            Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
            Output: a list of biterm, e.g., ["I_am", "I_Denny", "I_boy", "am_Denny", "am_boy", "Denny_boy"]
        """
        assert type(words) == list
        L = len(words)
        if L > 1:
            lst = []
            for i in range(L - 1):
                for j in range(i + 1, L):
                    lst.append(join_string.join([words[i], words[j]]))
        else:
            # set it as uniterm
            lst = NgramUtil.uniterms(words)
        return lst
Project: MatchZoo    Author: faneshion
def triterms(words, join_string):
        """
            Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
            Output: a list of triterm, e.g., ["I_am_Denny", "I_am_boy", "I_Denny_boy", "am_Denny_boy"]
        """
        assert type(words) == list
        L = len(words)
        if L > 2:
            lst = []
            for i in xrange(L - 2):
                for j in xrange(i + 1, L - 1):
                    for k in xrange(j + 1, L):
                        lst.append(join_string.join([words[i], words[j], words[k]]))
        else:
            # set it as biterm
            lst = NgramUtil.biterms(words, join_string)
        return lst
Project: MatchZoo    Author: faneshion
def fourterms(words, join_string):
        """
            Input: a list of words, e.g., ["I", "am", "Denny", "boy", "ha"]
            Output: a list of fourterm, e.g., ["I_am_Denny_boy", "I_am_Denny_ha", "I_am_boy_ha", "I_Denny_boy_ha", "am_Denny_boy_ha"]
        """
        assert type(words) == list
        L = len(words)
        if L > 3:
            lst = []
            for i in xrange(L - 3):
                for j in xrange(i + 1, L - 2):
                    for k in xrange(j + 1, L - 1):
                        for l in xrange(k + 1, L):
                            lst.append(join_string.join([words[i], words[j], words[k], words[l]]))
        else:
            # set it as triterm
            lst = NgramUtil.triterms(words, join_string)
        return lst
Project: identifiera-sarkasm    Author: risnejunior
def build_vocabulary( words, max_size ):
    vocab_instances = 0
    unique_counts = Counter(words)
    d = dict(unique_counts.most_common(cfg.vocabulary_size-2) )
    vocabulary = OrderedDict( sorted(d.items(), key=lambda t: t[1],  reverse=True) )

    # start at 2 to leave room for padding & unknown
    pb = Progress_bar(len(d) - 1) 
    for i, (key, value) in enumerate(vocabulary.items(), start=2):      
        vocab_instances += value
        vocabulary[key] = i
        pb.tick()

    vocabulary[cfg.padding_char] = 0
    vocabulary[cfg.placeholder_char] = 1
    # reverse the vocabulary (for reverse lookup)
    rev_vocabulary = {v: k for k, v in vocabulary.items()}  
    vocab = (len(unique_counts), vocab_instances, vocabulary, rev_vocabulary)

    return vocab
Project: identifiera-sarkasm    Author: risnejunior
def tokenize_text( sample_text ):
    global sequence_lengths
    processed_text = []

    if cfg.remove_punctuation:
        cleaned = sample_text.lower().translate( t_table )
    else:
        cleaned = sample_text

    if cfg.use_casual_tokenizer:
        tokens = tknzr.tokenize( cleaned )
    else:
        tokens = nltk.word_tokenize( cleaned, language='english')

    if cfg.remove_stopwords:
        tokens = [w for w in tokens if not w in stopwords.words('english')]

    sequence_lengths.append( len( tokens ) )
    processed_text.extend( tokens )

    return processed_text
Project: Python-Scripts-Repo-on-Data-Science    Author: qalhata
def __init__(self, min_cut=0.1, max_cut=0.9):
        # indentation changes - we are inside the constructor
        # here we set up the behaviour
        # this is called each time an object of the frequency summarizer
        # class is created or instantiated
        self._min_cut = min_cut    # 'self' refers to the instance being created
        self._max_cut = max_cut
        # we save the values of the two parameters passed in by assigning them
        # to member variables - the 'self.' prefix identifies them as part
        # of the self argument - the leading underscore marks them as private
        self._stopwords = set(stopwords.words('english') + list(punctuation))
        # this is a list of all common words and punctuation symbols

    # indentation changes - we are out of the constructor here
    # This is still the body of the class
    # Defining a variable here (outside a member function) but within the class
    # makes it a class attribute. This means it belongs to the class, and not
    # to any specific individual instance (object) of the class
Project: Python-Scripts-Repo-on-Data-Science    Author: qalhata
def extractFeatures(self, article, n, customStopWords=None):
        # pass in article as a tuple ( text, title)
        text = article[0]
        # extract the text
        title = article[1]
        # extract the title
        sentences = sent_tokenize(text)
        # split text into sentences
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        # split sentences into words
        self._freq = self._compute_frequencies(word_sent, customStopWords)
        # calculate word freq using member func created above
        if n < 0:
            # how many features (words) to return - a negative number means
            # no feature (word) selection, just return all features
            return nlargest(len(self._freq.keys()),
                            self._freq, key=self._freq.get)
        else:
            # if the calling function has asked for a subset,
            # return only the 'n' largest features, i.e. the
            # most important words (important == frequent, excluding stopwords)
            return nlargest(n, self._freq, key=self._freq.get)
Project: stock-eagle    Author: mtusman
def similarity(c1, c2):
    '''stop words are words like "it" and "the" that have no significant
    impact on the meaning of the sentence'''
    stop_words = list(stopwords.words("english"))
    # Removes stop words in both sentences
    c1_cleaned = [x for x in word_tokenize(c1) if x not in stop_words]
    c2_cleaned = [x for x in word_tokenize(c2) if x not in stop_words]
    c1_words = Counter(dedupe(c1_cleaned))
    c2_words = Counter(dedupe(c2_cleaned))
    total_words = c1_words + c2_words
    similarity_between_words = 0
    for key, val in total_words.items():
        ''' Looks at whether the two articles share a word'''
        if total_words[key] > 1:
            similarity_between_words += 1

    return similarity_between_words / (log(len(c1_words)) + log(len(c2_words)))
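A usage sketch for similarity() above, assuming the module-level imports of the original file (stopwords, word_tokenize, Counter, log). The dedupe() helper is not part of the excerpt, so a hypothetical stand-in that drops duplicates while preserving order is defined here:

from collections import Counter
from math import log
from nltk import word_tokenize
from nltk.corpus import stopwords

def dedupe(items):
    # hypothetical stand-in for the project's dedupe() helper
    return list(dict.fromkeys(items))

a = "The central bank raised interest rates again this quarter."
b = "Interest rates were raised by the central bank on Tuesday."
print(similarity(a, b))   # larger values mean more shared vocabulary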
Project: Automatic-Question-Generation    Author: bwanglzu
def _answer_stop_word_density(self, row):
        """Percentage of tokens in the answer are stopwords
        - Args:
            row(pandas.dataframe): input row vector
        - Returns:
            row(pandas.dataframe): output vector with new feature
        """
        stop = stopwords.words('english')
        answer = row.Answer
        if answer:
            tokens = answer.split()
            num_tokens = len(tokens)
            stop_word_in_answer = [i for i in tokens if i in stop]
            num_stop_word_in_answer = len(stop_word_in_answer)
            row['ANSWER_STOPWORD_DENSITY'] = float(
                num_stop_word_in_answer) / num_tokens
            return row
        else:
            row['ANSWER_STOPWORD_DENSITY'] = 0
            return row
Project: Automatic-Question-Generation    Author: bwanglzu
def _answer_quantifier_density(self, row):
        """Percentage of tokens in the answer that are quantifier words
        - Args:
            row(pandas.dataframe): input pandas dataframe
        - Returns:
            row(pandas.dataframe): result a pandas dataframe with new feature
        """
        answer = row.Answer
        if answer:
            tokens = answer.split()
            answer_len = len(tokens)
            quantifier_tokens = [
                i for i in tokens if i in ling.QUANTIFIER_WORDS]
            quantifier_tokens_len = len(quantifier_tokens)
            row['ANSWER_QUANTIFIER_DENSITY'] = float(
                quantifier_tokens_len) / answer_len
            return row
        else:
            row['ANSWER_QUANTIFIER_DENSITY'] = 0
            return row
Project: Automatic-Question-Generation    Author: bwanglzu
def _percentage_capitalized_word_in_answer(self, row):
        """Percentage of capitalized words in the sentence that are in the answer
        - Args:
            row(pandas.dataframe): input pandas dataframe
        - Returns:
            row(pandas.dataframe): result a pandas dataframe with new feature
        """
        answer = row.Answer
        sentence = row.Sentence
        if answer is not None and sentence is not None:
            tokens = sentence.split()
            num_tokens = len(tokens)
            cap_tokens = [i for i in tokens if i.isupper() == True]
            cap_tokens_in_answer = [i for i in cap_tokens if i in answer]
            row['PERCENT_CAPITALIZED_WORDS_IN_ANSWER'] = float(
                len(cap_tokens_in_answer)) / num_tokens
            return row
        else:
            row['PERCENT_CAPITALIZED_WORDS_IN_ANSWER'] = 0
            return row
Project: MP-CNN-Variants    Author: tuzhucheng
def get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt):
    """
    Get overlap, idf weighted overlap, overlap excluding stopwords, and idf weighted overlap excluding stopwords.
    """
    stoplist = set(stopwords.words('english'))
    num_docs = len(sent_list_1)
    overlap_feats = []

    for s1, s2 in zip(sent_list_1, sent_list_2):
        tokens_a_set, tokens_b_set = set(s1), set(s2)
        intersect = tokens_a_set & tokens_b_set
        overlap = len(intersect) / (len(tokens_a_set) + len(tokens_b_set))
        idf_intersect = sum(np.math.log(num_docs / word_to_doc_cnt[w]) for w in intersect)
        idf_weighted_overlap = idf_intersect / (len(tokens_a_set) + len(tokens_b_set))

        tokens_a_set_no_stop = set(w for w in s1 if w not in stoplist)
        tokens_b_set_no_stop = set(w for w in s2 if w not in stoplist)
        intersect_no_stop = tokens_a_set_no_stop & tokens_b_set_no_stop
        overlap_no_stop = len(intersect_no_stop) / (len(tokens_a_set_no_stop) + len(tokens_b_set_no_stop))
        idf_intersect_no_stop = sum(np.math.log(num_docs / word_to_doc_cnt[w]) for w in intersect_no_stop)
        idf_weighted_overlap_no_stop = idf_intersect_no_stop / (len(tokens_a_set_no_stop) + len(tokens_b_set_no_stop))
        overlap_feats.append([overlap, idf_weighted_overlap, overlap_no_stop, idf_weighted_overlap_no_stop])

    return overlap_feats
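A small usage sketch for get_pairwise_overlap_features() above, assuming the original file's imports (numpy as np, stopwords). The word_to_doc_cnt mapping is made up for illustration; it is assumed to map each word to the number of documents that contain it:

sent_list_1 = [['the', 'cat', 'sat', 'on', 'the', 'mat'],
               ['a', 'dog', 'barked']]
sent_list_2 = [['the', 'cat', 'ran'],
               ['the', 'dog', 'slept']]
word_to_doc_cnt = {w: 1 for w in
                   {'the', 'cat', 'sat', 'on', 'mat', 'a',
                    'dog', 'barked', 'ran', 'slept'}}

feats = get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt)
# one [overlap, idf_overlap, overlap_no_stop, idf_overlap_no_stop] row per sentence pair
print(feats)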
Project: LDA-REST    Author: valentinarho
def get_similar_documents_for_query(model_id, text):
    """
    Return documents similar to the query, or an empty list if an error occurs or the query has no words after preprocessing
    :param model_id:
    :param text:
    :return:
    """
    model = db_utils.get_model(model_id)
    topics_assignment = assign_topics_for_query(model_id, text)

    if len(topics_assignment) != 0:
        topics_vector = transform_topics_assignment_from_lda_to_vector(model['number_of_topics'], topics_assignment[0])
        # print(topics_vector)
        return get_similar_documents_by_vector(model_id, topics_vector)
    else:
        return []
Project: UrbanSearch    Author: urbansearchTUD
def get_binary(self):
        return Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
            ('feat_select', SelectPercentile(percentile=10)),
            ('clf', OneVsRestClassifier(SGDClassifier(alpha=0.0001,
                                                      average=False,
                                                      class_weight=None,
                                                      epsilon=0.1,
                                                      eta0=0.0,
                                                      fit_intercept=True,
                                                      l1_ratio=0.15,
                                                      learning_rate='optimal',
                                                      loss='log',
                                                      n_iter=10,
                                                      n_jobs=1,
                                                      penalty='l2',
                                                      power_t=0.5,
                                                      random_state=None,
                                                      shuffle=True,
                                                      verbose=0,
                                                      warm_start=False
            )))
        ])
Project: UrbanSearch    Author: urbansearchTUD
def get_sgdc(self):
        return Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
            ('feat_select', SelectPercentile(percentile=10)),
            ('clf', SGDClassifier(alpha=0.0001,
                                  average=False,
                                  class_weight=None,
                                  epsilon=0.1,
                                  eta0=0.0,
                                  fit_intercept=True,
                                  l1_ratio=0.15,
                                  learning_rate='optimal',
                                  loss='log',
                                  n_iter=10,
                                  n_jobs=1,
                                  penalty='l2',
                                  power_t=0.5,
                                  random_state=None,
                                  shuffle=True,
                                  verbose=0,
                                  warm_start=False))
        ])
Project: QAServer    Author: fssqawj
def wash(fileList):
    # denyPos = ['CC', 'CD', 'DT', 'TO', '']
    st = LancasterStemmer()
    for f in tqdm(fileList):
        fr = open('./washFile/' + f, 'r')
        fw = open("./washFile_stem/" + f, 'w')
        for line in fr.read().splitlines():
            line = remove_punctuation(line).lower()
            # wordpos = pos(remove_punctuation(line).lower())
            # for turple in wordpos:
            #     if (turple[0] not in stopwords.words('english')):
            #         fw.write(turple[0] + ' ')
            # fw.write(x + ' ' for x in line.split() if x not in stopwords.words('english'))
            # stopw = stopwords.words('english')
            words = [x for x in line.split()]
            for x in words:
                try:
                    fw.write(st.stem(x) + ' ')
                except:
                    print x

        fr.close()
        fw.close()
Project: twitter_mongodb_helper    Author: IDEA-NTHU-Taiwan
def count_entries(file_list):
    """Performs a count of the number of number of words in the corpus
     Args:
        file_list  (list): list of file names.

    Returns:
        list: A list of json objects containing the count per file name
    """
    result = []
    for obj in file_list:
        with open(CSV_PATH + obj + '.csv', "r") as entry:
            reader = csv.reader(entry, delimiter=",")
            col_count = len(reader.next())
            res = {"Filename": obj, "Count": col_count}
            result.append(res)
    return result
Project: quora_duplicate    Author: ijinmao
def words_to_char_sequence(words_list, tk):
    """Convert words list to chars sequence

    # Arguments
        words: word list, (sentence_len, word_len)

    # Output shape
        (sentence_len, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)
    """
    c_seqs = np.zeros((len(words_list),
                       TrainConfig.MAX_SEQUENCE_LENGTH,
                       TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
    for w_i in xrange(len(words_list)):
        words = words_list[w_i]
        fixed_ws = np.zeros((TrainConfig.MAX_SEQUENCE_LENGTH,
                             TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
        ws = tk.texts_to_sequences(words)
        ws = pad_sequences(ws, maxlen=TrainConfig.MAX_CHAR_PER_WORD)
        if TrainConfig.MAX_SEQUENCE_LENGTH < len(words):
            max_word_len = TrainConfig.MAX_SEQUENCE_LENGTH
        else:
            max_word_len = len(words)
        fixed_ws[:max_word_len, :] = ws[:max_word_len, :]
        c_seqs[w_i] = fixed_ws
    return c_seqs
Project: Natural-Language-Processing-Python-and-NLTK    Author: PacktPublishing
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()
    high_info_words = set()

    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)

    return high_info_words
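A hedged usage sketch for high_information_words() above, adding the imports the excerpt relies on (collections, plus FreqDist, ConditionalFreqDist and BigramAssocMeasures from NLTK). The toy corpus is tiny, so min_score is lowered:

import collections
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.metrics import BigramAssocMeasures

labelled_words = [
    ('pos', ['great', 'great', 'fun', 'fun', 'good', 'plot']),
    ('neg', ['boring', 'boring', 'dull', 'dull', 'bad', 'plot']),
]

informative = high_information_words(labelled_words, min_score=1)
print(informative)   # the words whose chi-square score clears the threshold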
Project: TensorFlow-Machine-Learning-Cookbook    Author: PacktPublishing
def build_dictionary(sentences, vocabulary_size):
    # Turn sentences (list of strings) into lists of words
    split_sentences = [s.split() for s in sentences]
    words = [x for sublist in split_sentences for x in sublist]

    # Initialize list of [word, word_count] for each word, starting with unknown
    count = [['RARE', -1]]

    # Now add most frequent words, limited to the N-most frequent (N=vocabulary size)
    count.extend(collections.Counter(words).most_common(vocabulary_size-1))

    # Now create the dictionary
    word_dict = {}
    # For each word, that we want in the dictionary, add it, then make it
    # the value of the prior dictionary length
    for word, word_count in count:
        word_dict[word] = len(word_dict)

    return(word_dict)
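A quick usage sketch for build_dictionary() above; the only dependency is the standard-library collections module:

import collections

sentences = ["the cat sat on the mat",
             "the dog sat on the log"]

word_dict = build_dictionary(sentences, vocabulary_size=5)
print(word_dict)
# e.g. {'RARE': 0, 'the': 1, 'sat': 2, 'on': 3, 'cat': 4} (ties among equal counts may order differently)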


# Turn text data into lists of integers from dictionary
Project: tensorflow-deep-qa    Author: shuishen112
def load_text_vec(alphabet,filename="",embedding_size = 100):
    vectors = {}
    with open(filename) as f:
        i=0
        for line in f:
            i+=1
            if i % 100000 == 0:
                    print 'epoch %d' % i
            items = line.strip().split(' ')
            if len(items) == 2:
                vocab_size, embedding_size= items[0],items[1]
                print ( vocab_size, embedding_size)
            else:
                word = items[0]
                if word in alphabet:
                    vectors[word] = items[1:]
    print 'embedding_size',embedding_size
    print 'done'
    print 'words found in word2vec embedding ', len(vectors.keys())
    return vectors
Project: Personal_AI_Assistant    Author: PratylenClub
def add_list_of_words_in_w2v_model(self, unknown_words):
        huge_w2v_model_file = open(self.w2v_huge_model_path, "r")
        current_w2v_model_file = open(self.w2v_model_path, "a")
        line = huge_w2v_model_file.readline()
        unknown_words_left = len(unknown_words)
        while line and unknown_words_left:
            word = line.split()[0]
            if word in unknown_words:
                current_w2v_model_file.write(line)
                unknown_words = unknown_words - set([word])
                unknown_words_left -= 1
            line = huge_w2v_model_file.readline()
        for word in list(unknown_words):
            random_position = random(self.w2v_model.vector_size)*2-1
            current_w2v_model_file.write(" ".join(([word]+[str(x) for x in random_position])))
            print "warning random positions introduced for new words ... in the future this should be solved"
        current_w2v_model_file.close()
        huge_w2v_model_file.close()
Project: Personal_AI_Assistant    Author: PratylenClub
def add_list_of_words_in_w2v_model(self, unknown_words):
        huge_w2v_model_file = open(self.w2v_huge_model_path, "r")
        current_w2v_model_file = open(self.w2v_model_path, "a")
        line = huge_w2v_model_file.readline()
        unknown_words_left = len(unknown_words)
        while line and unknown_words_left:
            word = line.split()[0]
            if word in unknown_words:
                current_w2v_model_file.write(line)
                unknown_words = unknown_words - set([word])
                unknown_words_left -= 1
            line = huge_w2v_model_file.readline()
        for word in list(unknown_words):
            random_position = random(self.w2v_model.vector_size)*2-1
            current_w2v_model_file.write(" ".join(([word]+[str(x) for x in random_position])))
            print "warning random positions introduced for new words ... in the future this should be solved"
        current_w2v_model_file.close()
        huge_w2v_model_file.close()
Project: refer-parser2    Author: lichengunc
def extract_NPs(chunk):
    """
    Given chunk [(phrase, phrase_type)], e.g., [('the lady', 'NP'), ('with', 'PP'), ('the blue shirt', 'NP')],
    we extract the NPs with stop and location words filtered out, and return a list of noun phrases.
    """
    forbid_wds = stop_words + location_words
    NPs = []
    for phrase, ptype in chunk:
        if ptype == 'NP':
            filtered_wds = []
            for wd in phrase.split():
                if wd not in forbid_wds:
                    filtered_wds += [wd]
            if len(' '.join(filtered_wds)) > 0:
                NPs += [' '.join(filtered_wds)]
    return NPs
Project: refer-parser2    Author: lichengunc
def extract_NNs(chunk, pos):
    """
    Given chunk [(phrase, phrase_type)], e.g., [('the lady', 'NP'), ('with', 'PP'), ('the blue shirt', 'NP')],
    and pos [(word, pos)], e.g., [('man', 'NN')],
    we extract from the NPs, with stop, location, color and size words filtered out,
    and return a list of NN words only.
    """
    forbid_wds = stop_words + location_words + color_words + size_words
    NNs = []
    for phrase, ptype in chunk:
        if ptype == 'NP':
            filtered_wds = []
            for wd in phrase.split():
                wd_pos = [p[1] for p in pos if p[0] == wd][0]
                if wd not in forbid_wds and wd_pos != 'JJ' and wd_pos != 'CD':  # we don't need JJ or CD words either
                    filtered_wds += [wd]
            if len(' '.join(filtered_wds)) > 0:
                NNs += [' '.join(filtered_wds)]
    return NNs
Project: QProb    Author: quant-trade
def process_text(self, text):
        flags = (UNICODE if sys.version < '3' and type(text) is unicode
                 else 0)
        regexp = self.regexp if self.regexp is not None else r"\w[\w']+"

        words = findall(regexp, text, flags)
        # copy the word list (stopword removal is not applied in this variant)
        words = [word for word in words]
        # remove 's
        words = [word[:-2] if word.lower().endswith("'s") else word
                 for word in words]
        # remove numbers
        words = [word for word in words if not word.isdigit()]

        if self.collocations:
            word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
        else:
            word_counts, _ = process_tokens(words, self.normalize_plurals)

        return word_counts
Project: newsrecommender    Author: Newsrecommender
def tokenize(text):
        """
        Tokenizes sequences of text and stems the tokens.
        :param text: String to tokenize
        :return: List with stemmed tokens
        """
        tokens = nl.WhitespaceTokenizer().tokenize(text)
        tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
        tokens = [word for word in tokens if word not in stopwords.words('english')]
        tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
        stems = []
        stemmer = SnowballStemmer("english")
        for token in tokens:
            token = stemmer.stem(token)
            if token != "":
                stems.append(token)
        return stems
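A usage sketch for tokenize() above, assuming the imports the excerpt relies on (nltk imported as nl, re, stopwords, SnowballStemmer) and that the function is available as a plain callable:

import re
import nltk as nl
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

print(tokenize("Traders were buying shares of growing companies"))
# e.g. ['trader', 'buy', 'share', 'grow', 'compani'] - order varies because of the set() calls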
Project: word2vec_experiments_kaggle_popcorn    Author: bigsnarfdude
def review_to_wordlist( review, remove_stopwords=False ):
        # Function to convert a document to a sequence of words,
        # optionally removing stop words.  Returns a list of words.
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text()
        #
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]
        #
        # 5. Return a list of words
        return(words)

    # Define a function to split a review into parsed sentences
Project: word2vec_experiments_kaggle_popcorn    Author: bigsnarfdude
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
        # Function to split a review into parsed sentences. Returns a
        # list of sentences, where each sentence is a list of words
        #
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        raw_sentences = tokenizer.tokenize(review.strip())
        #
        # 2. Loop over each sentence
        sentences = []
        for raw_sentence in raw_sentences:
            # If a sentence is empty, skip it
            if len(raw_sentence) > 0:
                # Otherwise, call review_to_wordlist to get a list of words
                sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
                  remove_stopwords ))
        #
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists)
        return sentences
Project: Price-Comparator    Author: Thejas-1
def extract_unigram_feats(document, unigrams, handle_negation=False):
    """
    Populate a dictionary of unigram features, reflecting the presence/absence in
    the document of each of the tokens in `unigrams`.

    :param document: a list of words/tokens.
    :param unigrams: a list of words/tokens whose presence/absence has to be
        checked in `document`.
    :param handle_negation: if `handle_negation == True` apply `mark_negation`
        method to `document` before checking for unigram presence/absence.
    :return: a dictionary of unigram features {unigram : boolean}.

    >>> words = ['ice', 'police', 'riot']
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_unigram_feats(document, words).items())
    [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
    """
    features = {}
    if handle_negation:
        document = mark_negation(document)
    for word in unigrams:
        features['contains({0})'.format(word)] = word in set(document)
    return features
Project: Price-Comparator    Author: Thejas-1
def __init__(self,
                 w=20,
                 k=10,
                 similarity_method=BLOCK_COMPARISON,
                 stopwords=None,
                 smoothing_method=DEFAULT_SMOOTHING,
                 smoothing_width=2,
                 smoothing_rounds=1,
                 cutoff_policy=HC,
                 demo_mode=False):


        if stopwords is None:
            from nltk.corpus import stopwords
            stopwords = stopwords.words('english')
        self.__dict__.update(locals())
        del self.__dict__['self']
Project: Price-Comparator    Author: Thejas-1
def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  When window_size > 2, count non-contiguous bigrams, in the
        style of Church and Hanks's (1990) association ratio.
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError("Specify window_size at least 2")

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            wfd[w1] += 1
            for w2 in window[1:]:
                if w2 is not None:
                    bfd[(w1, w2)] += 1
        return cls(wfd, bfd, window_size=window_size)
Project: Price-Comparator    Author: Thejas-1
def from_words(cls, words, window_size=3):
        """Construct a TrigramCollocationFinder for all trigrams in the given
        sequence.
        """
        if window_size < 3:
            raise ValueError("Specify window_size at least 3")

        wfd = FreqDist()
        wildfd = FreqDist()
        bfd = FreqDist()
        tfd = FreqDist()
        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            for w2, w3 in _itertools.combinations(window[1:], 2):
                wfd[w1] += 1
                if w2 is None:
                    continue
                bfd[(w1, w2)] += 1
                if w3 is None:
                    continue
                wildfd[(w1, w3)] += 1
                tfd[(w1, w2, w3)] += 1
        return cls(wfd, bfd, wildfd, tfd)