Python nltk.stem.porter module: PorterStemmer() usage examples

We have extracted the following 38 code examples from open-source Python projects to illustrate how nltk.stem.porter.PorterStemmer() is used in practice.
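Before the project examples, here is a minimal, self-contained sketch of the basic API: construct a PorterStemmer and call stem() on individual words (the sample words are illustrative).

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
# stem() reduces a single word to its Porter stem
print(stemmer.stem('running'))     # -> 'run'
print(stemmer.stem('connection'))  # -> 'connect'
print([stemmer.stem(w) for w in 'cats like running and jumping'.split()])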

Project: KATE    Author: hugochan
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ', \
            text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and not token in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception as e:
                    w = token
            else:
                w = token
            words.append(w)

    return words

    # return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
    #                     re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
    #                     not token.isdigit() and not token in stop_words]
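A hypothetical call to the function above. The imports listed are the ones the snippet assumes, and since the function calls text.decode(...), it expects bytes input; the sample text and stop words are made up for the example.

import re
import string
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.snowball import EnglishStemmer

print(tiny_tokenize(b"the cats were running, quickly!", stem=True,
                    stop_words=["the", "were"]))
# -> ['cat', 'run', 'quick']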
Project: FreeDiscovery    Author: FreeDiscovery
def select_top_words(word_list, n=10):
    """ Filter out cluster term names"""
    import re
    from nltk.stem.porter import PorterStemmer
    from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
    st = PorterStemmer()
    out_st = []
    out = []
    for word in word_list:
        word_st = st.stem(word)
        if len(word_st) <= 2 or\
                re.match(r'\d+', word_st) or \
                re.match(r'[^a-zA-Z0-9]', word_st) or\
                word in COMMON_FIRST_NAMES or \
                word in CUSTOM_STOP_WORDS or\
                word in ENGLISH_STOP_WORDS or \
                word_st in out_st:  # ignore stemming duplicate
            continue
        out_st.append(word_st)
        out.append(word)
        if len(out) >= n:
            break
    return out
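A hypothetical usage sketch. COMMON_FIRST_NAMES and CUSTOM_STOP_WORDS are module-level collections in FreeDiscovery, stubbed here purely for illustration; note also that newer scikit-learn versions expose ENGLISH_STOP_WORDS from sklearn.feature_extraction.text rather than sklearn.feature_extraction.stop_words.

COMMON_FIRST_NAMES = {'john', 'mary'}   # stand-in values, not the real list
CUSTOM_STOP_WORDS = {'http', 'www'}     # stand-in values, not the real list

# 'runs' and 'networks' are dropped as stemming duplicates,
# 'the' as a stop word and '42' as a number
print(select_top_words(['running', 'runs', 'network', 'networks', 'the', '42'], n=3))
# -> ['running', 'network']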
Project: political-ad-classifier    Author: BoudhayanBanerjee
def porter(inputpath=None, text=None):
    """
    docstring
    """
    data = ''
    p = PorterStemmer()
    if inputpath:
        filenames = [os.path.join(inputpath, file) for file in os.listdir(inputpath)]
        pstemmed_list = []
        for file in filenames:
            with open(file, 'r') as f:
                data = f.read()
                if data:
                    texts = data.split(',')
                    stemmedfile = []
                    for text in texts:
                        pstemmed = p.stem(text)
                        stemmedfile.append(pstemmed)
            pstemmed_list.extend(stemmedfile)
        return pstemmed_list
    if text:
        pstemmed = p.stem(text)
        return pstemmed
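A hypothetical call using the text= path (the inputpath variant instead reads and stems comma-separated entries from every file in a directory); the imports the snippet assumes are os and PorterStemmer.

import os
from nltk.stem.porter import PorterStemmer

print(porter(text="connection"))   # -> 'connect'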
Project: patentdata    Author: benhoyle
def stem_split(tokens):
    """ Takes a list of tokens and splits stemmed tokens into
    stem, ending - inserting ending as extra token.

    returns: revised (possibly longer) list of tokens. """
    stemmer = PorterStemmer()
    token_list = list()
    for token in tokens:
        stem = stemmer.stem(token)
        split_list = token.split(stem)
        if token == stem:
            token_list.append(token)
        elif len(split_list) > 1:
            token_list.append(stem)
            token_list.append(split_list[1])
        else:
            token_list.append(split_list[0])
    return token_list
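An illustrative call, assuming PorterStemmer has been imported as in the other patentdata snippets.

from nltk.stem.porter import PorterStemmer

# 'running' -> 'run' + 'ning', 'jumps' -> 'jump' + 's', 'cat' is already a stem
print(stem_split(['running', 'jumps', 'cat']))
# -> ['run', 'ning', 'jump', 's', 'cat']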
Project: kaggle-review    Author: daxiongshu
def stem(words,stem_dic,mode="nltk",silent=1):
    if silent==0:
        print("stem ...")
    if mode == "nltk":
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()
    else:
        print("unknown mode",mode)
        assert 0
    for word in set(words):
        if word not in stem_dic:
            stem_dic[word] = stemmer.stem(word)
    words = [stem_dic[word] for word in words]
    return words
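A hypothetical usage; stem_dic acts as a memoisation cache that can be shared across calls.

stem_dic = {}
print(stem(['cats', 'running', 'cats'], stem_dic))
# -> ['cat', 'run', 'cat']; stem_dic now caches {'cats': 'cat', 'running': 'run'}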
Project: KATE    Author: hugochan
def tiny_tokenize_xml(text, stem=False, stop_words=[]):
    return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
                        re.sub('[%s]' % re.escape(string.punctuation), ' ', text.encode(encoding='ascii', errors='ignore'))) if
                        not token.isdigit() and not token in stop_words]
Project: chatbot_ner    Author: hellohaptik
def __porter_stemmer(self):
        """Initializes PorterStemmer

        Returns:
            Initializes PorterStemmer
        """
        self.stemmer = PorterStemmer()
Project: ml-projects    Author: saopayne
def tokenize(text):

    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))
    words = [word for word in words if word not in cachedStopWords]
    tokens = (list(map(lambda token: PorterStemmer().stem(token), words)))
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = list(filter(lambda token: p.match(token) and len(token) >= min_length, tokens))
    return filtered_tokens
Project: Search-Engine    Author: SoufianEly
def __init__(self, full_word):
        self.full_word = full_word
        # TODO: Lemmatization requires downloads
        # wnl = WordNetLemmatizer()
        # lemmas = [wnl.lemmatize(token) for token in tokens]
        self.stem = PorterStemmer().stem(full_word).lower()
Project: AcronymExpansion    Author: adityathakker
def get_list():
    stop_words = set(stopwords.words('english'))

    filename = 'data/new_acronyms.json'
    f = open(filename, 'r')
    data = json.load(f)
    paragraph_list = []
    full_form_list = []
    for k,v in data.items():
        if k=="WDM":
            for poss in v['possibilities']:
                paragraph_list.append(poss['summary'])
                full_form_list.append(poss['full_form'])
    s="two devices can also function as an add/drop multiplexer (ADM), i.e. simultaneously adding light beams while dropping other light beams and rerouting them to other destinations and devices. Formerly, such filtering of light beams was done with etalons, devices called Fabry–Pérot interferometers using thin-film-coated optical glass. The first WDM technology was conceptualized in the early 1970s and realized in the laboratory in the late 1970s; but these only combined two signals, and many years later were still very expensive.As of 2011, WDM systems can handle 160 signals, which will expand a 10 Gbit/second system with a single fiber optic pair of conductors to more than 1.6 Tbit/second (i.e. 1,600 Gbit/s).Typical WDM systems use single-mode optical fiber (SMF); this is optical fiber for only a single ray of light and having a core diameter of 9 millionths of a meter (9 µm). Other systems with multi-mode fiber cables (MM Fiber; also called premises cables) have core diameters of about 50 µm. Standardization and extensive research have brought down system costs significantly."
    paragraph_list.append(s)
    full_form_list.append("Wavelength context")
    texts = []
    taggeddoc = []
    p_stemmer = PorterStemmer()
    tokeniser = RegexpTokenizer(r'\w+')

    for index, para in enumerate(paragraph_list):
        raw = para.lower()

        tokens = tokeniser.tokenize(raw)
        stopped_tokens = [t for t in tokens if not t in stop_words]

        number_tokens = [x for x in stopped_tokens if x.isalpha()]  # isalpha must be called to filter
        stemmed_tokens = [p_stemmer.stem(i) for i in number_tokens]

        length_tokens = [i for i in stemmed_tokens if len(i) > 1]
        texts.append(length_tokens)
        td = TaggedDocument(' '.join(stemmed_tokens).split(), [full_form_list[index]])

        taggeddoc.append(td)

    return taggeddoc
Project: hugo_similar_posts    Author: elbaulp
def tokenizer_porter(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in text.split() if word not in stop]

# Switch to this stemmer, which has Spanish support
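A minimal sketch of the Spanish-capable alternative mentioned in the comment above, assuming the same module-level stop list used by tokenizer_porter.

from nltk.stem.snowball import SnowballStemmer

snowball = SnowballStemmer('spanish')

def tokenizer_snowball(text):
    # same structure as tokenizer_porter, but able to stem Spanish word forms
    return [snowball.stem(word) for word in text.split() if word not in stop]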
Project: Price-Comparator    Author: Thejas-1
def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
Project: twitter_trolls    Author: merqurio
def tweet_stemming(tweet, token_freqs):

    """
    Stems tweet words and counts diversity

    :param tweet: the tweet to analyze
    :type tweet: str or unicode

    :param token_freqs: counter of words frequency
    :type token_freqs: Counter

    :returns: words added to token_freqs
    :rtype: int
    """

    pattern_url = r'((https?:\/\/)|www\.)([\da-z\.-]+)\.([\/\w \.-]*)( |$)'
    regex_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    porter = PorterStemmer()

    counter_tokens = 0
    tweet_url_removed = re.sub(pattern_url, '', tweet, flags=re.MULTILINE)  # remove URL
    tweet_url_removed_tokenized = word_tokenize(tweet_url_removed)  # tokenize tweet
    tweet_url_removed_tokenized_cleaned_stemming = []  # cleaned of URLs and hashs, and stemming

    for token in tweet_url_removed_tokenized:
        new_token = regex_punctuation.sub(u'', token)  # remove punctuation and hash
        if not new_token == u'':
            new_token_stemming = porter.stem(new_token)
            tweet_url_removed_tokenized_cleaned_stemming.append(new_token_stemming)
            token_freqs[new_token_stemming] += 1
            counter_tokens += 1

    return counter_tokens
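An illustrative call, assuming the snippet's own imports (re, string, PorterStemmer, word_tokenize; word_tokenize needs the NLTK 'punkt' data) plus a Counter for the frequencies; the tweet text is made up.

from collections import Counter

token_freqs = Counter()
added = tweet_stemming("Loving the new release! https://example.com #nltk", token_freqs)
print(added, token_freqs.most_common(3))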
Project: asx-announce-analysis    Author: desiguel
def stem_list(word_list):
    """
    Return a list of stemmed tokens.
    :param word_list: word list to be stemmed.
    :return: list
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in word_list]
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
Project: Searchable-Symmetric-Encryption    Author: IanVanHoudt
def __init__(self):

        # TODO: placeholder for password. Will eventually take
        # as an arg of some sort
        self.password = b"password"

        # TODO: need to sort out use of salt. Previously, salt was
        # randomly generated in initKeys, but the resulting passwords
        # k & kPrime were different on each execution, and decryption
        # was impossible. Hardcoding the salt makes decryption possible
        # but may be a bad shortcut.
        self.iv = None
        self.salt = "$2b$12$ddTuco8zWXF2.kTqtOZa9O"

        # Two keys, generated/Initialized by KDF
        (self.k, self.kPrime) = self.initKeys()

        # Two K's: generated/initialized by PRF
        self.k1 = None
        self.k2 = None

        # client's cipher (AES w/ CBC)
        self.cipher = self.initCipher()

        # Stemming tool (cuts words to their roots/stems)
        self.stemmer = PorterStemmer()
Project: patentdata    Author: benhoyle
def stem(tokens):
    """ Stem passed text tokens. """
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]
Project: neighborhood_mood_aws    Author: jarrellmark
def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
Project: Trendster    Author: rawanhassunah
def __init__(self):
        self.ps = PorterStemmer()
Project: Trendster    Author: rawanhassunah
def __init__(self):
        self.ps = PorterStemmer()
Project: RottenCrawler    Author: kevin940726
def getAllReviews(movieList):
    reviews = np.array(map(lambda x: x["reviews"], movieList))
    reviews = np.concatenate(reviews)

    tokenizeReview = []

    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = map(lambda x: PorterStemmer().stem(x), s)
        s = filter(lambda x: x not in stopwords.words('english'), s)
        tokenizeReview.append((s, 'pos' if review["score"] >= 30 else 'neg'))

    return tokenizeReview
Project: RottenCrawler    Author: kevin940726
def getAllCritics(movieList):
    reviews = np.array(map(lambda x: x["critics"], movieList))
    reviews = np.concatenate(reviews)

    tokenizeReview = []

    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = map(lambda x: PorterStemmer().stem(x), s)
        s = filter(lambda x: x not in stopwords.words('english'), s)
        tokenizeReview.append((s, 'pos' if review["tomatometer"] == "fresh" else 'neg'))

    return tokenizeReview
Project: multilabel-classification    Author: jordicolomer
def tokenize(text):
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))
    words = [word for word in words if word not in cachedStopWords]
    tokens = (list(map(lambda token: PorterStemmer().stem(token), words)))
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = list(filter(lambda token: p.match(token) and
                                  len(token) >= min_length, tokens))
    return filtered_tokens
Project: hate-to-hugs    Author: sdoran35
def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
Project: python-machine-learning-book    Author: jeremyn
def tokenizer_porter(text):
    return [PorterStemmer().stem(word) for word in text.split()]
Project: graph-based-semi-supervised-learning    Author: deerishi
def __init__(self): 
        self.stemmer = PorterStemmer()
Project: graph-based-semi-supervised-learning    Author: deerishi
def __init__(self): 
        self.stemmer = PorterStemmer()
Project: FancyWord    Author: EastonLee
def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
Project: beepboop    Author: nicolehe
def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
Project: b4msa    Author: INGEOTEC
def __init__(self, lang="spanish"):
        """
        Initializes the parameters for specific language
        """
        self.languages = ["spanish", "english", "italian", "german"]
        self.lang = lang

        if self.lang not in self.languages:
            raise LangDependencyError("Language not supported: " + lang)

        self.stopwords = LangDependency.STOPWORDS_CACHE.get(lang, None)
        if self.stopwords is None:
            self.stopwords = self.load_stopwords(os.path.join(PATH, "{0}.stopwords".format(lang)))
            LangDependency.STOPWORDS_CACHE[lang] = self.stopwords

        self.neg_stopwords = LangDependency.NEG_STOPWORDS_CACHE.get(lang, None)
        if self.neg_stopwords is None:
            self.neg_stopwords = self.load_stopwords(os.path.join(PATH, "{0}.neg.stopwords".format(lang)))
            LangDependency.NEG_STOPWORDS_CACHE[lang] = self.neg_stopwords

        if self.lang not in SnowballStemmer.languages:
            raise LangDependencyError("Language not supported for stemming: " + lang)
        if self.lang == "english":
            self.stemmer = PorterStemmer()
        else:
            self.stemmer = SnowballStemmer(self.lang)
Project: kind2anki    Author: prz3m
def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
Project: but_sentiment    Author: MixedEmotions
def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
Project: TextClassification    Author: AlgorTroy
def bag_of_words(list_of_strings, remove_puncs=True, remove_digits=True, remove_alnums=True):

    porter = PorterStemmer()
    lmtz = WordNetLemmatizer()

    # empty bag of words
    bag_of_words = []

    # Iterate for string
    for string in tqdm(list_of_strings):
        string_tokens = custom_tokenizer(string, remove_puncs=remove_puncs, get_unique=True)

        bag_of_words.extend(string_tokens)

    if remove_alnums:
        bag_of_words = [bag for bag in bag_of_words if bag.isalpha()]
    elif remove_digits:
        bag_of_words = [bag for bag in bag_of_words if (not isNumber(bag))]

    bag_of_words.sort()

    # Stem and Lemmatize the data
    bag_of_words_stemmed = []

    for word in bag_of_words:
        try:
            bag_of_words_stemmed.append(porter.stem(lmtz.lemmatize(word)))
        except:
            bag_of_words_stemmed.append(word)

    bag_of_words = list(bag_of_words_stemmed)

    # Remove stop words
    stop = set(stopwords.words('english'))
    print('Removing Stop words...')
    bag_of_words = [bag.strip().lower() for bag in bag_of_words if (bag.strip().lower() not in stop)]

    bow_counter = Counter(bag_of_words)
    bow_counter = OrderedDict(sorted(bow_counter.items()))

    return bow_counter
Project: moviegeek    Author: practical-recommender-systems
def build_lda_model(self, data, docs, n_topics=5):

        texts = []
        tokenizer = RegexpTokenizer(r'\w+')
        for d in data:
            raw = d.lower()

            tokens = tokenizer.tokenize(raw)

            stopped_tokens = self.remove_stopwords(tokens)

            stemmed_tokens = stopped_tokens
            #stemmer = PorterStemmer()
            #stemmed_tokens = [stemmer.stem(token) for token in stopped_tokens]

            texts.append(stemmed_tokens)

        dictionary = corpora.Dictionary(texts)

        corpus = [dictionary.doc2bow(text) for text in texts]

        lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                                 num_topics=n_topics)

        index = similarities.MatrixSimilarity(corpus)

        self.save_lda_model(lda_model, corpus, dictionary, index)
        self.save_similarities(index, docs)

        return dictionary, texts, lda_model
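A self-contained sketch of the same gensim pipeline on toy documents (the document texts and topic count are illustrative).

from gensim import corpora, models, similarities
from nltk.tokenize import RegexpTokenizer

docs = ["the cat sat on the mat", "dogs and cats are common pets",
        "stock markets fell sharply today"]
tokenizer = RegexpTokenizer(r'\w+')
texts = [tokenizer.tokenize(d.lower()) for d in docs]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity(corpus)

print(lda_model.print_topics(num_words=3))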
Project: feature_engineering    Author: webeng
def extract_bigrams(self, text):

        text = self.remove_return_lines_and_quotes(text)
        bigrams = []

        st = PorterStemmer()

        more_stop_words = [
            '(', ')', "'s", ',', ':', '<', '>', '.', '-', '&', '*', '...']
        stop = stopwords.words('english') + more_stop_words

        tokens = st.stem(text)
        tokens = nltk.word_tokenize(tokens.lower())
        tokens = [i for i in tokens if i not in stop]
        tokens = [word for word in tokens if len(word) > 2]

        bigram_measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(tokens)
        finder.apply_freq_filter(2)
        top_bigrams = finder.nbest(bigram_measures.pmi, 1000)

        for bg in top_bigrams:
            bg = " ".join(bg)
            tag = nltk.pos_tag([bg])[0]

            if tag[1] not in ['VBG', 'RB', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ', 'PRP', 'IN', 'DT', 'CC', 'PRP$']:
                bigrams.append(tag[0])

        return bigrams
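A minimal, self-contained collocation sketch using the same NLTK pieces (the token list is a toy example).

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

tokens = "machine learning makes machine learning models learn".split()
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_freq_filter(2)            # keep bigrams seen at least twice
print(finder.nbest(BigramAssocMeasures().pmi, 5))
# -> [('machine', 'learning')]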
Project: wikipedia_classifier    Author: LouisFoucard
def stem_tokens(tokens, stemmer = PorterStemmer()):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
Project: itunes    Author: kaminem64
def k_tokenizer(text):
    text = text.encode('ascii',errors='ignore').replace('-', '')
    """ We should use a better way to remove non-english words """

    tokenizer = TweetTokenizer(preserve_case=False)
    tokens = tokenizer.tokenize(text)

    # stopset = set(stopwords.words('english'))
    # tokens = [word for word in tokens if not word in stopset]

    """ Synonyms using wordnet """

    mwe_tokenizer = MWETokenizer([('ios', '9'),])
    mwe_tokens = mwe_tokenizer.tokenize(tokens)

    """ We might want to tokenize by sentence and then tag each sentence and aggregate the results """

    """ train -> train_NN train_V"""
    tagged = nltk.pos_tag(mwe_tokens)

    def get_wordnet_pos(treebank_tag):

        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN # we preserve the original form of any unknown word

    wordnet_lemmatizer = WordNetLemmatizer()
    final_doc=[]
    for token, tag in tagged:
        word = tag + '_' + wordnet_lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        final_doc.append(word)

    # porter = PorterStemmer()
    # final_doc=[]
    # for token in mwe_tokens:
    #     final_doc.append(porter.stem(token))

    return final_doc
Project: TextClassification    Author: AlgorTroy
def get_encoded_vector(list_of_words, new_string):

    porter = PorterStemmer()
    lmtz = WordNetLemmatizer()

    if 'START_SEQ' not in list_of_words:
        list_of_words.append('START_SEQ')

    if 'UNKNOWN_WORDS' not in list_of_words:
        list_of_words.append('UNKNOWN_WORDS')

    if 'END_SEQ' not in list_of_words:
        list_of_words.append('END_SEQ')

    tokens = text_to_word_sequence(new_string, lower=True, split=" ")

    # Stem and Lemmatize the data
    token_stemmed = []

    for token in tokens:
        try:
            token_stemmed.append(porter.stem(lmtz.lemmatize(token)))
        except:
            token_stemmed.append(token)

    tokens = list(token_stemmed)

    out = []

    all_unknown_words = True

    for token in tokens:
        if token in list_of_words:
            all_unknown_words = False
            out.append(list_of_words.index(token))
        else:
            out.append(list_of_words.index('UNKNOWN_WORDS'))
    if all_unknown_words:
        print('Sentence not recognised:', new_string)

    out = [list_of_words.index('START_SEQ')] + out + [list_of_words.index('END_SEQ')]
    return out
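A hypothetical call. text_to_word_sequence is assumed here to be Keras's keras.preprocessing.text.text_to_word_sequence, the vocabulary is made up, and the lemmatizer needs the NLTK 'wordnet' data; note that the function appends its special tokens to list_of_words in place.

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import text_to_word_sequence  # assumed source of text_to_word_sequence

vocab = ['cat', 'dog', 'run']
encoded = get_encoded_vector(vocab, "the dog was running")
print(encoded)   # indices into vocab, wrapped in START_SEQ / END_SEQ markers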