Python nltk module: wordpunct_tokenize() example source code

We extracted the following 10 code examples from open-source Python projects to illustrate how to use nltk.wordpunct_tokenize().
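For quick reference before the project examples, here is a minimal usage sketch (not taken from any of the projects below): wordpunct_tokenize splits text on the regular expression \w+|[^\w\s]+, so runs of word characters and runs of punctuation become separate tokens.

from nltk import wordpunct_tokenize

tokens = wordpunct_tokenize("Let's tokenize this, shall we?")
# The apostrophe, comma and question mark become their own tokens:
# ['Let', "'", 's', 'tokenize', 'this', ',', 'shall', 'we', '?']
print(tokens)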

Project: ai-chatbot-framework    Author: alfredfrancis
def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                # if token in self.stopwords:
                #     continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma
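This tokenize generator is a method of a text-preprocessing class; the excerpt does not show the attributes it relies on (self.lower, self.strip, self.punct, self.stopwords) or the lemmatize helper. The sketch below is a hypothetical reconstruction of that context; the class name and constructor defaults are invented for illustration.

import string

from nltk import pos_tag, sent_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords as sw, wordnet as wn
from nltk.stem import WordNetLemmatizer

class NLTKPreprocessor(object):
    # Hypothetical container for the attributes used by tokenize() above.
    def __init__(self, lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        self.punct = set(string.punctuation)
        self.stopwords = set(sw.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def lemmatize(self, token, tag):
        # Map the Penn Treebank tag to a WordNet part of speech before lemmatizing.
        tag = {'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ}.get(tag[0], wn.NOUN)
        return self.lemmatizer.lemmatize(token, tag)

Pasting the tokenize method above into this class (indented as a method) would make the example runnable end to end.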
Project: TwitterHinglishTranslation    Author: anant14014
def parseTweetSet(tweets_data_path):
    tweets_text = []
    tweets_file = open(tweets_data_path, "r")
    english_stopwords_set = set(stopwords.words('english'))
    for line in tweets_file:
        tweet = json.loads(line)
        text = tweet['text']
        tokens = wordpunct_tokenize(text)
        words = [word.lower() for word in tokens]
        words_set = set(words)
        common_elements = words_set.intersection(english_stopwords_set)
        if (len(common_elements)>2):
            tweets_text.append(tweet['text'])

    tweets_text_set = set(tweets_text)
    #print len(tweets_text)
    #print len(tweets_text_set)
    #print tweets_text_set
    return list(tweets_text_set)
Project: minke    Author: DistrictDataLabs
def tokenize(self, text):
        """
        Performs tokenization in addition to normalization.
        """
        return self.normalize(nltk.wordpunct_tokenize(text))
Project: atap    Author: foxbook
def parse(sent):
    parser = nltk.ChartParser(grammar)
    tokens = nltk.wordpunct_tokenize(sent)
    return parser.parse(tokens)
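The excerpt relies on a module-level grammar that is not shown. Assuming it is an nltk.CFG defined in the same module, a toy definition and call might look like this (the grammar and sentence are illustrative only, not taken from the atap project):

import nltk

# Hypothetical toy grammar; parse() above passes it to nltk.ChartParser.
grammar = nltk.CFG.fromstring("""
    S -> NP VP
    NP -> DT NN
    VP -> VBZ NP
    DT -> 'the'
    NN -> 'dog' | 'ball'
    VBZ -> 'sees'
""")

# parse() returns a generator of parse trees.
for tree in parse("the dog sees the ball"):
    print(tree)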
Project: twitter-gen-classifier-pt    Author: mvicente93
def tokenize(string, lower=True):
    if lower:
        return nltk.wordpunct_tokenize(string.lower().strip())
    else:
        return nltk.wordpunct_tokenize(string.strip())
Project: twitter-gen-classifier-pt    Author: mvicente93
def tokenize_and_normalize(string, lower=True):
    if lower:
        return nltk.wordpunct_tokenize(normalize(string).lower().strip())
    else:
        return nltk.wordpunct_tokenize(normalize(string).strip())
Project: tRECS    Author: TeeOhh
def nonenglish(string):
    '''Description: This function takes in the string of descriptions and returns the string with non-English words removed (useful for course syllabi).
       Parameters: String of descriptions
       Output: the string with non-English words removed'''
    words = set(nltk.corpus.words.words())
    result = [w for w in nltk.wordpunct_tokenize(string) if w.lower() in words]
    return " ".join(result)
Project: fake_news    Author: bmassman
def calculate_languages_ratios(text):
    """
    Compute per language included in nltk number of unique stopwords appearing
    in analyzed text.
    """
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = {word.lower() for word in tokens}
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words & stopwords_set
        languages_ratios[language] = len(common_elements)
    return languages_ratios
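A typical follow-up step (not shown in this excerpt) is to treat the language with the largest stopword overlap as the detected language. A short hypothetical usage, including the imports the function above also relies on:

from nltk import wordpunct_tokenize   # used by calculate_languages_ratios above
from nltk.corpus import stopwords

ratios = calculate_languages_ratios("Ceci est un petit texte écrit en français.")
most_likely_language = max(ratios, key=ratios.get)
print(most_likely_language)  # expected: 'french' (requires the NLTK stopwords corpus)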
Project: TwitterHinglishTranslation    Author: anant14014
def translateHinglishTweets(tweets_text):
    counter = 0
    tweets_text_translated = []
    n = len(tweets_text)

    open_file = open("dictionary.pickle", "rb")
    dictionary = pickle.load(open_file)
    open_file.close()

    english_stopwords_set = set(stopwords.words('english'))

    for i in range(n):
        text = tweets_text[i]
        translated_text = ""
        tokens = wordpunct_tokenize(text)
        words = [word.lower() for word in tokens]
        for word in words:
            if word in english_stopwords_set:
                translated_text = translated_text + " " + word
            elif (word in dictionary):
                #print word + "-" + dictionary[word]
                translated_text = translated_text + " " + dictionary[word]
                counter = counter + 1
            else:
                translated_text = translated_text + " " + word
        tweets_text_translated.append(translated_text)

    #print counter
    return tweets_text_translated
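translateHinglishTweets reads a pickled dict from dictionary.pickle that appears to map lowercase Hinglish tokens to English replacements (inferred from how dictionary[word] is used; the project's real dictionary is not shown here). A hypothetical way to prepare such a file and call the function:

import pickle

from nltk import wordpunct_tokenize   # used by translateHinglishTweets above
from nltk.corpus import stopwords

# Toy dictionary for illustration only.
with open("dictionary.pickle", "wb") as f:
    pickle.dump({"accha": "good", "nahi": "not"}, f)

print(translateHinglishTweets(["this movie is accha, nahi boring"]))
# -> [' this movie is good , not boring']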
Project: teem-tag    Author: P2Pvalue
def __call__(self, text):
        '''
        @param text: the string of text to be tagged

        @returns: a list of tags respecting the order in the text
        '''


        sentences = nltk.sent_tokenize(text)
        punctuation = set(string.punctuation)
        proper_noun = lambda x: x == 'NNP'  # NNP is the Penn Treebank tag for proper nouns

        tags = []

        #Giving importance to first sentence words.
        if len(sentences) > 0:
            #stripping away punctuation
            words = nltk.pos_tag([word.lower() for word in nltk.wordpunct_tokenize(sentences[0]) if word not in punctuation])

            if len(words) > 1:
                tags.append(Tag(str(words[0][0])))
                for word, tag in words[1:-1]:
                    tags.append(Tag(str(word), proper=proper_noun(tag)))
                tags.append(Tag(str(words[-1][0]),
                                proper=proper_noun(str(words[-1][1])),
                                terminal=True))
            elif len(words) == 1:
                tags.append(Tag(str(words[0][0]), terminal=True))

        #Rest of the sentences
        for sent in sentences[1:]:
            words = nltk.pos_tag([word.lower() for word in nltk.wordpunct_tokenize(sent) if word not in punctuation])
            if len(words) > 1:
                for word,tag in words[:-1]:
                    tags.append(Tag(str(word), proper=proper_noun(tag)))
            if len(words) > 0:
                tags.append(Tag(str(words[-1][0]),
                                proper=proper_noun(str(words[-1][1])),
                                terminal=True))
        return tags
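The __call__ method above builds Tag objects that are not part of the excerpt. A minimal, hypothetical stand-in (the real teem-tag Tag class carries additional fields) would be:

# Hypothetical stand-in for the Tag class used above.
class Tag(object):
    def __init__(self, string, proper=False, terminal=False):
        self.string = string
        self.proper = proper
        self.terminal = terminal

    def __repr__(self):
        return 'Tag(%r, proper=%r, terminal=%r)' % (self.string, self.proper, self.terminal)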