Python nltk module: word_tokenize() example source code

We collected the following code examples from open-source Python projects to illustrate how to use nltk.word_tokenize().
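Before the project examples, here is a minimal stand-alone sketch of the call itself (it assumes the Punkt tokenizer models have already been fetched with nltk.download('punkt')):

import nltk

# nltk.download('punkt')  # one-time download of the Punkt sentence/word tokenizer models

text = "Bob dropped the apple. Where is the apple?"
for sent in nltk.sent_tokenize(text):   # split the text into sentences
    print(nltk.word_tokenize(sent))     # split each sentence into word and punctuation tokens
# ['Bob', 'dropped', 'the', 'apple', '.']
# ['Where', 'is', 'the', 'apple', '?']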

Project: linkedin_recommend    Author: duggalr2    | Project source | File source
def tokenize_and_stem(text):
    """
    First tokenize by sentence, then by word, to ensure that punctuation is caught as its own token
    """
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    # and drop a few domain-specific words that add no signal
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            if token in ('intern', 'student', 'and'):
                continue
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
Project: Natural-Language-Processing-Python-and-NLTK    Author: PacktPublishing    | Project source | File source
def preprocessing(text):
    text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]

    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]

    # lower capitalization
    tokens = [word.lower() for word in tokens]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text
Project: That-s-Fake    Author: rajeevdesai    | Project source | File source
def ne_tagging(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
    # flush a trailing entity when the text ends inside a named-entity chunk
    if current_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)
    return continuous_chunk
Project: YelpDataChallenge    Author: fujunswufe    | Project source | File source
def get_sentence_tokens(text):
    '''
    Given a text (review), return the list of tokens for each sentence
    :param text:
    :return:
    '''
    sentences = sent_tokenize(text)

    sent_tokens = []
    for sentence in sentences:
        sent_token = word_tokenize(sentence)
        # drop empty tokens and stop words
        sent_token = [token for token in sent_token if token.strip() and token not in stopwords]
        sent_tokens.append(sent_token)

    # stemming was also tried; experiments showed it did not help:
    # if (stemming):
    #     stemmer = PorterStemmer()
    #     texts = [[stemmer.stem(token) for token in text] for text in texts]
    return sent_tokens
Project: review-classification    Author: vishnupriyam    | Project source | File source
def createbigramvocabulary(reviewfile, vocabfile):
    createvocabulary(reviewfile, vocabfile)
    finput = open(reviewfile,"r")
    foutput = open(vocabfile,"a")

    all_bigrams = []
    for line in finput:
        tokenized_line = []
        tokenized_line.append('*')
        tokenized_line.extend(word_tokenize(line[1:]))
        tokenized_line.append('$')
        bgrms = bigrams(tokenized_line)
        all_bigrams.extend(bgrms)

    c = Counter(all_bigrams)

    for b in c:
        if (b[0] != "+" and b[0] != "-" and c[b] >= 3):
            foutput.write(b[0] + " " + b[1] + "\n")

    finput.close()
    foutput.close()
Project: facebook-message-analysis    Author: szheng17    | Project source | File source
def word_count(message, word):
        """
        Computes the number of times a word appears in a message
        (case-insensitive).

        Args:
            message: A Message object.
            word: A string with no spaces.

        Returns:
            An int representing the number of times word (case-insensitive)
                appears in the text of message split by spaces.
        """
        if ' ' in word:
            raise ValueError('word cannot contain spaces')
        lowercase_tokens = [token.lower() for token in nltk.word_tokenize(message.text)]
        return lowercase_tokens.count(word.lower())
Project: Deep-Learning-with-Keras    Author: PacktPublishing    | Project source | File source
def build_vocab(train_data, test_data):
    counter = collections.Counter()
    for stories, questions, answers in [train_data, test_data]:
        for story in stories:
            for sent in story:
                for word in nltk.word_tokenize(sent):
                    counter[word.lower()] += 1
        for question in questions:
            for word in nltk.word_tokenize(question):
                counter[word.lower()] += 1
        for answer in answers:
            for word in nltk.word_tokenize(answer):
                counter[word.lower()] += 1
    # no OOV token is needed here because the dataset vocabulary is small
    word2idx = {w:(i+1) for i, (w, _) in enumerate(counter.most_common())}
    word2idx["PAD"] = 0
    idx2word = {v:k for k, v in word2idx.items()}
    return word2idx, idx2word
Project: recurrent-attention-for-QA-SQUAD-based-on-keras    Author: wentaozhu    | Project source | File source
def tokenizeVal(sent):
    '''Return the tokens of a sentence (including punctuation), together with
    a mapping from each token index to its character offset within the sentence.
    '''
    tokenizedSent = [token.replace("``", '"').replace("''", '"') for token in nltk.word_tokenize(sent)]
    tokenIdx2CharIdx = [None] * len(tokenizedSent)
    idx = 0
    token_idx = 0
    while idx < len(sent) and token_idx < len(tokenizedSent):
        word = tokenizedSent[token_idx]
        if sent[idx:idx+len(word)] == word:
            tokenIdx2CharIdx[token_idx] = idx
            idx += len(word)
            token_idx += 1 
        else:
            idx += 1
    return tokenizedSent, tokenIdx2CharIdx
Project: identifiera-sarkasm    Author: risnejunior    | Project source | File source
def tokenize_text( sample_text ):
    global sequence_lengths
    processed_text = []

    if cfg.remove_punctuation:
        cleaned = sample_text.lower().translate( t_table )
    else:
        cleaned = sample_text

    if cfg.use_casual_tokenizer:
        tokens = tknzr.tokenize( cleaned )
    else:
        tokens = nltk.word_tokenize( cleaned, language='english')

    if cfg.remove_stopwords:
        tokens = [w for w in tokens if w not in stopwords.words('english')]

    sequence_lengths.append( len( tokens ) )
    processed_text.extend( tokens )

    return processed_text
Project: shalo    Author: henryre    | Project source | File source
def process_imdb(fname, setting):
    labels, sentences = [], []
    filename = setting + ".csv"
    quota = [0,0]
    if setting == 'test':
        maxquota = 5000
    else:
        maxquota = 15000
    with open(os.path.join(fname, filename), 'rb') as f:
        csvreader = csv.reader(f)
        for line in csvreader: 
            label = 0 if line[0] ==  "1" else 1
            quota[label] += 1
            if quota[label] > maxquota:
                continue
            sentence = line[2].replace("\"", "")
            text = nltk.word_tokenize(sentence.decode('utf-8'))
            labels.append(int(label))
            sentences.append(text)
    return sentences, labels
Project: paraphrase-id-tensorflow    Author: nelson-liu    | Project source | File source
def tokenize(self, sentence):
        """
        Given a string, tokenize it into words (with the conventional notion
        of word).

        Parameters
        ----------
        sentence: str
            The string to tokenize.

        Returns
        -------
        tokenized_sentence: List[str]
            The tokenized representation of the string, as a list of tokens.
        """
        return nltk.word_tokenize(sentence.lower())
Project: cloud-vision    Author: GoogleCloudPlatform    | Project source | File source
def add(self, filename, document):
        """
        Add a document string to the index.
        """
        # You can uncomment the following line to see the words found in each
        # image.
        # print("Words found in %s: %s" % (filename, document))
        for token in [t.lower() for t in nltk.word_tokenize(document)]:
            if token in self.stopwords:
                continue
            if token in ['.', ',', ':', '']:
                continue
            if self.stemmer:
                token = self.stemmer.stem(token)
            # Add the filename to the set associated with the token.
            self.redis_token_client.sadd(token, filename)

        # store the 'document text' for the filename.
        self.redis_docs_client.set(filename, document)
Project: Automatic-Question-Generation    Author: bwanglzu    | Project source | File source
def _identify_pronoun(self, answer):
        """Calculate percentage of pronouns within answer
        - Args:
            answer(str): answer text
        - Returns:
            percentage(float): ratio of pronouns in answer
        """
        text = nltk.word_tokenize(answer)
        post = nltk.pos_tag(text)
        pronoun_list = ['PRP', 'PRP$', 'WP', 'WP$']
        # init variables
        num_pronouns = 0
        num_terms = len(post)
        percentage = 0
        for k, v in post:
            if v in pronoun_list:
                num_pronouns += 1
        percentage = float(num_pronouns) / num_terms
        return percentage
Project: Automatic-Question-Generation    Author: bwanglzu    | Project source | File source
def _identify_pronoun2(self, sentence):
        """Calculate percentage of pronouns in the sentence that are in the answer
        - Args:
            sentence(str): question sentence 
        - Returns:
            pronoun_in_sentence(list): pronouns in sentence 
            sentence_len(int): length of current sentence 
        """
        text = nltk.word_tokenize(sentence)
        post = nltk.pos_tag(text)
        pronoun_list = ['PRP', 'PRP$', 'WP', 'WP$']
        pronoun_in_sentence = []
        sentence_len = len(post)
        for k, v in post:
            if v in pronoun_list:
                pronoun_in_sentence.append(k)
        return pronoun_in_sentence, sentence_len
Project: Automatic-Question-Generation    Author: bwanglzu    | Project source | File source
def _first_tagger_after_answer_span(self, question):
        """Get the first tagger after answer span
        - Args:
            question(string): string of current question 
        - Returns:
            tagger(string): tagger of first term after span
        """
        index = 0
        text = nltk.word_tokenize(question)
        post = nltk.pos_tag(text)
        for idx, t in enumerate(post):
            if t[0] == '_____':
                index = idx + 1
                break
        try:
            return post[index][1]
        except IndexError:
            return 'dummy'
Project: Automatic-Question-Generation    Author: bwanglzu    | Project source | File source
def _first_tagger_before_answer_span(self, question):
        """Get the first tagger before answer span
        - Args:
            question(string): string of current question 
        - Returns:
            tagger(string): tagger of first term before span
        """
        index = 0
        text = nltk.word_tokenize(question)
        post = nltk.pos_tag(text)
        for idx, t in enumerate(post):
            if t[0] == "_____":
                index = idx - 1
                break
        try:
            return post[index][1]
        except IndexError:
            return 'dummy'
Project: squadgym    Author: aleSuglia    | Project source | File source
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("env_data", type=str, help="Generated environment data filename in JSON format")
    args = parser.parse_args()

    print("-- Initialized environment")
    env = SquadEnv(args.env_data)

    context, question = env.reset()
    done = False

    while not done:
        print("Context ids: {}".format(context))
        print("Question ids: {}".format(question))
        print("Context tokens: {}".format(ids2tokens(context, env.id2token)))
        print("Question tokens: {}".format(ids2tokens(question, env.id2token)))
        answer_tokens = tokens2ids(word_tokenize(input("Answer: ")) + ["#eos#"], env.token2id)

        question_reward = 0
        for token in answer_tokens:
            (context, question), reward, done, _ = env.step(token)
            question_reward += reward

        print("You got {} reward".format(question_reward))
Project: LDA-REST    Author: valentinarho    | Project source | File source
def LemNormalize(text):
    # convert non ascii characters
    text = text.encode('ascii', 'replace').decode()
    # remove punctuation and digits
    remove_punct_and_digits = dict([(ord(punct), ' ') for punct in string.punctuation + string.digits])
    transformed = text.lower().translate(remove_punct_and_digits)
    # shortword = re.compile(r'\W*\b\w{1,2}\b')
    # transformed = shortword.sub('', transformed)

    # tokenize the transformed string
    tokenized = nltk.word_tokenize(transformed)

    # remove short words (3 characters or fewer)
    tokenized = [w for w in tokenized if len(w) > 3]
    tokenizer = LemTokens(tokenized)

    return tokenizer
Project: LDA-REST    Author: valentinarho    | Project source | File source
def LemNormalizeIt(text):

    # convert non ascii characters
    text = text.encode('ascii', 'replace').decode()
    # remove punctuation and digits
    remove_punct_and_digits = dict([(ord(punct), ' ') for punct in string.punctuation + string.digits])
    transformed = text.lower().translate(remove_punct_and_digits)

    # tokenize the transformed string
    tokenized = nltk.word_tokenize(transformed)

    # apply lemmatization with morph-it
    morph_it = load_morph_it()
    tokenized = [morph_it.get(w, w) for w in tokenized if len(w) > 3]

    return tokenized
Project: wntf    Author: tonybaloney    | Project source | File source
def tag(self, lines):
        '''
        Tokenize and categorise the words in the collection of
        text

        :param lines: The list of strings with the text to match
        :type  lines: ``list`` of ``str``

        :rtype: ``list`` of ``tuple``
        :return: The (token, POS tag) pairs produced by nltk.pos_tag
        '''
        try:
            tokenized_words = nltk.word_tokenize(lines)
            return nltk.pos_tag(tokenized_words)
        except LookupError as le:
            print("Run install_words.py first")
            raise le
Project: resume-optimizer    Author: mhbuehler    | Project source | File source
def _generate_candidate_keywords(self, sentences, max_length=3):
        """Creates a list of candidate keywords, or phrases of at most max_length words, from a set of sentences"""
        phrase_list = []
        for sentence in sentences:
            words = map(lambda x: "|" if x in self.stopwords else x,
                        nltk.word_tokenize(sentence.lower()))
            phrase = []
            for word in words:
                if word == "|" or is_punctuation(word):
                    if len(phrase) > 0:
                        if len(phrase) <= max_length:
                            phrase_list.append(phrase)
                        phrase = []
                else:
                    phrase.append(word)

        return phrase_list
Project: hnmt    Author: robertostling    | Project source | File source
def get_tokenizer(name, lowercase):
    if name == 'char':
        if lowercase:
            return (lambda s: list(s.strip().lower()))
        else:
            return (lambda s: list(s.strip()))
    elif (name == 'space') or (name == 'bpe'):
        if lowercase:
            return (lambda s: s.lower().split())
        else:
            return str.split
    elif name == 'word':
        if lowercase:
            return (lambda s: word_tokenize(s.lower()))
        else:
            return word_tokenize
    else:
        raise ValueError('Unknown tokenizer: "%s"' % name)
Project: R-net    Author: matthew-z    | Project source | File source
def _set_tokenizer(self, tokenizer):
        """
        Set tokenizer

        :param tokenizer: tokenization method
        :return: None
        """
        if tokenizer == "nltk":
            self.tokenizer = nltk.word_tokenize
        elif tokenizer == "spacy":
            spacy_en = spacy.load("en")

            def spacy_tokenizer(seq):
                return [w.text for w in spacy_en(seq)]

            self.tokenizer = spacy_tokenizer
        else:
            raise ValueError("Invalid tokenizing method %s" % tokenizer)
Project: FYP-AutoTextSum    Author: MrRexZ    | Project source | File source
def map_coocurence(context_size, data):
    coocurrence_list = []
    try:
        if detect(data) == 'en':
            region = nltk.word_tokenize(data)
            for l_context, word, r_context in _context_windows(region, context_size, context_size):
                if isWord(word):
                    for i, context_word in enumerate(l_context[::-1]):
                        if isWord(context_word):
                            coocurrence_list.append(((word, context_word), 1 / (i + 1)))
                    for i, context_word in enumerate(r_context):
                        if isWord(context_word):
                            coocurrence_list.append(((word, context_word), 1 / (i + 1)))
    except LangDetectException:
        return coocurrence_list
    return coocurrence_list
Project: one-day-with-cling    Author: mariana-scorp    | Project source | File source
def from_sentence(sent):
        tokens = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(tokens)

        dg = DependencyGraph()
        for (index, (word, tag)) in enumerate(tagged):
            dg.nodes[index + 1] = {
                'word': word,
                'lemma': '_',
                'ctag': tag,
                'tag': tag,
                'feats': '_',
                'rel': '_',
                'deps': defaultdict(),
                'head': '_',
                'address': index + 1,
            }
        dg.connect_graph()

        return dg
Project: Price-Comparator    Author: Thejas-1    | Project source | File source
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist, trigrams

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint
Project: markov_bot    Author: 18F    | Project source | File source
def train(self, chain_len = None):
        """ Trains the markov data structure by creating chains of desired length """
        if not chain_len:
            chain_len = self.CHAIN_LENGTH

        self.CHAIN_LEN = chain_len

        self.everything['corpus'] = {}
        self.corpus = self.everything['corpus']

        for f in self.everything['input']:
            for line in sent_tokenize( self.everything['input'][f] ):
                words = word_tokenize(line)

                for chain in self._make_chains(words):
                    k = " ".join( chain[:-1] ) # key is everything but last word
                    v = chain[-1] # value is last word

                    try:
                        self.corpus[k].append(v)
                    except:
                        self.corpus[k] = [v]
Project: atap    Author: foxbook    | Project source | File source
def parse_gender(text):

    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]

    sents, words = count_gender(sentences)
    total = sum(words.values())

    for gender, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[gender]

        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        )
Project: unsupervised-treelstm    Author: jihunchoi    | Project source | File source
def _convert_obj(self, obj):
        pre_sentence = obj['sentence1']
        hyp_sentence = obj['sentence2']
        if self.lower:
            pre_sentence = pre_sentence.lower()
            hyp_sentence = hyp_sentence.lower()
        pre_words = word_tokenize(pre_sentence)
        hyp_words = word_tokenize(hyp_sentence)
        pre = [self.word_vocab.word_to_id(w) for w in pre_words]
        hyp = [self.word_vocab.word_to_id(w) for w in hyp_words]
        pre_length = len(pre)
        hyp_length = len(hyp)
        label = obj['gold_label']
        if len(pre) > self._max_length or len(hyp) > self._max_length:
            return None
        if label == '-':
            return None
        label = self.label_vocab.word_to_id(label)
        return pre, hyp, pre_length, hyp_length, label
Project: RealEstateTelegramBot    Author: PeterZhizhin    | Project source | File source
def tokenize_me(file_text):
    #firstly let's apply nltk tokenization
    tokens = nltk.word_tokenize(file_text)

    #let's delete punctuation symbols
    tokens = [i for i in tokens if i not in string.punctuation]

    #deleting stop_words
    tokens = [i for i in tokens if i not in stop_words]

    #cleaning words
    tokens = [i.replace("«", "").replace("»", "") for i in tokens]

    tokens = [stemmer.stem(i) for i in tokens]

    return set(tokens)
Project: poetic-inner-join    Author: emdaniels    | Project source | File source
def tokenize_sentences(self):
        # tokenize the sentences into words and count the word frequencies
        # get most common words, build index_to_word and word_to_index vectors
        self.tokenized_sentences = [nltk.word_tokenize(sent) for sent in
                                    self.sentences]
        word_freq = nltk.FreqDist(itertools.chain(*self.tokenized_sentences))
        print("Found %d unique word tokens." % len(word_freq.items()))

        vocab = word_freq.most_common(self.vocabulary_size - 1)
        self.index_to_word = [x[0] for x in vocab]
        self.index_to_word.append(self.unknown_token)
        self.word_to_index = dict(
            [(w, i) for i, w in enumerate(self.index_to_word)])

        print("Using vocabulary size %d." % self.vocabulary_size)
        print(
            "The least frequent word is '%s' appearing %d times." % (
            vocab[-1][0], vocab[-1][1]))

        # replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(self.tokenized_sentences):
            self.tokenized_sentences[i] = [
                w if w in self.word_to_index else self.unknown_token for w in
                sent]
Project: kaggle    Author: rbauld    | Project source | File source
def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if w not in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

###############################################################################
# Train
Project: YelpDataChallenge    Author: fujunswufe    | Project source | File source
def get_review_sentences():
    '''
    Read the Yelp reviews and return the sentences after sentence segmentation
    :return:
    '''
    review_file = io.open(FULL_YELP_REVIEW_PATH, 'r', encoding='utf-8')
    count_sentence = 0
    sentences = []

    for line in review_file:
        json_review = json.loads(line.strip())
        text = json_review.get("text").replace('\n','').lower()

        raw_sentences = sent_tokenize(text)
        for raw_sentence in raw_sentences:
            if len(raw_sentence.strip()) > 0:
                sent_tokens = word_tokenize(raw_sentence)
                sentences.append(sent_tokens)
    return sentences
Project: Humour-Detection    Author: srishti-1795    | Project source | File source
def createTrainingList(reviewLst):
    sds = SupervisedDataSet(100,1)
    for review in reviewLst:
        revString = unicode(review[1], errors='ignore')
        revSentences = nltk.word_tokenize(revString.strip())
        revWords = []
        for i in revSentences:
            revWords += i.lower().split()
        vec = 0
        for i in revWords:
            try:
                vec+=model[i]/2
            except:
                pass
        vec=vec/len(revWords)
        sds.addSample(vec,review[0])
    net = buildNetwork(100, 20, 1, hiddenclass=TanhLayer, outclass=SoftmaxLayer,bias=True)
    trainer = BackpropTrainer(net, sds)
    print "Error score:",trainer.train()
    print trainer.trainUntilConvergence(verbose=True,maxEpochs=100)
Project: newspapers    Author: dhh16    | Project source | File source
def token_func(input_string):
    tokens = nltk.word_tokenize(input_string)
    long_tokens = []
    refined_tokens = []
    # lemmatized_tokens = []
    stopwordlist = get_stopwordlist("../data/first_stopwordlist.txt")
    regex = re.compile('[^1-9a-zA-Z]')

    for token in tokens:
        token = regex.sub('', token)
        if len(token) > 3:
            long_tokens.append(token)
    lemmatized_tokens = dhh_preprocess_tools.hfst_words(long_tokens,
                                                        filter=('VERB',
                                                                'NOUN',
                                                                'ADJ',
                                                                'PROPN'))

    for token in lemmatized_tokens:
        token = token.lower()
        if token not in stopwordlist:
            refined_tokens.append(token)
    return refined_tokens
Project: NLP-Keyword-Extraction-Ensemble-Method    Author: Ashwin-Ravi    | Project source | File source
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda (word,pos,chunk): chunk != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist, trigrams

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint
Project: thesis    Author: jonvet    | Project source | File source
def txt_to_sent(sentences, word_vec, tokenize=True):

    sentences = [['<s>']+s.split()+['</s>'] if not tokenize else ['<s>']+nltk.word_tokenize(s)+['</s>'] for s in sentences]
    n_w = np.sum([len(x) for x in sentences])

    # filters words without glove vectors
    for i in range(len(sentences)):
        s_f = [word for word in sentences[i] if word in word_vec]
        if not s_f:
            import warnings
            warnings.warn('No words in "{0}" (idx={1}) have glove vectors. Replacing by "</s>"..'.format(sentences[i], i))
            s_f = ['</s>']
        sentences[i] = s_f

    lengths = np.array([len(s) for s in sentences])
    n_wk = np.sum(lengths)

    print('Nb words kept : {0}/{1} ({2} %)'.format(n_wk, n_w, round((100.0 * n_wk) / n_w, 2)))

    return sentences
Project: facebook-message-analysis    Author: szheng17    | Project source | File source
def __init__(self, text):
        self.text = text
        self.tokens = nltk.word_tokenize(text)
        self.lowercase_tokens = [t.lower() for t in self.tokens]
        self.alpha_tokens = [t for t in self.lowercase_tokens if t.isalpha()]
Project: Deep-Learning-with-Keras    Author: PacktPublishing    | Project source | File source
def maybe_build_vocab(reuters_dir, vocab_file):
    vocab = collections.defaultdict(int)
    if os.path.exists(vocab_file):
        fvoc = open(vocab_file, "rb")
        for line in fvoc:
            word, idx = line.strip().split("\t")
            vocab[word] = int(idx)
        fvoc.close()
    else:
        counter = collections.Counter()
        num_docs_read = 0
        for doc in stream_reuters_documents(reuters_dir):
            if num_docs_read % 100 == 0:
                print("building vocab from {:d} docs"
                    .format(num_docs_read))
            topics = doc["topics"]
            if len(topics) == 0:
                continue
            title = doc["title"]
            body = doc["body"]
            title_body = ". ".join([title, body]).lower()
            for sent in nltk.sent_tokenize(title_body):
                for word in nltk.word_tokenize(sent):
                    counter[word] += 1
            num_docs_read += 1
        print("vocab built from {:d} docs, complete"
            .format(num_docs_read))
        # assign word indices once, after all documents have been counted
        for i, c in enumerate(counter.most_common(VOCAB_SIZE)):
            vocab[c[0]] = i + 1
        fvoc = open(vocab_file, "wb")
        for k in vocab.keys():
            fvoc.write("{:s}\t{:d}\n".format(k, vocab[k]))
        fvoc.close()
    return vocab
Project: Deep-Learning-with-Keras    Author: PacktPublishing    | Project source | File source
def build_numeric_text(vocab, text):
    wids = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            wids.append(vocab[word])
    return ",".join([str(x) for x in wids])


##################### main ######################
Project: Deep-Learning-with-Keras    Author: PacktPublishing    | Project source | File source
def get_maxlens(train_data, test_data):
    story_maxlen, question_maxlen = 0, 0
    for stories, questions, _ in [train_data, test_data]:
        for story in stories:
            story_len = 0
            for sent in story:
                swords = nltk.word_tokenize(sent)
                story_len += len(swords)
            if story_len > story_maxlen:
                story_maxlen = story_len
        for question in questions:
            question_len = len(nltk.word_tokenize(question))
            if question_len > question_maxlen:
                question_maxlen = question_len
    return story_maxlen, question_maxlen
Project: Deep-Learning-with-Keras    Author: PacktPublishing    | Project source | File source
def vectorize(data, word2idx, story_maxlen, question_maxlen):
    Xs, Xq, Y = [], [], []
    stories, questions, answers = data
    for story, question, answer in zip(stories, questions, answers):
        xs = [[word2idx[w.lower()] for w in nltk.word_tokenize(s)] 
                                   for s in story]
        xs = list(itertools.chain.from_iterable(xs))
        xq = [word2idx[w.lower()] for w in nltk.word_tokenize(question)]
        Xs.append(xs)
        Xq.append(xq)
        Y.append(word2idx[answer.lower()])
    return pad_sequences(Xs, maxlen=story_maxlen),\
           pad_sequences(Xq, maxlen=question_maxlen),\
           np_utils.to_categorical(Y, num_classes=len(word2idx))
Project: recurrent-attention-for-QA-SQUAD-based-on-keras    Author: wentaozhu    | Project source | File source
def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.

    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [token.replace("``", '"').replace("''", '"') for token in nltk.word_tokenize(sent)]