Python nltk.tokenize module: wordpunct_tokenize() code examples

We extracted the following 49 code examples from open-source Python projects to illustrate how to use nltk.tokenize.wordpunct_tokenize().
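For orientation before the project examples, here is a minimal, self-contained usage sketch (the sample sentence is illustrative): wordpunct_tokenize() splits text into runs of alphanumeric characters and runs of non-alphanumeric, non-space characters, so punctuation comes back as separate tokens.

from nltk.tokenize import wordpunct_tokenize

print(wordpunct_tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']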

Project: deep-summarization    Author: harpribot
def generate_vocabulary(self, review_summary_file):
        """

        :param review_summary_file:
        :return:
        """
        self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values

        for review,summary in self.rev_sum_pair:
            rev_lst = wordpunct_tokenize(review)
            sum_lst = wordpunct_tokenize(summary)
            self.__add_list_to_dict(rev_lst)
            self.__add_list_to_dict(sum_lst)

        # Now store the "" empty string as the last word of the voacabulary
        self.map[""] = len(self.map)
        self.revmap[len(self.map)] = ""
Project: KATE    Author: hugochan
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ', \
            text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and token not in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception as e:
                    w = token
            else:
                w = token
            words.append(w)

    return words

    # return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
    #                     re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
    #                     not token.isdigit() and not token in stop_words]
Project: Word2Vec    Author: hashbangCoder
def tokenize(directory):
    full_content = ''
    for _file in os.listdir(directory):
        #disp_count = 5
        with open(directory+_file,'r') as f:
            contents = f.readlines()
            for item in contents:
                try:
                    sentence = item.split('\t')[1].strip()
                    full_content += sentence
                except IndexError:
                    continue
                # if np.random.binomial(1,0.1):

                #   print sentence
                #   time.sleep(2)               
                #   disp_count -=1 
                #   if not disp_count:
                #       print '*'*100
                #       break

                # else:
                #   print '#'

    return wordpunct_tokenize(full_content.lower())
Project: pandora    Author: mikekestemont
def load_unannotated_file(filepath='test.txt', nb_instances=None, tokenized_input=False):
    if tokenized_input:
        instances = []
        for line in codecs.open(filepath, 'r', 'utf8'):
            line = line.strip()
            if line:
                instances.append(line)
            if nb_instances:
                nb_instances -= 1
                if nb_instances <= 0:
                    break
        return instances
    else:
        from nltk.tokenize import wordpunct_tokenize
        W = re.compile(r'\s+')
        with codecs.open(filepath, 'r', 'utf8') as f:
            text = W.sub(' ', f.read())  # collapse whitespace runs into single spaces
        tokens = wordpunct_tokenize(text)
        if nb_instances:
            return tokens[:nb_instances]
        else:
            return tokens
Project: pymeetup_morphology    Author: srbutler
def _extract_tokens(self, file_text):
        """Extract tokens from a file and return a Counter dictionary.

        This method is designed specifically so that it can be overridden
        easily while maintaining _get_file_tokens and _get_dir_tokens.
        """

        token_dict = collections.Counter()

        # does a simple word and punctuation tokenization on the text
        tokens = wordpunct_tokenize(file_text)

        for token in tokens:
            token_dict[token] += 1

        return token_dict
Project: pymeetup_morphology    Author: srbutler
def _extract_tokens(self, file_text):
        """Extract tokens from a Babel file and return a Counter dictionary."""

        token_dict = collections.Counter()

        # capture the text that follows each [start.end] timestamp marker
        regex = re.compile(r'\[\d*\.\d*\]\n(.*)')
        matches = regex.findall(file_text)

        tokens = set()
        for match in matches:
            wp_tokenized = wordpunct_tokenize(match)
            tokens.update(wp_tokenized)

        for token in tokens:
            token_dict[token] += 1

        return token_dict
Project: wikicrawl    Author: rodricios
def _get_revision_word_dist(self, page_title, revid):
        """"""
        revids_to_word_dist = self.ctitle_to_revids_to_word_dist[page_title]

        if revid in revids_to_word_dist:
            return revids_to_word_dist[revid]

        text = self._get_revision_text(page_title, revid)

        text = [word.lower() for word in wordpunct_tokenize(text)
                if word.lower() not in STOPWORDS and word.lower() not in PUNCTUATION]

        pdist = StatsCounter(text).normalize()

        revids_to_word_dist[revid] = pdist

        return pdist
Project: wikicrawl    Author: rodricios
def _get_revision_word_dist(self, page_title, revid):
        """"""
        revids_to_word_dist = self.ctitle_to_revids_to_word_dist[page_title]

        if revid in revids_to_word_dist:
            return revids_to_word_dist[revid]

        text = self._get_revision_text(page_title, revid)

        text = [word.lower() for word in wordpunct_tokenize(text)
                if word.lower() not in STOPWORDS and word.lower() not in PUNCTUATION]

        pdist = StatsCounter(text).normalize()

        revids_to_word_dist[revid] = pdist

        return pdist
Project: ar-embeddings    Author: iamaziz
def tokenize(text):
        """
        :param text: a paragraph string
        :return: a list of words
        """

        try:
            try:
                txt = unicode(text, 'utf-8')  # py2
            except NameError:
                txt = text  # py3
            words = wordpunct_tokenize(txt)
            length = len(words)
        except TypeError:
            words, length = ['NA'], 0

        return words, length
Project: WebNav    Author: nyu-dl
def augment(texts, dic_thes):
    if prm.aug<2:
        return texts

    out = []
    for text in texts:

        words_orig = wordpunct_tokenize(text)
        maxrep = max(2,int(0.1*len(words_orig))) #define how many words will be replaced. For now, leave the maximum number as 10% of the words

        for j in range(prm.aug):
            words = list(words_orig) #copy
            for k in range(randint(1,maxrep)):
                idx = randint(0,len(words)-1)
                word = words[idx]
                if word in dic_thes:

                    synonym = min(np.random.geometric(0.5), len(dic_thes[word])-1) # choose the synonym based on a geometric distribution
                    #print 'fp',fp,"word", word,"synonym",dic_thes[word][synonym]
                    words[idx] = dic_thes[word][synonym]

            out.append(" ".join(words))

    return out
Project: project-fortis    Author: CatalystCode
def __init__(self, lines):
        self.lookup = {}
        self.max_len = 0        
        ensure_package_path()
        from nltk.tokenize import wordpunct_tokenize as tokenize
        for line in lines:
            word_data = json.loads(line)
            # capture both positive and negative, choose one at scoring time
            pos_score, neg_score = word_data['pos'], word_data['neg']            
            terms = [word_data['word']]
            # TODO: make the sentiment scorer configurable
            if 'word_ar' in word_data:
                terms.append(word_data['word_ar'])
            if 'word_ur' in word_data:
                terms.append(word_data['word_ur'])
            for term in terms:
                # if a score already exists for a term, keep the least neutral score
                existing_scores = (0., 0.)
                if term in self.lookup:
                    existing_scores = self.lookup[term]
                self.lookup[term] = (max(pos_score, existing_scores[0]), max(neg_score, existing_scores[1]))
                # update the maximum token length to check
                self.max_len = max(len(tokenize(term)), self.max_len)
Project: project-fortis    Author: CatalystCode
def extract_keywords(sentence, keywords):
    # check if there are keywords for the sentence language
    language = sentence['Language']
    if language in keywords:
        languageKeywords = keywords[language]
        keywordMatches = []
        if languageKeywords is not None:
            message = sentence['Sentence']
            # search the sentence for each keyword pattern
            for keyword in sorted(languageKeywords):
                keywordRegex = languageKeywords[keyword]
                if keywordRegex.search(message):
                    # if match, add keyword canonical form to list
                    keywordMatches.append(keyword)
        sentence['Keywords'] = keywordMatches
    return sentence
Project: stochasticLDA    Author: qlai
def parseDocument(doc, vocab):
    wordslist = list()
    countslist = list()
    doc = doc.lower()
    tokens = wordpunct_tokenize(doc)

    dictionary = dict()
    for word in tokens:
        if word in vocab:
            wordtk = vocab[word]
            if wordtk not in dictionary:
                dictionary[wordtk] = 1
            else:
                dictionary[wordtk] += 1

    wordslist.append(dictionary.keys())
    countslist.append(dictionary.values())
    return (wordslist[0], countslist[0])
Project: deep-summarization    Author: harpribot
def __generate_tensor(self, is_review, reverse=False):
        """

        :param is_review:
        :param reverse:
        :return:
        """
        seq_length = self.review_max_words if is_review else self.summary_max_words
        total_rev_summary_pairs = self.rev_sum_pair.shape[0]
        data_tensor = np.zeros([total_rev_summary_pairs,seq_length])

        sample = self.rev_sum_pair[0::, 0] if is_review else self.rev_sum_pair[0::, 1]

        for index, entry in enumerate(sample.tolist()):
            index_lst = np.array([self.map[word.lower()] for word in wordpunct_tokenize(entry)])
            # reverse if want to get backward form
            if reverse:
                index_lst = index_lst[::-1]
            # Pad the list
            if len(index_lst) <= seq_length:
                index_lst = np.lib.pad(index_lst, (0,seq_length - index_lst.size), 'constant', constant_values=(0, 0))
            else:
                index_lst = index_lst[0:seq_length]

            data_tensor[index] = index_lst

        return data_tensor
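The padding step above is what gives every row a fixed length. A minimal sketch of the same idea on a toy index list (np.lib.pad is the older alias of np.pad; the values are illustrative):

import numpy as np

index_lst = np.array([4, 9, 2])
seq_length = 5
# right-pad with zeros up to seq_length (the code above truncates instead when the list is longer)
padded = np.pad(index_lst, (0, seq_length - index_lst.size), 'constant', constant_values=(0, 0))
print(padded)  # [4 9 2 0 0]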
Project: KATE    Author: hugochan
def tiny_tokenize_xml(text, stem=False, stop_words=[]):
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ',
                  text.encode(encoding='ascii', errors='ignore'))
    return [EnglishStemmer().stem(token) if stem else token
            for token in wordpunct_tokenize(text)
            if not token.isdigit() and token not in stop_words]
Project: csirtg-smrt-py    Author: csirtgadgets
def top_tokens(text):
    freq_dict = defaultdict(int)
    tokens = wordpunct_tokenize(text)

    for token in tokens:
        freq_dict[token] += 1

    return sorted(freq_dict, key=freq_dict.get, reverse=True)
Project: Alfred    Author: JohnGiorgi
def wikipediaAction(message):
    """Makes the appropriate calls to the wikipedia API for answer wiki queries.

    Args:
        message: An incoming text message
        processer: Instance of NLProcessor class

    Returns:
        A message indicating what action was taking with the wikipedia API
    """
    # tokenize input
    tokens = tokenize.wordpunct_tokenize(message)
    # filter stopwords, additionally, remove 'wiki' or 'wikipedia'
    tokens_filtered = remove_stopwords(tokens)
    tokens_filtered = [token for token in tokens_filtered if token != 'wiki' and token != 'wikipedia']
    # join filtered message
    message = ' '.join(tokens_filtered)

    # for debugging/testing
    print("(Highly) processed input: ", message)

    # Get the wikipedia summary for the request
    try:
        summary = wikipedia.summary(message, sentences = 1)
        url = wikipedia.page(message).url
        answer = summary + "\nSee more here: " + url
        if len(answer) > 500:
            answer = answer[0:500] + "\nSee wikipedia for more..."
    except:
        # handle all errors
        answer = "Request was not found using Wikipedia. Be more specific?"

    return answer
Project: fabric8-analytics-stack-analysis    Author: fabric8-analytics
def create_tags_for_package(package_name):
    """Create tags for a package based on its name."""
    stop_words = set(['org', 'com', 'io', 'ch', 'cn'])
    tags = set([tag.lower() for tag in wordpunct_tokenize(package_name) if
                tag not in string.punctuation and tag not in stop_words
                ])

    return list(tags)[:MAX_TAG_COUNT]
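A minimal, self-contained sketch of the same tagging idea (the package name is illustrative, and MAX_TAG_COUNT is assumed to be 5 here):

import string
from nltk.tokenize import wordpunct_tokenize

stop_words = {'org', 'com', 'io', 'ch', 'cn'}
package_name = 'org.apache.commons.lang3'
tags = {tag.lower() for tag in wordpunct_tokenize(package_name)
        if tag not in string.punctuation and tag not in stop_words}
print(sorted(tags)[:5])  # ['apache', 'commons', 'lang3']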
Project: Word2Vec    Author: hashbangCoder
def analyze_false(validData,validDataNumbers,validLabels,model):    
    'Compare mean token lengths of positive examples the model classifies correctly vs. misses'
    predictions = np.squeeze((model.predict(validDataNumbers) > 0.5).astype('int32'))
    c1_inds = np.where(validLabels == 1)[0]
    pos_inds = np.where((predictions+validLabels) == 2)[0] #np.squeeze(predictions) == validLabels
    neg_inds = np.setdiff1d(c1_inds,pos_inds)
    seq_lengths = np.zeros((validData.shape[0]))
    for ind,row in np.ndenumerate(validData):
            seq_lengths[ind] = len(wordpunct_tokenize(row.lower().strip())) 

    mean_true_length = np.mean(seq_lengths[pos_inds])   
    mean_false_length = np.mean(seq_lengths[neg_inds])

    return mean_false_length,mean_true_length
Project: Word2Vec    Author: hashbangCoder
def tokenize(directory,exclude_files):
    full_content = ''
    for _file in os.listdir(directory):
        #disp_count = 5
        if exclude_files  and (_file in exclude_files):
            continue
        with open(directory+_file,'r') as f:
            contents = f.readlines()
            for item in contents:
                try:
                    sentence = item.split('\t')[1].strip()
                    full_content += sentence
                except IndexError:
                    continue
                # if np.random.binomial(1,0.1):

                #   print sentence
                #   time.sleep(2)               
                #   disp_count -=1 
                #   if not disp_count:
                #       print '*'*100
                #       break

                # else:
                #   print '#'

    return wordpunct_tokenize(full_content.lower())
Project: Price-Comparator    Author: Thejas-1
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
Project: pymeetup_morphology    Author: srbutler
def _extract_tokens(self, file_text):
        """Extract tokens from a file and return a Counter dictionary."""

        token_dict = collections.Counter()

        # matches and removes beginning and end tags
        regex = re.compile(r'(<doc id.*>|<\/doc>)')
        data = regex.sub('', file_text)

        tokens = wordpunct_tokenize(data)

        for token in tokens:
            token_dict[token] += 1

        return token_dict
Project: w2vec-similarity    Author: jayantj
def get_words(sents = []):
  from nltk.tokenize import wordpunct_tokenize
  words = []
  for sent in sents:
    words.append(wordpunct_tokenize(sent))
  return words

# file_name = sys.argv[1]
Project: w2vec-similarity    Author: jayantj
def tokenize_into_words(sents = []):
  words = []
  for sent in sents:
    words.append(wordpunct_tokenize(sent))
  return words
Project: pylade    Author: fievelk
def _extract_text_ngram_freqs(self, text):
        """Tokenize the text.

        For each token in the text, extract ngrams of different length (from 1
        to 5). Compute how many times each of these ngrams occur in the text.
        Then return a dictionary of { ngram: frequencies }.

        >>> implementation = CavnarTrenkleImpl()
        >>> ngrams = implementation._extract_text_ngram_freqs("HeLLo")
        >>> ngrams == {'h':1, 'e': 1, 'l': 2, 'o': 1, 'he': 1, 'el': 1, 'll': 1, \
            'lo': 1, 'hel': 1, 'ell': 1, 'llo': 1, 'hell': 1, 'ello': 1, 'hello': 1}
        True
        >>> ngrams = implementation._extract_text_ngram_freqs("CIAO")
        >>> ngrams == {'c':1, 'i': 1, 'a': 1, 'o': 1, 'ci': 1, 'ia': 1, 'ao': 1, \
            'cia': 1, 'iao': 1, 'ciao': 1}
        True

        """
        tokens = wordpunct_tokenize(text.lower()) # Force lower case
        # TODO: Delete numbers and punctuation
        # TODO: Should we use nltk twitter tokenizer?

        ngram_freqs = defaultdict(int)
        for token in tokens:
            for n in range(1, 6): # Use 1-grams to 5-grams
                for ngram in ngrams(token, n):
                    ngram_string = ''.join(ngram)
                    ngram_freqs[ngram_string] += 1
                # ngram_freqs[ngrams(token, n)] += 1

        return ngram_freqs
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
Project: BioNLP-2016    Author: cambridgeltl
def text_to_sentences(self, text, tokenizer, remove_stopwords=False ):
        print "text_to_sentence"
        #from nltk.tokenize import wordpunct_tokenize
        # Function to split a review into parsed sentences. Returns a 
        # list of sentences, where each sentence is a list of words
        #
        text=text.decode("utf8")
        from nltk.tokenize import sent_tokenize,wordpunct_tokenize
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        #raw_sentences = tokenizer.tokenize(text.strip())
        raw_sentences = sent_tokenize(text.strip())
        print "finish tokenize sentence",len(raw_sentences)
        #
        # 2. Loop over each sentence
        sentences = []
        for raw_sentence in raw_sentences:

            #print "sentence:",raw_sentence
            # If a sentence is empty, skip it
            if len(raw_sentence) > 0:
                # Otherwise, call review_to_wordlist to get a list of words
                #sentences.append( text_to_wordlist( raw_sentence, \
    #               remove_stopwords ))
                #print removePunctuation(raw_sentence).lower().split()
                print raw_sentence
                sentences.append(wordpunct_tokenize(raw_sentence))#raw_sentence.split())
                print wordpunct_tokenize(raw_sentence)
                #print  text_to_wordlist( raw_sentence, remove_stopwords )
        #    
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists)
        return sentences
Project: NNED    Author: qolina
def locateWord(word, wordsArr):
    if word in wordsArr:
        return wordsArr.index(word)
    else:
        idxs = [wordsArr.index(w) for w in wordsArr if word in wordpunct_tokenize(w)]
        return idxs[0]
Project: NNED    Author: qolina
def negSent2JointTrain(negSents, posSentNum):
    neg_training_data = []
    for sentId, (sent_id, sent) in enumerate(negSents):
        wordsIn = wordpunct_tokenize(sent)
        sent = " ".join(wordsIn)
        eventTypeSequence = ["O" for i in range(len(wordsIn))]
        neg_training_data.append((str(sentId + posSentNum), sent, eventTypeSequence))
    return neg_training_data
Project: neighborhood_mood_aws    Author: jarrellmark
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
Project: WebNav    Author: nyu-dl
def vis_att(pages_idx, query, alpha, wiki, vocab, idx):
    rows = [prm.root_page.title()]
    for pageidx in pages_idx[:-1]:
        if pageidx != -1:
            rows.append(wiki.get_article_title(pageidx).decode('utf-8', 'ignore').title())
        else:
            break
            #rows.append('Stop')

    rows = rows[::-1]

    columns = []
    for word in wordpunct_tokenize(query):
        if word.lower() in vocab:
            columns.append(str(word))
    columns = columns[:prm.max_words_query*prm.n_consec]

    alpha = alpha[:len(rows),:len(columns)]
    alpha = alpha[::-1]

    fig,ax=plt.subplots(figsize=(27,10))
    # Advanced color controls
    norm = matplotlib.colors.Normalize(0,1)
    im = ax.pcolor(alpha,cmap=plt.cm.gray,edgecolors='w',norm=norm)
    fig.colorbar(im)
    ax.set_xticks(np.arange(0,len(columns))+0.5)
    ax.set_yticks(np.arange(0,len(rows))+0.5)
    ax.tick_params(axis='x', which='minor', pad=15)
    # Here we position the tick labels for x and y axis
    ax.xaxis.tick_bottom()
    ax.yaxis.tick_left()
    ax.axis('tight') # correcting pyplot bug that adds extra white columns.
    plt.xticks(rotation=90)
    fig.subplots_adjust(bottom=0.2)
    fig.subplots_adjust(left=0.2)
    #Values against each labels
    ax.set_xticklabels(columns,minor=False,fontsize=18)
    ax.set_yticklabels(rows,minor=False,fontsize=18)
    plt.savefig('vis' + str(idx) + '.svg')
    plt.close()
Project: WebNav    Author: nyu-dl
def BOW2(texts, vocab, dim):
    '''
    Convert a list of texts to the BoW dense representation.
    '''
    out = np.zeros((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)
    for i, text in enumerate(texts):
        bow = BOW(wordpunct_tokenize(text), vocab)
        out[i,:len(bow[0])] = bow[0]
        mask[i,:len(bow[1])] = bow[1]

    return out, mask
Project: WebNav    Author: nyu-dl
def Word2Vec_encode(texts, wemb):

    out = np.zeros((len(texts), prm.dim_emb), dtype=np.float32)
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)
        n = 0.
        for word in words:
            if word in wemb:
                out[i,:] += wemb[word]
                n += 1.
        out[i,:] /= max(1.,n)

    return out
Project: rake-nltk    Author: csurfer
def _generate_phrases(self, sentences):
        """Method to generate contender phrases given the sentences of the text
        document.

        :param sentences: List of strings where each string represents a
                          sentence which forms the text.
        :return: Set of string tuples where each tuple is a collection
                 of words forming a contender phrase.
        """
        phrase_list = set()
        # Create contender phrases from sentences.
        for sentence in sentences:
            word_list = [word.lower() for word in wordpunct_tokenize(sentence)]
            phrase_list.update(self._get_phrase_list_from_words(word_list))
        return phrase_list
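This method belongs to the rake-nltk package's RAKE implementation; for context, a short usage sketch of the package's public interface (assuming the documented Rake class; the ranked phrases shown are illustrative and their order may vary):

from rake_nltk import Rake

r = Rake()  # uses NLTK stop words and wordpunct_tokenize under the hood
r.extract_keywords_from_text("Keyword extraction with RAKE is simple and fast.")
print(r.get_ranked_phrases())  # e.g. ['keyword extraction', 'rake', 'simple', 'fast']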
Project: baal    Author: braingineer
def _on_start(self, utterance):
        # do all on start things
        # maybe clear all chart data structures
        # maybe clear agenda data structures
        self.agenda.clear()
        tokenized_utterance = tokenizer(utterance)
        self.utter_len = self.settings.utter_len = len(tokenized_utterance)
        self.left_buckets = [set() for _ in xrange(self.utter_len+1)]
        self.right_buckets = [set() for _ in xrange(self.utter_len+1)]
        self.initialize_agenda(tokenized_utterance)
        # Buckets are indexed over dot positions, so there are utter_len+1 of them
        # self._print_buckets()
Project: hate-to-hugs    Author: sdoran35
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
Project: project-fortis    Author: CatalystCode
def score(self, sentence):
        # track both positive and negative scores for sentence
        pos_score, neg_score = 0., 0.
        # assuming no contextual forms are used for Arabic
        ensure_package_path()
        from nltk.tokenize import wordpunct_tokenize as tokenize
        tokens = tokenize(sentence.lower())
        term_count = 0
        # using nested while loops here to accommodate early termination of
        # the inner loop, and updating the index of the outer loop based on
        # the number of tokens used in the sub-phrase
        i = 0
        while i < len(tokens):
            matched = False
            j = min(self.max_len, len(tokens) - i)
            # check phrase lengths up to `max_len`
            while j > 0 and (i + j) <= len(tokens):
                sub_tokens = tokens[i : i + j]
                sub_word = ' '.join(sub_tokens)
                # if a match exist for phrase, update scores and counts
                if sub_word in self.lookup:
                    sub_word_scores = self.lookup[sub_word]
                    pos_score += sub_word_scores[0]
                    neg_score += sub_word_scores[1]
                    term_count += 1
                    matched = True
                    i += j
                    break
                j -= 1
            # if not matched, skip token
            if not matched:
                i += 1
        # if no terms matched, or scores are equal, return a neutral score
        if pos_score == neg_score:
            return 0.5
        # if sentence is more positive than negative, use positive word sense
        elif pos_score > neg_score:
            return 0.5 + pos_score / term_count / 2 
        # if sentence is more negative than positive, use negative word sense
        else:
            return 0.5 - neg_score / term_count / 2
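As a worked example of the neutral-centered scale above (numbers are illustrative): if two phrases match with a total pos_score of 0.8 and a lower neg_score of 0.1, the sentence scores 0.5 + (0.8 / 2) / 2 = 0.7, i.e. mildly positive; equal positive and negative totals return the neutral 0.5.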
Project: project-fortis    Author: CatalystCode
def create_keyword_regex(keyword):
    print 'create_keyword_regex'
    # import nltk
    ensure_package_path()
    from nltk.tokenize import wordpunct_tokenize as tokenize
    print 'tokenize ==> %s' % (keyword)
    tokens = tokenize(keyword)
    pattern = '\\s+'.join(tokens)
    pattern = '\\b%s\\b' % pattern
    print 'compile pattern ==> %s' % (pattern)
    return re.compile(pattern, re.I | re.UNICODE)
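A minimal, self-contained sketch of what the resulting pattern does (the keyword and sentence are illustrative):

import re
from nltk.tokenize import wordpunct_tokenize

keyword = 'heavy rain'
tokens = wordpunct_tokenize(keyword)           # ['heavy', 'rain']
pattern = re.compile('\\b%s\\b' % '\\s+'.join(tokens), re.I | re.UNICODE)
print(bool(pattern.search('Reports of HEAVY   rain across the city')))  # True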
Project: FancyWord    Author: EastonLee
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
Project: idea_relations    Author: Noahs-ARK
def tokenize(text, filter_stopwords=False, lowercase=True):
    # lowercase before tokenizing when requested
    if lowercase:
        text = text.lower()
    words = wordpunct_tokenize(text)
    if filter_stopwords:
        words = [w for w in words if w not in STOPWORDS]
    return words
Project: QueryReformulator    Author: nyu-dl
def BOW2(texts, vocab, dim):
    '''
    Convert a list of texts to the BoW dense representation.
    '''
    out = np.zeros((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)
    for i, text in enumerate(texts):
        bow = BOW(wordpunct_tokenize(text), vocab)
        out[i,:len(bow[0])] = bow[0]
        mask[i,:len(bow[1])] = bow[1]

    return out, mask
Project: QueryReformulator    Author: nyu-dl
def Word2Vec_encode(texts, wemb):

    out = np.zeros((len(texts), prm.dim_emb), dtype=np.float32)
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)
        n = 0.
        for word in words:
            if word in wemb:
                out[i,:] += wemb[word]
                n += 1.
        out[i,:] /= max(1.,n)

    return out
Project: QueryReformulator    Author: nyu-dl
def text2idx2(texts, vocab, dim, use_mask=False):
    '''
    Convert a list of texts to their corresponding vocabulary indexes.
    '''

    if use_mask:
        out = -np.ones((len(texts), dim), dtype=np.int32)
        mask = np.zeros((len(texts), dim), dtype=np.float32)
    else:
        out = -2 * np.ones((len(texts), dim), dtype=np.int32)

    out_lst = []
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)[:dim]

        for j, word in enumerate(words):
            if word in vocab:
                out[i,j] = vocab[word]
            else:
                out[i,j] = -1 # Unknown words

        out_lst.append(words)

        if use_mask:
            mask[i, :len(words)] = 1.  # mark every valid token position, including the last

    if use_mask:
        return out, mask, out_lst
    else:
        return out, out_lst
Project: beepboop    Author: nicolehe
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
Project: kind2anki    Author: prz3m
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
Project: but_sentiment    Author: MixedEmotions
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
Project: RapBattleAlexa    Author: akashlevy
def get_syllables(sonnet):

  from nltk.tokenize import wordpunct_tokenize
  tokens = [wordpunct_tokenize(s) for s in sonnet]
  punct = set(['.', ',', '!', ':', ';'])
  filtered = [ [w for w in sentence if w not in punct ] for sentence in tokens]
  last = [ sentence[len(sentence) - 1] for sentence in filtered]

  syllables = [[(word, len(pron), pron) for (word, pron) in cmu_dict if word == w] for w in last]
  return syllables
Project: WebNav    Author: nyu-dl
def compute_idx(pages_path_in, pages_path_out, vocab):


    f = h5py.File(pages_path_in, 'r')

    if prm.att_doc and prm.att_segment_type == 'sentence':
        nltk.download('punkt')
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    os.remove(pages_path_out) if os.path.exists(pages_path_out) else None

    # Save to HDF5
    fout = h5py.File(pages_path_out,'a')

    if prm.att_doc:
        shape = (f['text'].shape[0],prm.max_segs_doc,prm.max_words)
    else:
        shape=(f['text'].shape[0],prm.max_words)

    idxs = fout.create_dataset('idx', shape=shape, dtype=np.int32)
    mask = fout.create_dataset('mask', shape=(f['text'].shape[0],), dtype=np.float32)

    i = 0
    for text in f['text']:
        st = time.time()

        if prm.att_doc:
            if prm.att_segment_type.lower() == 'section' or prm.att_segment_type.lower() == 'subsection':
                segs = ['']
                for line in text.split('\n'):
                    if prm.att_segment_type == 'section':
                        line = line.replace('===', '')
                    if line.strip().startswith('==') and line.strip().endswith('=='):
                        segs.append('')
                    segs[-1] += line.lower() + '\n'
            elif prm.att_segment_type.lower() == 'sentence':
                segs = tokenizer.tokenize(text.lower().decode('ascii', 'ignore'))
            elif prm.att_segment_type.lower() == 'word':
                segs = wordpunct_tokenize(text.decode('ascii', 'ignore'))
            else:
                raise ValueError('Not a valid value for the attention segment type (att_segment_type) parameter. Valid options are "section", "subsection", "sentence", or "word".')

            segs = segs[:prm.max_segs_doc]
            idxs_, _ = utils.text2idx2(segs, vocab, prm.max_words)
            idxs[i,:len(idxs_),:] = idxs_
            mask[i] = len(idxs_)
        else:
            idx, _ = utils.text2idx2([text.lower()], vocab, prm.max_words)
            idxs[i,:] = idx[0]
        i += 1

        #if i > 3000:
        #    break

        print 'processing article', i, 'time', time.time()-st

    f.close()
    fout.close()
Project: WebNav    Author: nyu-dl
def get_candidates(qatp):

    print 'loading data...'
    idf = pkl.load(open(prm.idf_path, "rb"))
    wk = wiki.Wiki(prm.pages_path)

    print 'creating vocabulary...'
    vocab = {}
    for q,_,_,_ in qatp:
        words = wordpunct_tokenize(q.lower())
        for word in words:
            if word in idf:
                vocab[word] = {}


    print 'creating inverted index...'
    i = 0
    for text in wk.get_text_iter():
        if i%10000==0:
            print 'article', i
        words = wordpunct_tokenize(text.lower())
        for word in words:
            if word in vocab:
                vocab[word][i] = 0

        #if i > 500000:
        #    break
        i += 1

    print 'selecting pages...'
    candidates = []
    for i,[q,_,_,_] in enumerate(qatp):
        st = time.time()
        words = wordpunct_tokenize(q.lower())
        scores = {}

        for word in words:
            if word in vocab:
                if len(vocab[word]) < 100000:
                    for pageid in vocab[word].keys(): 
                        if pageid not in scores:
                            scores[pageid] = 0.
                        scores[pageid] += idf[word]
        idxs = np.argsort(np.asarray(scores.values()))[::-1]

        pages = scores.keys()

        if len(idxs)==0:
            print 'error question:', q

        c = OrderedDict()
        for idx in idxs[:prm.max_candidates]:
            c[pages[idx]] = 0

        candidates.append(c)
        print 'sample ' + str(i) + ' time ' + str(time.time()-st)

        #if i > 10000:
        #    break

    return candidates