Python nltk module: bigrams() example source code

The following code examples, extracted from open-source Python projects, illustrate how to use nltk.bigrams().
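For reference, nltk.bigrams(seq) returns a generator over the adjacent token pairs of a sequence:

import nltk

tokens = "the quick brown fox".split()
print(list(nltk.bigrams(tokens)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]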

Project: codenn | Author: sriniiyer
def tokenize(text):
    # text = NB.remove_punctuation(text)
    # normalize to lowercase ASCII, replacing non-ASCII characters
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    text = text.encode('ascii', 'replace').decode('ascii').strip().lower()
    # split punctuation into separate tokens, but keep in-word apostrophes and
    # hyphens so contractions like "don't" survive stemming intact
    word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]
    biword = list(nltk.bigrams(word))    # computed but unused below
    triword = list(nltk.trigrams(word))  # computed but unused below
    # word = [w for w in word if w not in stopwords.words('english')]
    return word  # swap in biword/triword to return bigrams/trigrams instead
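This excerpt assumes module-level imports and a shared stemmer along these lines (a sketch; the actual setup lives elsewhere in the codenn source):

import re
import nltk

porter = nltk.PorterStemmer()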
Project: Price-Comparator | Author: Thejas-1 (this function is vendored from NLTK's sentiment utilities; identical copies also appear in PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, neighborhood_mood_aws, hate-to-hugs, beepboop, kind2anki, and but_sentiment)
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(document)
    return features

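Note that extract_bigram_feats calls nltk.bigrams(document) afresh for every candidate bigram; an equivalent variant (an illustrative sketch, not the NLTK implementation) precomputes the document's bigram set once:

import nltk

def extract_bigram_feats_fast(document, bigrams):
    # build the set of contiguous document bigrams a single time
    doc_bigrams = set(nltk.bigrams(document))
    return {'contains({0} - {1})'.format(b0, b1): (b0, b1) in doc_bigrams
            for b0, b1 in bigrams}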
Project: review-classification | Author: vishnupriyam
def createbigramvocabulary(reviewfile, vocabfile):
    createvocabulary(reviewfile, vocabfile)

    all_bigrams = []
    with open(reviewfile, "r") as finput:
        for line in finput:
            # '*' and '$' mark the start and end of each review so boundary
            # bigrams are counted; line[1:] skips the leading +/- class label
            tokenized_line = ['*']
            tokenized_line.extend(word_tokenize(line[1:]))
            tokenized_line.append('$')
            all_bigrams.extend(bigrams(tokenized_line))

    c = Counter(all_bigrams)

    # append every bigram seen at least 3 times, skipping the +/- class labels
    with open(vocabfile, "a") as foutput:
        for b in c:
            if b[0] != "+" and b[0] != "-" and c[b] >= 3:
                foutput.write(b[0] + " " + b[1] + "\n")
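The '*' and '$' markers make review boundaries visible to the bigram counts:

from nltk import word_tokenize, bigrams

tokens = ['*'] + word_tokenize("great product") + ['$']
print(list(bigrams(tokens)))
# [('*', 'great'), ('great', 'product'), ('product', '$')]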
Project: aueb.twitter.sentiment | Author: nlpaueb
def posTrigramsScore(trigrams,category,pos_tags_trigrams,labels):

    #keep pos tags trigrams of specific category
    trigrams_category = subList(pos_tags_trigrams,labels,category)

    #initialize dictionary
    d = {}

    #calculate score for every trigram
    for trigram in trigrams:
        d[trigram] = score(trigram,category,trigrams_category,pos_tags_trigrams)

    return d

#calculate bigram's f1 score
Project: aueb.twitter.sentiment | Author: nlpaueb
def __init__(self, lexicon):
        #initialize two dictionaries (unigrams and bigrams)
        self.d_unigrams = {}
        self.d_bigrams = {}

        #select which lexicon to load
        if lexicon == 0:
            self.loadHashtagLexicon1()
        elif lexicon == 1:
            self.loadHashtagLexicon2()
        elif lexicon == 2:
            self.loadMaxDiffTwitterLexicon()
        elif lexicon == 3:
            self.loadSentiment140Lexicon1()
        elif lexicon == 4:
            self.loadSentiment140Lexicon2()
        elif lexicon == 5:
            self.loadEmotionLexicon()
        else:
            print("Lexicon unavailable, please load another one")

    #HashtagSentimentAffLexNegLex
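A hypothetical instantiation, assuming (as the loader methods below suggest) that this constructor belongs to the project's NRCLexicon class and the lexicon files are on disk:

lex = NRCLexicon(2)  # hypothetical: loads the MaxDiff Twitter lexicon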
Project: aueb.twitter.sentiment | Author: nlpaueb
def loadUnigrams(self,path,reverse=False):
        # lexicon lines are "<key>\t<value>"; reverse handles files where
        # the two columns are swapped
        with open(path, encoding="utf8") as f:
            for line in f:
                key = line.split("\t")[0]
                value = line.split("\t")[1]

                if reverse:
                    self.d_unigrams[value] = float(key)
                else:
                    self.d_unigrams[key] = float(value)

    #load bigrams lexicon
Project: aueb.twitter.sentiment | Author: nlpaueb
def score(self,tokens):
        total = 0.0
        #score for unigrams
        for token in tokens:
            total += self.d_unigrams.get(token,0.0)

        #score for bigrams, if the bigram lexicon is loaded
        if len(self.d_bigrams) > 0:
            #unique bigrams of the message
            for bigram in set(bigrams(tokens)):
                total += self.d_bigrams.get(bigram,0.0)

        return total
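A toy sketch of the scoring, where lex is a placeholder instance with hand-filled dictionaries:

lex.d_unigrams = {'good': 1.5}
lex.d_bigrams = {('not', 'good'): -2.0}
print(lex.score(['not', 'good']))  # 1.5 - 2.0 = -0.5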

    #compute the number of tokens(words) that appear in the lexicon
Project: aueb.twitter.sentiment | Author: nlpaueb
def getTrigramsSet(pos_bigrams):
    s = set()

    for x in pos_bigrams:
        for trigram in x:
            s.add(trigram)

    return list(s)

#calculate bigrams of every item of the list l
Project: aueb.twitter.sentiment | Author: nlpaueb
def getBigrams(l):
    return [list(bigrams(x)) for x in l]

#calculate trigrams of every item of the list l
Project: aueb.twitter.sentiment | Author: nlpaueb
def posBigramsScore(bigrams,category,pos_tags_bigrams,labels):
    #keep pos tags bigrams of specific category
    bigrams_category = subList(pos_tags_bigrams,labels,category)

    #initialize dictionary
    d = {}

    #calculate score for every bigram
    for bigram in bigrams:
        d[bigram] = score(bigram,category,bigrams_category,pos_tags_bigrams)

    return d

#calculate pos trigram score
Project: aueb.twitter.sentiment | Author: nlpaueb
def loadHashtagLexicon2(self):
        folder = "NRC-Hashtag-Sentiment-Lexicon-v0.1/"
        file1 = "unigrams-pmilexicon.txt"
        file2 = "bigrams-pmilexicon.txt"

        #clear previous dictionaries
        self.clearDictionaries()

        #load unigrams
        self.loadUnigrams(NRCLexicon.directory+folder+file1)

        #load bigrams
        self.loadBigrams(NRCLexicon.directory+folder+file2)

    #MaxDiff-Twitter-Lexicon
Project: aueb.twitter.sentiment | Author: nlpaueb
def loadMaxDiffTwitterLexicon(self):
        folder = "MaxDiff-Twitter-Lexicon/"
        file1 = "Maxdiff-Twitter-Lexicon_-1to1.txt"

        #clear previous dictionaries
        self.clearDictionaries()

        #load unigrams - reverse = true due to the .txt file format
        self.loadUnigrams(NRCLexicon.directory+folder+file1,True)

        #this lexicon has no bigrams so d_bigrams remains empty

    #Sentiment140AffLexNegLex
Project: aueb.twitter.sentiment | Author: nlpaueb
def loadSentiment140Lexicon1(self):
        folder = "Sentiment140AffLexNegLex/"
        file1 = "S140-AFFLEX-NEGLEX-unigrams.txt"
        file2 = "S140-AFFLEX-NEGLEX-bigrams.txt"

        #clear previous dictionaries
        self.clearDictionaries()

        #load unigrams
        self.loadUnigrams(NRCLexicon.directory+folder+file1)

        #load bigrams
        self.loadBigrams(NRCLexicon.directory+folder+file2)

    #Sentiment140-Lexicon-v0.1
Project: aueb.twitter.sentiment | Author: nlpaueb
def loadSentiment140Lexicon2(self):
        folder = "Sentiment140-Lexicon-v0.1/"
        file1 = "unigrams-pmilexicon.txt"
        file2 = "bigrams-pmilexicon.txt"

        #clear previous dictionaries
        self.clearDictionaries()

        #load unigrams
        self.loadUnigrams(NRCLexicon.directory+folder+file1)

        #load bigrams
        self.loadBigrams(NRCLexicon.directory+folder+file2)

    #NRC-Emotion-Lexicon-v0.92
Project: textkit | Author: learntextvis
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. Bigrams are two-word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    bigrams = []
    try:
        bigrams = list(nltk.bigrams(content))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    # emit one bigram per line, joined by the separator
    for bigram in bigrams:
        output(sep.join(bigram))
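The core transformation, stripped of the CLI plumbing:

import nltk

content = ['hello', 'world', '!']
print('\n'.join(' '.join(b) for b in nltk.bigrams(content)))
# hello world
# world !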
Project: review-classification | Author: vishnupriyam
def bigram_predict(testSet,PP,PN,positive_probabilities,negative_probabilities,unseen_pos_prob,unseen_neg_prob):
    predicted_class = []
    for review in testSet:
        # start from the log class priors
        negative_probab = math.log10(PN)
        positive_probab = math.log10(PP)
        review_words = ['*']
        review_words.extend(word_tokenize(review))
        review_words.append('$')
        review_bigrams = bigrams(review_words)
        for bigram in review_bigrams:
            w = bigram[0] + " " + bigram[1]
            if w in negative_probabilities and w in positive_probabilities:
                # bigram seen in training: use its probability directly
                negative_probab += math.log10(negative_probabilities[w])
                positive_probab += math.log10(positive_probabilities[w])
            elif bigram[0] in negative_probabilities and bigram[0] in positive_probabilities:
                # back off to the unigram probability of the first word
                negative_probab += math.log10(negative_probabilities[bigram[0]])
                positive_probab += math.log10(positive_probabilities[bigram[0]])
            else:
                # completely unseen: fall back to the unseen-token probability
                negative_probab += math.log10(unseen_neg_prob)
                positive_probab += math.log10(unseen_pos_prob)

        predicted_class.append('-' if negative_probab > positive_probab else '+')
    return predicted_class
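A toy invocation with hand-made probabilities (all values are placeholders):

preds = bigram_predict(
    ["good movie"], PP=0.5, PN=0.5,
    positive_probabilities={"* good": 0.4, "good movie": 0.4, "movie $": 0.4},
    negative_probabilities={"* good": 0.1, "good movie": 0.1, "movie $": 0.1},
    unseen_pos_prob=0.01, unseen_neg_prob=0.01)
print(preds)  # ['+']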
Project: Kaggle_HomeDepot | Author: ChenglongChen
def get_valid_bigram_words(self, words):
        _words = []
        digit_re = re.compile(r"\d+")  # raw string avoids an invalid-escape warning
        for w1, w2 in nltk.bigrams(words):
            # keep a bigram only if both words are long enough ...
            if len(w1) >= self.min_len and len(w2) >= self.min_len:
                # ... neither is a stopword (when exclusion is on) ...
                if (not self.exclude_stopwords) or (w1 not in config.STOP_WORDS and w2 not in config.STOP_WORDS):
                    # ... and neither contains a digit (when digit-skipping is on)
                    if (not self.skip_digit) or (not digit_re.search(w1) and not digit_re.search(w2)):
                        _words.append(w1 + " " + w2)
        return _words
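A minimal stand-in showing the filtering, assuming the method above is available at module scope and using hypothetical attribute values:

class _Demo:
    min_len = 2
    exclude_stopwords = False
    skip_digit = True
    get_valid_bigram_words = get_valid_bigram_words  # reuse the method above

print(_Demo().get_valid_bigram_words(["deck", "rail", "42"]))
# ['deck rail'] - ('rail', '42') is dropped because '42' contains digits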
Project: Tzara---A-Personal-Assistant | Author: Suman7495
def Markov_generate_unigram(seed):
    seed = ''.join(seed)
    next_word_list = []
    # collect every word that follows the seed word in the corpus,
    # stopping before the last token to avoid indexing past the end
    for i, word in enumerate(data[:-1]):
        if seed == word:
            next_word_list.append(data[i + 1])
    if len(next_word_list) == 0:
        return nltk.bigrams(["you", "are"])
    # the most frequent follower becomes the second half of the seed bigram
    cfdist = nltk.FreqDist(next_word_list)
    next_word = cfdist.max()
    return nltk.bigrams([seed, next_word])
Project: Tzara---A-Personal-Assistant | Author: Suman7495
def Markov_generate_bigrams(tuples):
    index_list = []
    data_bigrams = nltk.bigrams(data)
    # collect every word that follows this bigram in the corpus; the bounds
    # check avoids indexing past the end for the final bigram
    for idx, bigram in enumerate(data_bigrams):
        if tuples == bigram and idx + 2 < len(data):
            index_list.append(data[idx + 2])
    return index_list
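Both Markov helpers read a module-level data list of corpus tokens, e.g. (a placeholder corpus):

import nltk

data = nltk.word_tokenize("you are what you eat . you are kind .")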
Project: ircbot | Author: pbzweihander
def calc_cfd(doc):
    # Calculate conditional frequency distribution of bigrams
    words = [w for w, t in Mecab().pos(doc)]
    bigrams = nltk.bigrams(words)
    return nltk.ConditionalFreqDist(bigrams)
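The resulting ConditionalFreqDist maps each word to the frequency distribution of its successors; with a plain English token list standing in for the Mecab output:

import nltk

words = "the cat sat on the mat".split()
cfd = nltk.ConditionalFreqDist(nltk.bigrams(words))
print(cfd['the'].most_common())  # [('cat', 1), ('mat', 1)]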
Project: Tzara---A-Personal-Assistant | Author: Suman7495
def converse(raw_sentence):
    words_in_sent = raw_sentence.split()
    if len(words_in_sent) > 1:
        bigrams = nltk.bigrams(words_in_sent)
    else:
        bigrams = Markov_generate_unigram(words_in_sent)

    text_len = 20
    generated_lines = []

    # grow one candidate line from each seed bigram
    for tuples in bigrams:
        line = []
        line.append(''.join(tuples[0]).title() + " ")
        line.append(''.join(tuples[1]) + " ")
        for i in range(text_len):
            next_words = Markov_generate_bigrams(tuples)
            if not next_words:
                break
            # pick the most frequent follower and slide the bigram window
            cfdist = nltk.FreqDist(next_words)
            next_word = cfdist.max()
            line.append(next_word + " ")
            tuples = (tuples[1], next_word)

        generated_lines.append(line)

    # keep the longest candidate, truncated at the first sentence-ending mark
    longest_line = ''
    for line in generated_lines:
        stri = ''.join(line)
        for truncate_char in ".?!":
            if truncate_char in stri:
                stri = stri[:stri.index(truncate_char)]
                break
        if len(stri) > len(longest_line):
            longest_line = stri.strip() + "."

    return longest_line
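A hypothetical call, assuming data has been populated as above:

print(converse("how are you"))  # prints the longest Markov-generated line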