Python nltk module: ngrams() code examples

We extracted the following 24 code examples from open-source Python projects to illustrate how to use nltk.ngrams().
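Before the project examples, a minimal sketch of the core call (assumes NLTK is installed and the 'punkt' tokenizer data has been downloaded): nltk.ngrams(sequence, n) returns a generator of n-tuples over any sequence.

import nltk

tokens = nltk.word_tokenize("New York is a city with a huge population.")
bigrams = list(nltk.ngrams(tokens, 2))
print(bigrams[:3])
# [('New', 'York'), ('York', 'is'), ('is', 'a')]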

Project: chatbot_ner    Author: hellohaptik
def ngram_list(n, word_list, stop_word_list=None):
        """
        Generate ngrams with width n excluding those that are entirely formed of stop words

        Args:
            n (int): i.e. 1, 2, 3...
            word_list (list of str): list of words
            stop_word_list (list of str, Optional): list of words that should be excluded while obtaining
                                                    list of ngrams

        Returns:
            list of str: List of ngrams formed from the given word list, except those whose tokens are all in the
                         stop words list
        """
        stop_word_set = set(stop_word_list) if stop_word_list else set()
        all_ngrams = nltk.ngrams(word_list, n)
        ngram_list = []
        for ngram in all_ngrams:
            lowered_ngram_tokens = [token.lower() for token in ngram]
            if any(token not in stop_word_set for token in lowered_ngram_tokens):
                ngram_list.append(' '.join(ngram))
        return ngram_list
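A hypothetical call (the excerpt omits its import nltk), showing how all-stopword ngrams are dropped:

words = ['the', 'new', 'york', 'times', 'of', 'the']
print(ngram_list(2, words, stop_word_list=['the', 'of']))
# ['the new', 'new york', 'york times', 'times of']
# 'of the' is dropped because both of its tokens are stop words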
Project: Turkish-Language-NLP-API    Author: WoodProgrammer
def get(self, person_id):
        n = 2
        occurs = []
        grams_arr = []
        # despite the name, these are bigrams (n=2) over the global str_read text
        sixgrams = ngrams(str_read.split(), n)
        for grams in sixgrams:
            x = NGram.compare('{}'.format(person_id), str(grams))
            occurs.append(x)
            grams_arr.append(str(grams))

        main_fields = {'occurs': fields.String, 'word': fields.String}
        datas = {'occurs': '{}'.format(max(occurs) * 1000),
                 'word': '{}'.format(grams_arr[occurs.index(max(occurs))])}
        return marshal(datas, main_fields)
Project: Turkish-Language-NLP-API    Author: WoodProgrammer
def get(self, person_id):
        n = 2
        occurs = []
        grams_arr = []
        sixgrams = ngrams(str_read.split(), n)
        for grams in sixgrams:
            # Python 2 idiom: person_id is a byte string here
            x = NGram.compare('{}'.format(person_id.decode('latin-1')), str(grams))
            occurs.append(x)
            grams_arr.append(str(grams))

        main_fields = {'occurs': fields.String, 'word': fields.String}
        datas = {'occurs': '{}'.format(max(occurs) * 1000),
                 'word': '{}'.format(grams_arr[occurs.index(max(occurs))])}
        return marshal(datas, main_fields)
Project: acl2017-interactive_summarizer    Author: UKPLab
def extract_ngrams2(sentences, stemmer, language, N=2):
    '''
    Parameter Arguments:
    sentences: list of sentences
             ['New York is a city.', 'It has a huge population.']
    N: Length of the n-grams e.g. 1, 2

    return: a list of n-gram strings (tokens joined with spaces), e.g.
    ['new york', 'york is', 'is a', 'a city', 'city .',
    'it has', 'has a', 'a huge', 'huge population', 'population .']
    '''
    ngrams_list = []
    for sent in sentences:
        # handle trailing hyphens, e.g. "magister-," -> "magister,"
        sent = re.sub(r'[-](,?\s)', r'\1', sent)
        ngram_items = list(ngrams(sent2stokens(sent, stemmer, language), N))
        for ngram in ngram_items:
            ngrams_list.append(' '.join(ngram))
    return ngrams_list
Project: acl2017-interactive_summarizer    Author: UKPLab
def extract_nuggets(sentences, nugget_type, language):
    '''
    Parameter Arguments:
    sentences: list of sentences
             ['New York is a city.', 'It has a huge population.']

    return: a list of noun phrases, events, or named entities, e.g. for n-grams:
    ['new york', 'york is', 'a city',
    'it has', 'has a', 'a huge', 'huge population', 'population .']
    '''
    nugget_list = []
    for sent in sentences:
        if nugget_type == 'n-grams':
            nugget_items = list(ngrams(sent2stokens(sent, language), 2))
        elif nugget_type == 'NP':
            nugget_items = get_phrases(sent, 'NP')
        elif nugget_type == 'Phrases':
            nugget_items = get_phrases(sent, 'Phrases')
        elif nugget_type == 'NE':
            nugget_items = get_phrases(sent, 'NE')
        for nugget in nugget_items:
            nugget_list.append(' '.join(nugget))
    return nugget_list
Project: acl2017-interactive_summarizer    Author: UKPLab
def add_sentences(self, sentences):
        """
        @type sentences: list[Sentence]
        """
        counter = self.counter
        G = self.G
        for sent in sentences:
            # count n-gram occurrences (the edge construction below assumes N=2)
            counter.update(ngrams(sent.tokens, self.N))
            G.add_nodes_from(sent.tokens)

        # each counted bigram (s, t) becomes an edge weighted by its frequency
        updated_edges = []
        for v, c in counter.items():
            s = v[0]
            t = v[1]
            updated_edges.append((s, t, c))

        G.add_weighted_edges_from(updated_edges)
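A self-contained sketch of the same bigram-graph pattern, with plain token lists standing in for the project's Sentence objects (names here are illustrative):

from collections import Counter
import networkx as nx
from nltk import ngrams

counter = Counter()
G = nx.Graph()
for tokens in [['new', 'york', 'is', 'big'], ['new', 'york', 'sleeps']]:
    counter.update(ngrams(tokens, 2))   # count bigram occurrences
    G.add_nodes_from(tokens)

G.add_weighted_edges_from((s, t, c) for (s, t), c in counter.items())
print(G['new']['york'])  # {'weight': 2}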
Project: textkit    Author: learntextvis
def words2ngrams(sep, num, tokens):
    '''Convert word tokens into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    ngrams = list(nltk.ngrams(content, num))
    write_csv(ngrams, str(sep))
Project: textkit    Author: learntextvis
def text2ngrams(sep, num, text):
    '''Tokenize plain text into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''
    content = '\n'.join([open(f).read() for f in text])
    try:
        tokens = nltk.word_tokenize(content)
        ngrams = list(nltk.ngrams(tokens, num))
        write_csv(ngrams, str(sep))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
Project: ewe_ebooks    Author: jaymcgrath
def __init__(self, body, author='Anonymous'):

        # accumulators
        hashtags = []

        # Now process cleaned up text with NLTK
        words = []
        bigrams = []
        trigrams = []
        quadgrams = []
        sentences = []


        words = word_tokenize(body)

        sentences.extend(sent_tokenize(body))

        # Strip whitespace from each sentence
        sentences = [sentence.strip() for sentence in sentences]

        # build n-grams over the token list; passing the raw string would yield character n-grams
        bigrams = list(ngrams(words, 2))
        trigrams = list(ngrams(words, 3))
        quadgrams = list(ngrams(words, 4))

        self.body = body
        self.words = words
        self.bigrams = bigrams
        self.trigrams = trigrams
        self.quadgrams = quadgrams
        self.sentences = sentences
        self.hashtags = hashtags
        self.author = author

        #TODO: Create "hashtags" from arbitrary number of rarest words
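Worth noting: nltk.ngrams() accepts any sequence, so passing a raw string (as the original excerpt did) silently yields character n-grams rather than word n-grams:

from nltk import ngrams

print(list(ngrams('to be', 2)))          # [('t', 'o'), ('o', ' '), (' ', 'b'), ('b', 'e')]
print(list(ngrams('to be'.split(), 2)))  # [('to', 'be')]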
Project: ngrambot    Author: jmcgover
def build_ngrams(tokens, low, high):
    LOGGER.debug("Building ngrams from %d to %d", low, high)
    assert low <= high
    assert low > 0
    grams = {}
    for n in range(low, high + 1):
        grams[n] = list(ngrams(tokens, n))
    return grams
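A hypothetical call (assumes a configured LOGGER and from nltk import ngrams):

tokens = 'the quick brown fox'.split()
grams = build_ngrams(tokens, 1, 2)
# grams[1] == [('the',), ('quick',), ('brown',), ('fox',)]
# grams[2] == [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]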
Project: ngrambot    Author: jmcgover
def build_pos_ngrams(tagged, low, high):
    LOGGER.debug("Building POS ngrams from %d to %d", low, high)
    assert low <= high
    assert low > 0
    pos_tokens = []
    pos_words = defaultdict(list)  # requires: from collections import defaultdict
    # split the (word, POS) pairs into a POS sequence and a POS -> words index
    for word, pos in tagged:
        pos_tokens.append(pos)
        pos_words[pos].append(word)
    grams = {}
    for n in range(low, high + 1):
        grams[n] = list(ngrams(pos_tokens, n))
    return grams, pos_words
Project: eXposeDeepNeuralNetwork    Author: joshsaxe
def ngrams_extract(string):
    # assumes random, numpy's zeros/log, and nltk's ngrams are in scope
    if random.random() < SAMPLE_RATE:
        print('[*]', string)
    # character n-grams of lengths 2-5
    grams = (list(ngrams(string, 2)) + list(ngrams(string, 3))
             + list(ngrams(string, 4)) + list(ngrams(string, 5)))
    SIZE = 1024
    # feature hashing: bucket each n-gram into a fixed-size count vector
    vec = zeros((SIZE,))
    for t in grams:
        vec[hash(t) % SIZE] += 1
    return log(vec + 1.0)
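This is the hashing trick applied to character n-grams: an unbounded n-gram vocabulary is folded into a fixed-size, log-scaled count vector. A minimal sketch of the same idea (assumes numpy; the 8-bucket size is illustrative):

import numpy as np
from nltk import ngrams

def hash_ngrams(s, n=2, size=8):
    vec = np.zeros(size)
    for gram in ngrams(s, n):
        vec[hash(gram) % size] += 1   # collisions are tolerated by design
    return np.log(vec + 1.0)

print(hash_ngrams('http://example.com'))
# caveat: str hashing is randomized per process (PYTHONHASHSEED), so persisted
# models should use a stable hash such as hashlib instead of built-in hash()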
Project: Quora-Kaggle    Author: PPshrimpGo
def get_word_ngrams(sequence, n=3):
    tokens = tokenize(sequence)
    return [' '.join(ngram) for ngram in ngrams(tokens, n)]
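A hypothetical call, assuming tokenize() is a word tokenizer along the lines of nltk.word_tokenize:

print(get_word_ngrams('how do I learn python', n=2))
# ['how do', 'do I', 'I learn', 'learn python']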
Project: stance_detection    Author: StanceDetection
def gen_training_features(self, bodies_fpath, stances_fpath):
        print('Generating training features')
        self._train_bodies, self._train_stances = self._read(bodies_fpath, stances_fpath, True)

        print('Generating ngrams')
        ng_start = time.time()
        self._train_unigrams = self._gen_ngrams(1, self._train_bodies, self._train_stances)
        ng_end = time.time()
        print('ngrams generation time:', ng_end - ng_start, 'seconds')

        print('Generating jaccard similarities')
        js_start = time.time()
        self.train_avg_sims, self.train_max_sims = self._gen_jaccard_sims(
                self._train_bodies,
                self._train_stances
        )
        js_end = time.time()
        print('jaccard similarity generation time:', js_end - js_start, 'seconds')

        for i in range(len(self._train_stances)):
            labeled_feature = ({
                'unigrams':self._train_unigrams[i],
                'avg_sims':self.train_avg_sims[i],
                'max_sims':self.train_max_sims[i]},
                self._train_stances[i]['Stance'])
            self._labeled_feature_set.append(labeled_feature)
Project: stance_detection    Author: StanceDetection
def _get_ngrams(self, text, n):
        tokens = nltk.word_tokenize(text)
        tokens = [ token.lower() for token in tokens if len(token) > 1 ]
        return nltk.ngrams(tokens, n)
Project: stance_detection    Author: StanceDetection
def _get_ngrams(self, text, n):
        tokens = nltk.word_tokenize(text)
        tokens = [ token.lower() for token in tokens if len(token) > 1 ]
        ngram_list = list(nltk.ngrams(tokens, n))
        return ngram_list
Project: kaggle-quora-solution-8th    Author: qqgeogor
def get_word_ngrams(sequence, n=3):
    tokens = tokenize(sequence)
    return [' '.join(ngram) for ngram in ngrams(tokens, n)]
Project: clickbait    Author: bhargaviparanjape
def naive_bayes(analysis):  
    tags = []
    words = []
    deps_cc = []
    for sen in analysis["sentences"]:
        tags += sen['pos']
        words += sen['tokens']
        deps_cc += sen["deps_cc"]
    norm = normalize_title(tags, words)

    # word n-grams (1-3) as binary presence features over common_grams
    f1 = []
    current = list(nltk.ngrams(norm.split(), 1)) + list(nltk.ngrams(norm.split(), 2)) + list(nltk.ngrams(norm.split(),3))
    ngram_list = [' '.join(list(g)) for g in current]
    for pos in common_grams:
        if pos in ngram_list:
            f1.append(1)
        else:
            f1.append(0)
    f1 = numpy.array(f1).reshape(1, len(f1))

    # POS n-grams (1-3), same binary encoding
    f2 = []
    current_pos = list(nltk.ngrams(tags, 1)) + list(nltk.ngrams(tags, 2)) + list(nltk.ngrams(tags,3))
    ngram_list = [' '.join(list(g)) for g in current_pos]
    for pos in common_pos_grams:
        if pos in ngram_list:
            f2.append(1)
        else:
            f2.append(0)
    f2 = numpy.array(f2).reshape(1, len(f2))

    # syntactic ngrams
    f3 = []
    current_sngrams = list(syntactic_n_gram(deps_cc, 1)) + list(syntactic_n_gram(deps_cc, 2)) + list(syntactic_n_gram(deps_cc, 3))
    ngram_list = [' '.join(list(g)) for g in current_sngrams]
    for pos in common_sn_grams:
        if pos in ngram_list:
            f3.append(1)
        else:
            f3.append(0)
    f3 = numpy.array(f3).reshape(1, len(f3))

    return [clf1.predict(f1)[0], clf2.predict(f2)[0], clf3.predict(f3)[0]]
Project: clickbait    Author: bhargaviparanjape
def n_gram_analysis_simple(infile, gram, stop):
    # count n-gram frequencies over the lines of infile
    # (the stop flag is currently unused; stop-word filtering is not applied)
    ngram = dict()
    f = open(infile, "r")
    for l in f:
        x = nltk.ngrams(l.split(), gram)
        for w in x:
            if w in ngram:
                ngram[w] += 1
            else:
                ngram[w] = 1
    # print the ten most frequent n-grams
    p = list(ngram.items())
    p.sort(key=lambda x: -x[1])
    print(len(p))
    for x in p[:10]:
        sen = ' '.join(x[0])
        cnt = int(x[1])
        if cnt == 0:
            cnt = 1
        print(sen, cnt)
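The manual dictionary counting above is exactly what collections.Counter provides; a sketch of an equivalent (same one-document-per-line file format assumed):

from collections import Counter
import nltk

def top_ngrams(infile, n, k=10):
    counts = Counter()
    with open(infile) as f:
        for line in f:
            counts.update(nltk.ngrams(line.split(), n))
    return [(' '.join(gram), c) for gram, c in counts.most_common(k)]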
Project: LobbyTrack    Author: regardscitoyens
def getNGrams(raw_string, gram_nb):
    xgrams = ngrams(raw_string.split(), gram_nb)
    return xgrams
Project: Turkish-Language-NLP-API    Author: WoodProgrammer
def get(self, param_word):
        status = False
        n = 2
        occurs = []
        grams_arr = []
        words = []
        for key in r_server.scan_iter():
            words.append(key)

        #sixgrams = ngrams(str_read.split(), n)
        for keys in words:
            # Python 2 idiom: param_word is a byte string here
            x = NGram.compare('{}'.format(param_word.decode('latin-1')), str(keys))
            occurs.append(x)
            grams_arr.append(str(keys))

        # second pass: check whether the word itself is already stored
        for key in r_server.scan_iter():
            if key == param_word:
                status = True

        if status is True:
            main_fields_true = {'word': fields.String, 'status': fields.Boolean}
            datas_true = {'word': '{}'.format(param_word), 'status': status}
            return marshal(datas_true, main_fields_true)
        else:
            main_fields_false = {'occurs': fields.String, 'word': fields.String,
                                 'freq': fields.String, 'status': fields.Boolean}
            datas_false = {'occurs': '{}'.format(max(occurs) * 1000),
                           'word': '{}'.format(grams_arr[occurs.index(max(occurs))]),
                           'freq': r_server.get(param_word), 'status': status}
            return marshal(datas_false, main_fields_false)
Project: acl2017-interactive_summarizer    Author: UKPLab
def extract_ngrams(sentences, stoplist, stemmer, language, n=2):
    """Extract the ngrams of words from the input sentences.

    Args:
        n (int): the number of words for ngrams, defaults to 2
    """
    concepts = []
    for i, sentence in enumerate(sentences):

        # for each ngram of words
        tokens = sent2tokens(sentence, language)
        for j in range(len(tokens)-(n-1)):

            # initialize ngram container
            ngram = []

            # for each token of the ngram
            for k in range(j, j+n):
                ngram.append(tokens[k].lower())

            # do not consider ngrams containing punctuation marks
            marks = [t for t in ngram if not re.search('[a-zA-Z0-9]', t)]
            if len(marks) > 0:
                continue

            # do not consider ngrams composed of only stopwords
            stops = [t for t in ngram if t in stoplist]
            if len(stops) == len(ngram):
                continue

            # stem the ngram
            ngram = [stemmer.stem(t) for t in ngram]

            # add the ngram to the concepts
            concepts.append(' '.join(ngram))
    return concepts
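A hypothetical end-to-end call, with NLTK's stopword list and Snowball stemmer standing in for the project's resources (sent2tokens is the project's own tokenizer; nltk.word_tokenize would play that role here):

from nltk.corpus import stopwords              # requires nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer

stoplist = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
# extract_ngrams(['New York is a city.'], stoplist, stemmer, 'english')
# would yield roughly: ['new york', 'york is', 'a citi']
# ('is a' is all stopwords, 'city .' contains punctuation; 'city' stems to 'citi')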
Project: acl2017-interactive_summarizer    Author: UKPLab
def prune_ngrams(ngrams, stoplist, N=2):
    # keep only ngrams that contain fewer than N stopword tokens
    pruned_list = []
    for ngram in ngrams:
        items = ngram.split(' ')
        i = 0
        for item in items:
            if item in stoplist: i += 1
        if i < N:
            pruned_list.append(ngram)
    return pruned_list
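For example:

print(prune_ngrams(['new york', 'of the', 'the city'], {'of', 'the'}, N=2))
# ['new york', 'the city']  -- 'of the' has two stopword tokens and is dropped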
Project: remotor    Author: jamiebull1
def get_tech(text):
    """Get all technologies from the top 1000 tags on StackOverflow.
    """
    sentences = sent_tokenize(text)
    techs = set()
    for s in sentences:
        tokens = word_tokenize(s)
        techs |= set(tag for tag in tags if tag in tokens)
        bigrams = ['-'.join(ngram) for ngram in ngrams(tokens, 2)]
        techs |= set(tag for tag in tags if tag in bigrams)
        trigrams = ['-'.join(ngram) for ngram in ngrams(tokens, 3)]
        techs |= set(tag for tag in tags if tag in trigrams)
    return list(techs)
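A hypothetical call, with a small tags set standing in for the real top-1000 StackOverflow list (matching is case-sensitive, and multi-word tags are matched via hyphen-joined bigrams/trigrams):

tags = {'python', 'machine-learning', 'natural-language-processing'}
print(get_tech('We use python and machine learning daily.'))
# ['python', 'machine-learning']  (set order, may vary)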