Python nltk module: pos_tag() example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use nltk.pos_tag().

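A minimal sketch of the call pattern that all of the examples below share, assuming the punkt and averaged_perceptron_tagger NLTK data packages are already downloaded:

import nltk

# Tokenize first, then tag; pos_tag() returns (token, Penn Treebank tag) pairs.
tokens = nltk.word_tokenize("NLTK makes part-of-speech tagging straightforward.")
print(nltk.pos_tag(tokens))
# e.g. [('NLTK', 'NNP'), ('makes', 'VBZ'), ...]
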
Project: That-s-Fake    Author: rajeevdesai
def ne_tagging(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
    # flush a trailing entity at the end of the text
    if current_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)
    return continuous_chunk
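A hedged usage sketch for ne_tagging() above; the imports are assumptions (the excerpt does not show them), and the maxent_ne_chunker and words NLTK data packages are required:

# Assumed imports for the excerpt above (not shown in the original file)
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

print(ne_tagging("Barack Obama met Angela Merkel in Paris."))
# e.g. ['Barack Obama', 'Angela Merkel', 'Paris']
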
Project: QProb    Author: quant-trade
def keyword_extractor(data):
    try:
        #np_extractor = NPExtractor(words_wo_stopwords(strip_tags(data)))
        #result = np_extractor.extract()
        text = words_wo_stopwords(strip_tags(data))

        #TODO this is duplicated job, should be improved
        words = word_tokenize(strip_tags(text))
        tagged = pos_tag(words)
        cleaned = filter_insignificant(tagged)
        text = " ".join(cleaned)
        wc = WordCloudMod().generate(text)
        result = list(wc.keys())[:10]
    except Exception as err:
        print(colored.red("At keywords extraction {}".format(err)))
        result = []

    return result


# TODO definitely can be better if we knew where content is
Project: Automatic-Question-Generation    Author: bwanglzu
def _identify_pronoun(self, answer):
        """Calculate percentage of pronouns within answer
        - Args:
            answer(str): answer text
        - Returns:
            percentage(float): ratio of pronouns in answer
        """
        text = nltk.word_tokenize(answer)
        post = nltk.pos_tag(text)
        pronoun_list = ['PRP', 'PRP$', 'WP', 'WP$']
        # init variables
        num_pronouns = 0
        num_terms = len(post)
        percentage = 0
        for k, v in post:
            if v in pronoun_list:
                num_pronouns += 1
        percentage = float(num_pronouns) / num_terms
        return percentage
Project: Automatic-Question-Generation    Author: bwanglzu
def _identify_pronoun2(self, sentence):
        """Calculate percentage of pronouns in the sentence that are in the answer
        - Args:
            sentence(str): question sentence 
        - Returns:
            pronoun_in_sentence(list): pronouns in sentence 
            sentence_len(int): length of current sentence 
        """
        text = nltk.word_tokenize(sentence)
        post = nltk.pos_tag(text)
        pronoun_list = ['PRP', 'PRP$', 'WP', 'WP$']
        pronoun_in_sentence = []
        sentence_len = len(post)
        for k, v in post:
            if v in pronoun_list:
                pronoun_in_sentence.append(k)
        return pronoun_in_sentence, sentence_len
Project: Automatic-Question-Generation    Author: bwanglzu
def _first_tagger_after_answer_span(self, question):
        """Get the first tagger after answer span
        - Args:
            question(string): string of current question 
        - Returns:
            tagger(string): tagger of first term after span
        """
        index = 0
        text = nltk.word_tokenize(question)
        post = nltk.pos_tag(text)
        for idx, t in enumerate(post):
            if t[0] == '_____':
                index = idx + 1
                break
        try:
            return post[index][1]
        except IndexError:
            return 'dummy'
Project: Automatic-Question-Generation    Author: bwanglzu
def _first_tagger_before_answer_span(self, question):
        """Get the first tagger before answer span
        - Args:
            question(string): string of current question 
        - Returns:
            tagger(string): tagger of first term before span
        """
        index = 0
        text = nltk.word_tokenize(question)
        post = nltk.pos_tag(text)
        for idx, t in enumerate(post):
            if t[0] == "_____":
                index = idx - 1
                break
        try:
            return post[index][1]
        except IndexError:
            return 'dummy'
Project: wntf    Author: tonybaloney
def tag(self, lines):
        '''
        Tokenize and categorise the words in the collection of
        text

        :param lines: The list of strings with the text to match
        :type  lines: ``list`` of ``str``

        :rtype: ``list`` of ``tuple``
        :return: The (token, POS tag) pairs for the tokenized text
        '''
        try:
            tokenized_words = nltk.word_tokenize(lines)
            return nltk.pos_tag(tokenized_words)
        except LookupError as le:
            print("Run install_words.py first")
            raise le
Project: PyRATA    Author: nicolashernandez
def brown_data():
  """return the text_length first tokens of the brown corpus tagged in pyrata format"""
  tokens = brown.words()
  tokens = tokens[:text_length]

  pos_tags = nltk.pos_tag(tokens)

  return [{'raw':w, 'pos':p} for (w, p) in pos_tags]
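A hedged sketch of what brown_data() returns, assuming text_length = 5 for the module-level constant used above (it is defined elsewhere in the original file):

# pyrata consumes a list of feature dicts, one per token, e.g.:
# [{'raw': 'The',    'pos': 'DT'},
#  {'raw': 'Fulton', 'pos': 'NNP'},
#  {'raw': 'County', 'pos': 'NNP'},
#  {'raw': 'Grand',  'pos': 'NNP'},
#  {'raw': 'Jury',   'pos': 'NNP'}]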


# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST 
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""


# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Project: StrepHit    Author: Wikidata
def tag_one(self, text, skip_unknown=True, **kwargs):
        """ POS-Tags the given text, optionally skipping unknown lemmas

            :param unicode text: Text to be tagged
            :param bool skip_unknown: Automatically remove unrecognized tags from the result

            Sample usage:

            >>> from strephit.commons.pos_tag import TTPosTagger
            >>> from pprint import pprint
            >>> pprint(TTPosTagger('en').tag_one(u'sample sentence to be tagged fycgvkuhbj'))
            [Tag(word=u'sample', pos=u'NN', lemma=u'sample'),
             Tag(word=u'sentence', pos=u'NN', lemma=u'sentence'),
             Tag(word=u'to', pos=u'TO', lemma=u'to'),
             Tag(word=u'be', pos=u'VB', lemma=u'be'),
             Tag(word=u'tagged', pos=u'VVN', lemma=u'tag')]
        """
        return self._postprocess_tags(make_tags(self.tagger.tag_text(text, **kwargs)),
                                      skip_unknown)
Project: DogeGen    Author: MemeTrash
def _get_base_doge_words(self, eng_text):
        """
        Get all base words from text to make doge phrases from.
        eg. 'Hello there, I am happy' -> ['hello', 'are', 'happy']

        Args:
            eng_text (str): Text to get words from.

        Returns:
            list[str]: List of lower case words to use from text.
        """
        phrase_no_punct = "".join([ch for ch in eng_text if ch not in string.punctuation])
        tagged_words = nltk.pos_tag([w.lower() for w in phrase_no_punct.split(' ') if w.isalpha()])
        chosen_words = []
        for word, tag in tagged_words:
            if tag[0] in ['N', 'V', 'J']:
                # make noun singular
                if tag[0] == 'N':
                    word = self._lemmatizer.lemmatize(word, pos='n')
                # make verb infinitive
                elif tag[0] == 'V':
                    word = self._lemmatizer.lemmatize(word, pos='v')
                chosen_words.append(word.encode('ascii', 'ignore'))  # lemmatize makes word unicode
        return list(set(chosen_words))
Project: DogeGen    Author: MemeTrash
def _get_doge_descriptors(self, word_ls):
        """
        Get descriptors for a set of doge words.
        eg. ['person', 'run'] -> ['much', 'very']

        Args:
            word_ls (list[str]): List of words to use.

        Returns:
            list[str]: List of doge descriptors, eg. 'much', 'very', in order.
        """
        tagged_words = nltk.pos_tag(word_ls)
        chosen_descriptors = []
        for word, tag in tagged_words:
            possible_descs = [MUCH, MANY, SUCH, SO, VERY]
            if tag[0] == 'J':
                possible_descs.remove(VERY)
                possible_descs.remove(SO)
            if len(chosen_descriptors) >= 2:
                allowed_descriptors = [s for s in possible_descs if s not in chosen_descriptors[-2:]]
            else:
                allowed_descriptors = [s for s in possible_descs if s not in chosen_descriptors]
            chosen_descriptors.append(random.choice(allowed_descriptors))
        return chosen_descriptors
Project: minke    Author: DistrictDataLabs
def extract_candidate_words(sents, tags=GOODTAGS, tagged=False, **kwargs):
    """
    Extracts key words based on a list of good part of speech tags.
    If the sentences are already tokenized and tagged, pass in: tagged=True
    """
    normalizer = Normalizer(**kwargs)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))

        # Identify only good words by their tag
        for token, tag in sent:
            if tag in tags:
                for token in normalizer.normalize([token]):
                    yield token


##########################################################################
## Key phrase by text scoring mechanisms
##########################################################################
Project: minke    Author: DistrictDataLabs
def normalize(self, words):
        """
        Normalizes a list of words.
        """
        # Add part of speech tags to the words
        words = nltk.pos_tag(words)

        for word, tag in words:
            if self.lower: word = word.lower()
            if self.strip: word = word.strip()

            if word not in self.stopwords:
                if not all(c in self.punct for c in word):
                    if self.lemmatize:
                        word = self.lemmatizer.lemmatize(word, tag)

                    yield word
Project: PhenVar    Author: NCBI-Hackathons
def tagged_abstracts(RS_pmids_tokenizedabstracts_dict):
    """ Takes a dict of tokenized abstracts
    and tags them with the NLTK part-of-speech tagger.
    Input dictionary: key is the RS ID, value is a dictionary where key is the pmid and value is a list of tokens"""
    RS_pmids_taggedabstracts_dict = {}
    for each_RS in RS_pmids_tokenizedabstracts_dict:
        pmids_taggedabstracts = {}
        pmids_tokenizedabstracts = RS_pmids_tokenizedabstracts_dict[each_RS]
        for pmid in pmids_tokenizedabstracts:
            taggedabstracts_list = []
            for token in pmids_tokenizedabstracts[pmid]:
                tagged = nltk.pos_tag(token)
                taggedabstracts_list.append(tagged)
            pmids_taggedabstracts[pmid] = taggedabstracts_list
        RS_pmids_taggedabstracts_dict[each_RS] = pmids_taggedabstracts
    return RS_pmids_taggedabstracts_dict
Project: one-day-with-cling    Author: mariana-scorp
def from_sentence(sent):
        tokens = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(tokens)

        dg = DependencyGraph()
        for (index, (word, tag)) in enumerate(tagged):
            dg.nodes[index + 1] = {
                'word': word,
                'lemma': '_',
                'ctag': tag,
                'tag': tag,
                'feats': '_',
                'rel': '_',
                'deps': defaultdict(),
                'head': '_',
                'address': index + 1,
            }
        dg.connect_graph()

        return dg
Project: adaware-nlp    Author: mhw32
def prepare_sentence(words,
                     vectorizer=None,
                     lemmatizer=None,
                     max_words=78,
                     return_output=True):
    X = np.ones((max_words, 300))*ZERO_EPSILON
    if return_output:
        y = np.ones((max_words, 300))*ZERO_EPSILON
        raw_pos = [p[1] for p in pos_tag(words)]
        pos     = [str(treebank_to_simple(p, default=wordnet.NOUN)) for p in raw_pos]
        lemmas  = [str(lemmatizer(w, pos=p)) for (w,p) in zip(words, pos)]

    num_words = len(words) if len(words) <= max_words else max_words

    for word_i in range(num_words):
        word_vector = vectorizer(words[word_i])
        X[word_i, :] = word_vector

        if return_output:
            lemma_vector = lemmas[word_i]
            y[word_i, :] = vectorizer(lemma_vector)

    if return_output:
        return X, y
    return X
Project: atap    Author: foxbook
def extract_candidate_phrases(sents, grammar=GRAMMAR, tagged=False):

    # Create the chunker that uses our grammar
    chunker = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.word_tokenize(sent))

        # Parse the sentence, converting the parse tree into a tagged sequence
        sent = normalize(sent)
        if not sent: continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract phrases and rejoin them with space
        phrases = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            ) if key
        ]

        for phrase in phrases:
            yield phrase
Project: PoemGenerator    Author: eugenet12
def get_counts():
    global unigrams
    global bigrams
    global sentences

    for i in xrange(1, NUM_FILES+1):
        if i in SKIP:
            continue
        with open("Shakespeare_parsed/%03d" % i) as f:
            for line in f:
                tokens = get_tokens(line)
                tokens = [t.lower() for t in tokens]
                tags = nltk.pos_tag(tokens)
                if len(tokens) == 0:
                    continue
                sentences.append(tokens)
                prev_word = ""
                for token in tokens:
                    unigrams[token] += 1
                    if not prev_word == "":
                        bigrams[(prev_word,token)] += 1
                    prev_word = token

    top10_uni = unigrams.most_common()[:10]
    top10_bi = bigrams.most_common()[:10]
Project: KDDCUP2016    Author: hugochan
def tag_contexts(doc_id):

    global tags
    if not tags :
        tags = nltk.data.load("help/tagsets/upenn_tagset.pickle")

    words = defaultdict(Counter)
    count = Counter()
    for context in get_contexts(doc_id) :
        for word, tag in nltk.pos_tag(tokenize(context)) :
            words[tag].update([word])

            count.update([tag])


    tag_common_words = {tag : ' '.join(zip(*tag_words.most_common(10))[0]) for tag, tag_words in words.items() }

    for tag, freq in count.most_common(15) :
        print "%4d\t%45s\t%s" % (freq, tags[tag][0], tag_common_words[tag])
Project: Hanhan_Play_With_Social_Media    Author: hanhanwu
def get_NN_entities(post):
    sentences = nltk.tokenize.sent_tokenize(post)
    token_sets = [nltk.tokenize.word_tokenize(s) for s in sentences]
    pos_tagged_token_sets = [nltk.pos_tag(t) for t in token_sets]
    pos_tagged_tokens = [t for v in pos_tagged_token_sets for t in v]

    all_entities = []
    previous_pos = None
    current_entities = []
    for (entity, pos) in pos_tagged_tokens:
        if previous_pos == pos and pos.startswith('NN'):
            current_entities.append(entity.lower())
        elif pos.startswith('NN'):
            if current_entities != []:
                all_entities.append(' '.join(current_entities))
            current_entities = [entity.lower()]
        previous_pos = pos
    # flush the final entity collected at the end of the post
    if current_entities:
        all_entities.append(' '.join(current_entities))
    return all_entities
Project: ai-chatbot-framework    Author: alfredfrancis
def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                # if token in self.stopwords:
                #     continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma
Project: jenova    Author: dungba88
def clean_text(raw_text, filtered_word_types):
    """Clean raw text for bag-of-words model"""
    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)

    # Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # stem words
    stemmer = PorterStemmer()
    stemmed_words = list(map(stemmer.stem, words))

    # Remove stop words if requested
    if filtered_word_types is not None:
        tagged_text = nltk.pos_tag(stemmed_words)
        stemmed_words = [w for w, wtype in tagged_text if not wtype in filtered_word_types]

    # join together
    return " ".join(stemmed_words)
Project: Question-Answering-NNs    Author: nbogdan
def get_lemmas(sent, lemmatizer):
    stop_words = []
    res = []
    for word in sent:
        pos = get_wordnet_pos(nltk.pos_tag([word])[0][1])
        if pos == '':
            lemma = lemmatizer.lemmatize(word)
        else:
            lemma = lemmatizer.lemmatize(word, pos)
        #if(type(lemma) == unicode):
        #    lemma = lemma.encode('ascii', 'ignore')

        if lemma.isdigit():
            res.append('number')
        else:
            res.append(lemma)
    return res
Project: chitti    Author: bhuvi8
def pos_tag_questions(qstn_list):
    res = []
    count = 0 
    for i in qstn_list:
        r = []
        i = i.split(':')
        r.append(i[0])
        r.append(i[1].split()[0])
        i = i[1].split()
        del i[0]
        sent = nltk.word_tokenize(' '.join(i))
        r.append(nltk.pos_tag(sent))
        res.append(tuple(r))
        count += 1
        if (count % 100) == 0:
            print ("processed : " + str(count) )
    return res

# experiment with different features to get better accuracy
# also don't forget to include the same feature extractor in process_grammar.py
Project: rss_skill    Author: forslund
def __init__(self):
        super(RssSkill, self).__init__('RssSkill')
        self._is_reading_headlines = False
        self.feeds = {}
        self.cached_items = {}
        self.cache_time = {}
        try:
            pos_tag('advance')
        except LookupError:
            logger.debug('Tagger not installed... Trying to download')
            dler = Downloader()
            if not dler.download('averaged_perceptron_tagger'):
                logger.debug('Trying alternative source...')
                dler = Downloader(ALT_NLTK_DATA)
                dler.download('averaged_perceptron_tagger',
                              raise_on_error=True)
Project: kaggle-quora-solution-8th    Author: qqgeogor
def pos_tag_text(line,
                 token_pattern=token_pattern,
                 exclude_stopword=stopwords,
                 encode_digit=False):
    token_pattern = re.compile(token_pattern, flags = re.UNICODE | re.LOCALE)
    for name in ["question1", "question2"]:
        l = line[name]
        ## tokenize
        tokens = [x.lower() for x in token_pattern.findall(l)]
        ## stem
        #tokens=l.lower().split()
        #print tokens
        tokens = stem_tokens(tokens, english_stemmer)
        line[name+'_stem']=' '.join(tokens)
        #print tokens
        if exclude_stopword:
            tokens = [x for x in tokens if x not in stopwords]
        tags = pos_tag(tokens)
        tags_list = [t for w,t in tags]
        tags_str = " ".join(tags_list)
        #print tags_str
        line[name+'_pos_tag'] = tags_str
    return line[[ u'question1_stem', u'question1_pos_tag', u'question2_stem',
       u'question2_pos_tag']]
Project: kaggle-quora-solution-8th    Author: qqgeogor
def get_pos_tag(qind):
    q = index_q[qind]
    wl = str(q).lower().split()
    pos_l = nltk.pos_tag(wl)
    q1_pos = []
    for pos in pos_l:
        q1_pos.append(pos[1])
    return q1_pos

# def get_ner_tag(qind):
#     q = index_q[qind]
#     wl = str(q).lower().split()
#     ner_l = nltk.ne_chunk(wl)
#     q1_ner = []
#     for pos in ner_l:
#         q1_ner.append(pos[1])
#     return q1_ner
Project: kaggle-quora-solution-8th    Author: qqgeogor
def getPOSLinks(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    text = nltk.word_tokenize(text)
    pos = nltk.pos_tag(text)
    links = []
    link = []
    active = False
    for w in pos:
        part = w[1]
        word = w[0]
        if(not active and (part[:2] == "DT" or part == "WP" or part == "VB" or part == "IN")):
            active = True
        if(active):
            link.append(wordnet_lemmatizer.lemmatize(word))
        #extract main body
        if(active and (part == "PRP" or part[:2] == "NN" or part == "." )):
            active = False
            links.append(" ".join(link))
            link = []
    return links
Project: twitter-trends-summarizer    Author: yuva29
def tag(path, filename):
    print("Tagging "+path)
    WRITE_HANDLER = open(PREPROCESSED_DATA + filename.strip() + "_features", 'w')
    for line in open(path, 'r'):    
        tokens = line.split()
        if(len(tokens) == 0):
            continue
        tags = pos_tag(tokens) # tag

        features = list()
        for token in tags:
            tok = token[0]
            tag = token[1]
            if tok.lower() not in stop_words:
                features.append(tok+":"+tag)                
        if(len(features)>0):
            WRITE_HANDLER.write(str(features)+'\n\n')
        else: ## EMPTY lines
            WRITE_HANDLER.write('\n\n')
Project: Question-Answering-System    Author: AdityaAS
def _analyze_query(self):
        tagged = nltk.pos_tag(self.ir_query)
        ir_query_tagged = []
        for word, pos in tagged:
            pos = {
                pos.startswith('N'): wordnet.NOUN,
                pos.startswith('V'): wordnet.VERB,
                pos.startswith('J'): wordnet.ADJ,
                pos.startswith('R'): wordnet.ADV,
                }.get(True, None)
            if pos:
                synsets = wordnet.synsets(word, pos=pos)
            else:
                synsets = wordnet.synsets(word)
            ir_query_tagged.append((word, synsets))

        # Add additional special hidden term
        ir_query_tagged.append(('cause', [wordnet.synset('cause.v.01')]))
        self.ir_query_tagged = ir_query_tagged
Project: StanfordNER    Author: pandahuang
def combine_pos_tag(self, pos_tag):
        noun = ['NN', 'NNS', 'NNP', 'NNPS']
        adjective = ['JJ', 'JJR', 'JJS']
        adverb = ['RB', 'RBR', 'RBS']
        verb = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
        wh = ['WDT', 'WP', 'WRB']
        if pos_tag in noun:
            return 'NN'
        elif pos_tag in adjective:
            return 'JJ'
        elif pos_tag in adverb:
            return 'RB'
        elif pos_tag in verb:
            return 'VB'
        elif pos_tag in wh:
            return 'WP'
        else:
            return pos_tag
Project: alan    Author: camtaylor
def branch(words):
  """
    This initial filter of our input sentence.
    It tokenizes the words and tags the words with parts of speech.
    It then passes the tokenized and tagged words to 1 of 3 functions.
    A sentence is either declarative() , interrogative() , or imperative()

    Args:
      words (String): The words inputted by the user
    Returns:
      String: response from one of the three functions that handle type of sentences.
  """
  parts_of_speech =  nltk.pos_tag(nltk.word_tokenize(words))
  leading_word = parts_of_speech[0][1][0]
  if leading_word == 'W':
    return interrogative(parts_of_speech[1:])
  elif leading_word == "V":
    return imperative(parts_of_speech)
  else:
    return declarative(parts_of_speech)
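A hedged usage sketch of the dispatch above; interrogative(), imperative() and declarative() are defined elsewhere in the project, so only the routing is illustrated:

# "What" is tagged 'WP', so the leading letter 'W' routes to interrogative(),
# while a plain statement falls through to declarative().
branch("What is the capital of France")
branch("The sky is blue today")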
Project: sia-cog    Author: deepakkumar1984
def tokenize(data, language="english", filterStopWords = False, tagging = False):
    result = {}
    tags = []
    filterChars = [",", ".", "?", ";", ":", "'", "!", "@", "#", "$", "%", "&", "*", "(", ")", "+", "{", "}", "[", "]", "\\", "|"]
    sent_token = nltk.tokenize.sent_tokenize(data, language)
    word_token = nltk.tokenize.word_tokenize(data, language)
    word_token = [w for w in word_token if not w in filterChars]
    if filterStopWords is True:
        stop_words = set(stopwords.words(language))
        word_token = [w for w in word_token if not w in stop_words]

    if tagging is True:
        tags = nltk.pos_tag(word_token)

    result = {"sent_token": sent_token, "word_token": word_token, "pos_tag": tags}
    return json.loads(jsonpickle.encode(result, unpicklable=False))
Project: RePhraser    Author: MissLummie
def change_sentence(self):
        text = nltk.tokenize.word_tokenize(self._sentence)
        changed = False
        for cur in nltk.pos_tag(text):
            if (cur[1] == "NN" or cur[1] == "NNP" or cur[1] == "RPR"):
                foundedTmura = self.getFromDB(cur[0])
                if foundedTmura == None:
                    foundedTmura = getTmura(cur[0])
                    if foundedTmura != "not found":
                        self.add2DB(cur[0], foundedTmura)
                if foundedTmura != "not found" and changed == False:
                    if (foundedTmura.find("OR")):
                        foundedTmura = foundedTmura.replace('OR', 'or')

                    if randrange(2) == 0:
                        rep = cur[0] + ", " + foundedTmura + ", "
                    else:
                        rep = cur[0] + "(" + foundedTmura + ") "

                    self._sentence = self._sentence.replace(cur[0], rep)
                    changed = True
        return self._sentence
Project: AirbnbReviewAnalyzer    Author: mrsata
def analysis(reviews_collection_text):
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        raw_data = f.read()
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        comments = f.readlines()
    data = raw_data.replace('\n', ' ')
    data_lower = data.lower()
    tokens_with_punc = word_tokenize(data_lower)
    tokens = RegexpTokenizer(r'\w+').tokenize(data_lower)
    print("--- Most frequent tokens ---\n",
        FreqDist(tokens_with_punc).most_common(15))
    print("--- Tokens without punctuation ---\n",
        FreqDist(tokens).most_common(15))
    stop = set(stopwords.words('english'))
    words = [word for word in tokens if word not in stop]
    print("--- Most frequent words ---\n", FreqDist(words).most_common(15))
    tagged = pos_tag(words)
    nouns = [word for word, pos in tagged if (pos == 'NN')]
    print("--- Most frequent nouns ---\n", FreqDist(nouns).most_common(15))
    adjts = [word for word, pos in tagged if (pos == 'JJ')]
    print("--- Most frequent adjective ---\n", FreqDist(adjts).most_common(15))
    tokns = [RegexpTokenizer(r'\w+').tokenize(comment) for comment in comments]
    lxdst = [lexical_density(token) for token in tokns if len(token) > 0]
    avgld = sum(lxdst) / len(comments)
    print("--- Average lexical density ---\n", avgld)
Project: NLP_question_answering_system_project    Author: Roshrini
def whereRules(sentenceOriginal):
    score = 0
    sentence = sentenceOriginal.lower()

    # for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentenceOriginal))):
    #         if type(chunk) is nltk.tree.Tree:
    #             if 'LOCATION' in chunk.label() or 'GPE' in chunk.label():
    #                 score += 10

    # RULE 2
    for word in LOCPREP:
        if word in sentence:
            score += 4

    # RULE 3
    for word in LOCATION:
        if word in sentence:
            score += 6

    return score

# WHEN RULES
Project: coala-bears    Author: coala
def check_imperative(self, paragraph):
        """
        Check the given sentence/s for Imperatives.

        :param paragraph:
            The input paragraph to be tested.
        :return:
            A tuple of 2 elements (invalid word, part of speech),
            or None if no invalid word is found.
        """
        words = nltk.word_tokenize(nltk.sent_tokenize(paragraph)[0])
        # VBZ : Verb, 3rd person singular present, like 'adds', 'writes'
        #       etc
        # VBD : Verb, Past tense , like 'added', 'wrote' etc
        # VBG : Verb, Present participle, like 'adding', 'writing'
        word, tag = nltk.pos_tag(['I'] + words)[1:2][0]
        if(tag.startswith('VBZ') or
           tag.startswith('VBD') or
           tag.startswith('VBG') or
           word.endswith('ing')):  # Handle special case for VBG
            return (word, tag)
        else:
            return None
Project: pinhook-tilde    Author: archangelic
def word_split(self, sentence):
        words = re.split(self.word_split_pattern, sentence)
        words = [w for w in words if len(w) > 0]
        words = ["::".join(tag) for tag in nltk.pos_tag(words)]
        return words
Project: pinhook-tilde    Author: archangelic
def word_split(self, sentence):
        words = re.split(self.word_split_pattern, sentence)
        words = [w for w in words if len(w) > 0]
        words = ["::".join(tag) for tag in nltk.pos_tag(words)]
        return words
Project: pinhook-tilde    Author: archangelic
def word_split(self, sentence):
        words = re.split(self.word_split_pattern, sentence)
        words = [w for w in words if len(w) > 0]
        words = ["::".join(tag) for tag in nltk.pos_tag(words)]
        return words
Project: postcards    Author: abertschi
def _find_nouns(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(tokens)
        nouns = [word for word, pos in tagged \
                 if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]

        filter_keywords = ['chuck', 'norris', 'quot']
        filtered = [i for i in nouns if not any(f in i.lower() for f in filter_keywords)]
        return filtered
Project: Automatic-Question-Generation    Author: bwanglzu
def _count_token_with_match(self, answer, match):
        """Count answer match FLAG 
        """
        text = nltk.word_tokenize(answer)
        post = nltk.pos_tag(text)
        count = 0
        for k, v in post:
            if v == match:
                count += 1
        return count
Project: ask_data_science    Author: AngelaVC
def is_noun(word):
    POS = nltk.pos_tag([word])[0][1]
    return POS.startswith('NN')
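A quick illustration of the helper above; note the tagger sees the word with no sentence context, so isolated-word tags can differ from in-sentence tags:

print(is_noun("table"))    # True  -- 'table' in isolation is tagged 'NN'
print(is_noun("quickly"))  # False -- 'quickly' is tagged 'RB'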
Project: SocialNPHS    Author: SocialNPHS
def get_tweet_tags(tweet):
    """ Break up a tweet into individual word parts """
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    # replace handles with real names
    for n, tok in enumerate(tokens):
        if tok.startswith('@'):
            handle = tok.strip("@")
            if handle in user.students:
                # If we have a database entry for the mentioned user, we can
                # easily substitute a full name.
                usr = user.NPUser(handle)
                tokens[n] = usr.fullname
            else:
                # If there is no database entry, we use the user's alias. While
                # this is the full name in many cases, it is often not reliable
                usr = api.get_user(handle)
                tokens[n] = usr.name
    tagged = nltk.pos_tag(tokens)
    # In nltk, if a teacher's name is written with a period after an
    # abbreviated prefix, it is awkwardly broken up into 3 tags
    for n, tag in enumerate(tagged):
        # If there is the weird period after the prefix,
        if tag[1] == '.':
            # and it is in fact splitting up a person's name,
            if tagged[n - 1][1] == 'NNP' and tagged[n + 1][1] == 'NNP':
                if tagged[n - 1][0] in ['Mr', 'Ms', 'Mrs', 'Mx']:
                    # combine it into the actual name,
                    tagged[n - 1] = ('{}. {}'.format(tagged[n - 1][0],
                                                     tagged[n + 1][0]), 'NNP')
                    # and then remove the extra tags.
                    del tagged[n + 1]
                    del tagged[n]
    return tagged
Project: open-sesame    Author: Noahs-ARK
def normalize_tokens(self):
        if len(self.stindices) != len(self.enindices):
            sys.stderr.write("\t\tIssue: overlapping tokenization for multiple tokens\n")
            return
        start = {}
        idx = 0
        for s in sorted(self.stindices):
            self.stindices[s] = idx
            start[idx] = s
            idx += 1
        end = {}
        idx = 0
        for t in sorted(self.enindices):
            self.enindices[t] = idx
            end[idx] = t
            if idx > 0 and end[idx - 1] > start[idx]:
                sys.stderr.write("\t\tIssue: overlapping tokenization of neighboring tokens\n")
                return
            token = self.text[start[idx] : t + 1].strip()
            if " " in token:
                sys.stderr.write("\t\tIssue: incorrect tokenization "  + token + "\n")
                return
            if token == "": continue
            self.tokens.append(token)
            idx += 1
        try:
            self.nltkpostags = [ele[1] for ele in pos_tag(self.tokens)]
            for idx in xrange(len(self.tokens)):
                tok = self.tokens[idx]
                if self.nltkpostags[idx].startswith("V"):
                    self.nltklemmas.append(lemmatizer.lemmatize(tok, pos='v'))
                else:
                    self.nltklemmas.append(lemmatizer.lemmatize(tok))
        except IndexError:
            print self.tokens
            print pos_tag(self.tokens)
        return True
Project: tokenquery    Author: ramtinms
def tag(self, tokens):
        """
            add pos tags to token objects

            :param tokens: list of token objects
            :type tokens: list(Token)
            :return: label augmented list of Token objects
            :rtype: list(Token)
        """
        tags = pos_tag([token.get_text() for token in tokens])
        for token, tag in zip(tokens, tags):
            token.add_a_label('pos', tag[1])
        return tokens
Project: QAServer    Author: fssqawj
def pos(text):
    tokens = nltk.word_tokenize(text)
    wordpos = nltk.pos_tag(tokens)
    return wordpos
Project: Education-Explorer    Author: imbiswas
def __tagPartsOfSpeech(words):
    return [pair[1] for pair in nltk.pos_tag(words)]
Project: StrepHit    Author: Wikidata
def tag(text, tt_home):
    # Default NLTK's tokenizer
    # TreebankWordTokenizer + PunktSentenceTokenizer
    nltk_start = time()
    tokens = word_tokenize(text)
    # Default NLTK's POS tagger
    # ?
    # Use tagset='universal' for universal tagset
    nltk_tagged = pos_tag(tokens)
    nltk_end = time()
    nltk_execution = nltk_end - nltk_start
    logger.info("NLTK took %f seconds" % nltk_execution)

    # TreeTagger wrapper
    # Tokenization: ?
    # Default language: English
    # English: trained on Penn treebank
    # Default flags: -token -lemma -sgml -quiet -no-unknown
    tt_start = time()
    tt = TreeTagger(TAGDIR=tt_home)
    raw_tags = tt.tag_text(text)
    tt_end = time()
    tt_execution = tt_end - tt_start
    tt_tagged = make_tags(raw_tags)
    logger.info("TreeTagger took %f seconds" % tt_execution)
    return (nltk_tagged, nltk_execution), (tt_tagged, tt_execution)
Project: StrepHit    Author: Wikidata
def tag_one(self, text, tagset, **kwargs):
        """ POS-Tags the given text """
        return pos_tag(word_tokenize(text), tagset=tagset)