Python nltk module: ne_chunk() code examples

The following 26 code examples, extracted from open-source Python projects, illustrate how to use nltk.ne_chunk().
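Most of the snippets below share one pipeline: nltk.word_tokenize, then nltk.pos_tag, then nltk.ne_chunk. Here is a minimal sketch of that pattern, assuming the required NLTK data packages have been downloaded; the sample sentence and variable names are illustrative only.

import nltk

# one-time downloads assumed by the pipeline below
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker'); nltk.download('words')

sentence = "Mark works at Google in London."   # illustrative input
tokens = nltk.word_tokenize(sentence)          # split into word tokens
tagged = nltk.pos_tag(tokens)                  # attach part-of-speech tags
tree = nltk.ne_chunk(tagged)                   # wrap named entities in Tree nodes

# named entities come back as nltk.Tree subtrees labelled PERSON, ORGANIZATION, GPE, ...
for node in tree:
    if isinstance(node, nltk.Tree):
        print(node.label(), " ".join(word for word, pos in node.leaves()))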

Project: That-s-Fake    Author: rajeevdesai    | project source | file source
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def ne_tagging(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
            else:
                continue
    return continuous_chunk
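A usage sketch for ne_tagging; the input string is illustrative, and exactly which spans come back depends on the chunker model.

text = "Barack Obama visited Paris last week."
print(ne_tagging(text))   # collects the multi-word entity strings found by ne_chunk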
Project: memex-dossier-open    Author: dossier    | project source | file source
def process(self, fc, context=None):
        text_source = self.config.get('text_source')
        if text_source and text_source in fc:
            text = fc[text_source]
        else:
            return fc
        names = defaultdict(StringCounter)
        for sent in nltk.sent_tokenize(text):
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label'):
                    label = chunk.label()
                    name = ' '.join(c[0] for c in chunk.leaves())
                    if not isinstance(name, unicode):
                        name = unicode(name, 'utf-8')
                    name = cleanse(name)
                    #print chunk.node, name
                    names[label][name] += 1
        for entity_type, name_counts in names.items():
            fc[entity_type] = name_counts
        return fc
Project: keras-textgen    Author: kenoma    | project source | file source
def get_continuous_chunks(self, text):
    chunked = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == nltk.Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue
    return continuous_chunk
Project: kaggle-quora-solution-8th    Author: qqgeogor    | project source | file source
def get_pos_tag(qind):
    q = index_q[qind]
    wl = str(q).lower().split()
    pos_l = nltk.pos_tag(wl)
    q1_pos = []
    for pos in pos_l:
        q1_pos.append(pos[1])
    return q1_pos

# def get_ner_tag(qind):
#     q = index_q[qind]
#     wl = str(q).lower().split()
#     ner_l = nltk.ne_chunk(wl)
#     q1_ner = []
#     for pos in ner_l:
#         q1_ner.append(pos[1])
#     return q1_ner
Project: NLP_question_answering_system_project    Author: Roshrini    | project source | file source
def whereRules(sentenceOriginal):
    score = 0
    sentence = sentenceOriginal.lower()

    # for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentenceOriginal))):
    #         if type(chunk) is nltk.tree.Tree:
    #             if 'LOCATION' in chunk.label() or 'GPE' in chunk.label():
    #                 score += 10

    # RULE 2
    for word in LOCPREP:
        if word in sentence:
            score += 4

    # RULE 3
    for word in LOCATION:
        if word in sentence:
            score += 6

    return score

# WHEN RULES
Project: newsname-match    Author: bahadasx    | project source | file source
def performNameExtraction(text):
    #Returns a list of what NLTK defines as persons after processing the text passed into it.
    try:
        entity_names = []
        for sent in nltk.sent_tokenize(text):
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label') and chunk.label:
                    if chunk.label() == 'PERSON':
                        name_value = ' '.join(child[0] for child in chunk.leaves())
                        if name_value not in entity_names:
                            entity_names.append(name_value)
    except:
        print "Unexpected error:", sys.exc_info()[0]
    return entity_names
Project: Price-Comparator    Author: Thejas-1    | project source | file source
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel)))
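This ne_chunked() function, repeated verbatim in several of the projects below, mirrors the relation-extraction demo bundled with NLTK. A sketch of the imports and data it appears to assume:

import re
import nltk
from nltk.sem.relextract import extract_rels, rtuple

# nltk.download('treebank')  # the tagged sentences iterated above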
Project: cvscan    Author: skcript    | project source | file source
def fetch_name(resume_text):
  tokenized_sentences = nltk.sent_tokenize(resume_text)
  for sentence in tokenized_sentences:
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence), tagset='universal')):
      if hasattr(chunk, 'label'):# and chunk.label() == 'PERSON':
        chunk = chunk[0]
      (name, tag) = chunk
      if tag == 'NOUN':
        return name

  return "Applicant name couldn't be processed"
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | project source | file source
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel)))
Project: QuestionAnswerNLP    Author: debjyoti385    | project source | file source
def extract_entities(text):
    result=dict()
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))):
        # chunk.draw()
        if(isinstance(chunk, nltk.tree.Tree)):
            for subtree in chunk.subtrees(filter=lambda t: (t.label() == 'PERSON' or t.label() == 'GPE' or t.label() == 'LOCATION')):
                for leave in subtree.leaves():
                    if leave[0].lower() not in irrelevant_loc_words:
                        result[leave[0].lower()]=subtree.label()
    # print result
    return result
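extract_entities filters leaves against a module-level irrelevant_loc_words collection. A usage sketch, with an empty set standing in for whatever the project actually defines:

irrelevant_loc_words = set()   # illustrative stand-in for the project's real word list

print(extract_entities("Angela Merkel met Emmanuel Macron in Berlin."))
# maps each lower-cased entity word to its subtree label, e.g. {'berlin': 'GPE', ...}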
Project: neighborhood_mood_aws    Author: jarrellmark    | project source | file source
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel)))
Project: hate-to-hugs    Author: sdoran35    | project source | file source
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel)))
Project: chitti    Author: bhuvi8    | project source | file source
def find_named_entities(sent):
    tree = nltk.ne_chunk(sent)
    for st in tree.subtrees():
        if st.label() != 'S':
            logger.debug(st)
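find_named_entities expects a sentence that has already been POS-tagged and debug-logs every subtree whose label is not 'S'. A usage sketch; the logger setup and sample text are illustrative:

import logging
import nltk

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)   # module-level logger the function writes to

sent = nltk.pos_tag(nltk.word_tokenize("Tim Cook runs Apple."))
find_named_entities(sent)   # logs each named-entity subtree found by ne_chunk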
Project: FancyWord    Author: EastonLee    | project source | file source
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel)))
Project: beepboop    Author: nicolehe    | project source | file source
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel)))
Project: kind2anki    Author: prz3m    | project source | file source
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel)))
Project: but_sentiment    Author: MixedEmotions    | project source | file source
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel)))
Project: feature_engineering    Author: webeng    | project source | file source
def extract(self, text, entity_description=False):
        # We need to clean the text in each method otherwise when we present it
        # to the user, it will have a different format
        text = self.remove_return_lines_and_quotes(text)
        sentences = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]

        # This function is quite expensive
        sentences = [nltk.pos_tag(sent) for sent in sentences]

        entities_all = {} if entity_description else []

        #stop = stopwords.words('english')
        # more_stop_words = ['(' , ')', "'s" , ',', ':' , '<' , '>' , '.' , '-' , '&' ,'*','...' , 'therefore' , '.vs','hence']
        # stop = stopwords.words('english')
        # stop = stop + more_stop_words
        stop = ["a", "able", "about", "above", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "affected", "affecting", "affects", "after", "afterwards", "again", "against", "ah", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apparently", "approximately", "are", "aren", "arent", "arise", "around", "as", "aside", "ask", "asking", "at", "auth", "available", "away", "awfully", "b", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "between", "beyond", "biol", "both", "brief", "briefly", "but", "by", "c", "ca", "came", "can", "cannot", "can't", "cause", "causes", "certain", "certainly", "co", "com", "come", "comes", "contain", "containing", "contains", "could", "couldnt", "d", "date", "did", "didn't", "different", "do", "does", "doesn't", "doing", "done", "don't", "down", "downwards", "due", "during", "e", "each", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "especially", "et", "et-al", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "f", "far", "few", "ff", "fifth", "first", "five", "fix", "followed", "following", "follows", "for", "former", "formerly", "forth", "found", "four", "from", "further", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "gone", "got", "gotten", "h", "had", "happens", "hardly", "has", "hasn't", "have", "haven't", "having", "he", "hed", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herself", "hes", "hi", "hid", "him", "himself", "his", "hither", "home", "how", "howbeit", "however", "hundred", "i", "id", "ie", "if", "i'll", "im", "immediate", "immediately", "importance", "important", "in", "inc", "indeed", "index", "information", "instead", "into", "invention", "inward", "is", "isn't", "it", "itd", "it'll", "its", "itself", "i've", "j", "just", "k", "keep keeps",
                "kept", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "line", "little", "'ll", "look", "looking", "looks", "ltd", "m", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "million", "miss", "ml", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "mug", "must", "my", "myself", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "ninety", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "now", "nowhere", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "omitted", "on", "once", "one", "ones", "only", "onto", "or", "ord", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "owing", "own", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "previously", "primarily", "probably", "promptly", "proud", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "run", "s", "said", "same", "saw", "say", "saying", "says", "sec", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sent", "seven", "several", "shall", "she", "shed", "she'll", "shes", "should", "shouldn't", "show", "showed", "shown", "showns", "shows", "significant", "significantly", "similar", "similarly", "since", "six", "slightly", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure"]

        for s in sentences:
            chunked = nltk.ne_chunk(s, binary=True)
            for n in chunked:
                if isinstance(n, nltk.tree.Tree):
                    if n.label() == 'NE':
                        entities_all = self.getEntity(n, stop, entities_all, entity_description)

        if entity_description:
            return entities_all
        else:
            return list(set(entities_all))
Project: repeat-aft    Author: ripeta    | project source | file source
def extract_org(sent):
    pos = pos_tag(nltk.tokenize.word_tokenize(sent))
    sentt = nltk.ne_chunk(pos, binary=False)
    org = []
    for subtree in sentt.subtrees(filter=lambda t: t.label() == 'GPE' or t.label() == 'ORGANIZATION'):
        for leave in subtree.leaves():
            org.append(leave)
    return org
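A usage sketch for extract_org; the sentence is illustrative, and the return value is the list of (word, tag) leaves under every GPE or ORGANIZATION subtree.

import nltk
from nltk import pos_tag   # the unqualified pos_tag used in extract_org above

print(extract_org("The World Health Organization is based in Geneva."))
# e.g. [('World', 'NNP'), ('Health', 'NNP'), ('Organization', 'NNP'), ('Geneva', 'NNP')]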
Project: Scaffold    Author: christina-hammer    | project source | file source
def create_phrase(self, phrase_str): 

        tokenized_phrase = nltk.word_tokenize(phrase_str)
        tagged_phrase = nltk.pos_tag(tokenized_phrase)

        ne_chunk_tree = nltk.ne_chunk(tagged_phrase)
        #if (line_num in bluh):
            #print(str(line_num)+". "+str(ne_chunk_tree))

        merge_tokens = self._find_multi_token_nnp(ne_chunk_tree) 

        ne_chunk_list = self._merge_tokens_and_flatten(ne_chunk_tree, merge_tokens)        

        #if (line_num in bluh):
            #print(str(line_num)+". "+str(ne_chunk_list))        

        tokens = [] #list of tagged tuples
        for token in ne_chunk_list:
            if type(token) is nltk.tree.Tree:            
                tokens.append(self._tree_to_tuple(token))
            else:
                if (token[0] in self._keywords):                
                    token = (token[0], self._keywords[token[0]])
                tokens.append(token)

        #if (line_num in bluh):
            #print(str(line_num)+". "+str(tokens))  

        phrase = Phrase(tokens)    
        return phrase 

    #input: "ne_chunk_tree" - nltk tree of tuples and/or trees containing nltk tokens, "merge_tokens" - a list of int tuples
    #output: list of tuples/trees containing nltk tokens
    #purpose: merge tokens in ne_chunk_tree using the index ranges listed in the merge_tokens input argument. flatten ne_chunk_tree from an nltk tree to a list
Project: Scaffold    Author: christina-hammer    | project source | file source
def create_phrase(self, phrase_str): 

        tokenized_phrase = nltk.word_tokenize(phrase_str)
        tagged_phrase = nltk.pos_tag(tokenized_phrase)

        ne_chunk_tree = nltk.ne_chunk(tagged_phrase)
        #if (line_num in bluh):
            #print(str(line_num)+". "+str(ne_chunk_tree))

        merge_tokens = self._find_multi_token_nnp(ne_chunk_tree) 

        ne_chunk_list = self._merge_tokens_and_flatten(ne_chunk_tree, merge_tokens)        

        #if (line_num in bluh):
            #print(str(line_num)+". "+str(ne_chunk_list))        

        tokens = [] #list of tagged tuples
        for token in ne_chunk_list:
            if type(token) is nltk.tree.Tree:            
                tokens.append(self._tree_to_tuple(token))
            else:
                if (token[0] in self._keywords):                
                    token = (token[0], self._keywords[token[0]])
                tokens.append(token)

        #if (line_num in bluh):
            #print(str(line_num)+". "+str(tokens))  

        phrase = Phrase(tokens)    
        return phrase 

    #input: "ne_chunk_tree" - nltk tree of tuples and/or trees containing nltk tokens, "merge_tokens" - a list of int tuples
    #output: list of tuples/trees containing nltk tokens
    #purpose: merge tokens in ne_chunk_tree using the index ranges listed in the merge_tokens input argument. flatten ne_chunk_tree from an nltk tree to a list
Project: cvscan    Author: skcript    | project source | file source
def fetch_all_organizations(resume_text):
  organizations = set()
  tokenized_sentences = nltk.sent_tokenize(resume_text)

  # Custom grammar with NLTK
  # NP - Noun Phrase
  # NN - Noun
  # NNP - Proper Noun
  # V - Verb
  # JJ - Adjective

  # In a sentence that contains NN NNP V NN NN JJ NN.
  # The noun-phrases fetched are:
  # NP: NN NNP
  # NP: NN NN
  # NP: NN

  # Ex, "Application Developer at Delta Force"
  # => ["Application Developer", "Delta Force"]

  grammar = r"""NP: {<NN|NNP>+}"""
  parser = nltk.RegexpParser(grammar)

  avoid_organizations = utilities.get_avoid_organizations()

  for sentence in tokenized_sentences:

    # tags all parts of speech in the tokenized sentences
    tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))

    # then chunks with the customized grammar
    # np_chunks are instances of class nltk.tree.Tree
    np_chunks = parser.parse(tagged_words)
    noun_phrases = []

    for np_chunk in np_chunks:
      if isinstance(np_chunk, nltk.tree.Tree) and np_chunk.label() == 'NP':
        # if np_chunk matches the 'NP' grammar, create a space-separated string of all leaves under the 'NP' tree
        noun_phrase = ""
        for (org, tag) in np_chunk.leaves():
          noun_phrase += org + ' '

        noun_phrases.append(noun_phrase.rstrip())

    # Using the named entity chunker to get all the organizations
    chunks = nltk.ne_chunk(tagged_words)
    for chunk in chunks:
      if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'ORGANIZATION':
        (organization, tag) = chunk[0]

        # if organization is in the noun_phrase, there is a high chance that the noun_phrase contains the employer name
        # e.g. Delta Force is added to organizations even if only Delta is recognized as an organization, because Delta Force is a noun-phrase
        for noun_phrase in noun_phrases:
          if organization in noun_phrase and organization not in avoid_organizations:
            organizations.add(noun_phrase.capitalize())

  return organizations
Project: MLAB_Intuit    Author: rykard95    | project source | file source
def extract_all(use_random_forest):
    if use_random_forest:
        emails = rf_model()
        emails = [email for email in emails if email[0] != 'negatives_clean']
    else:
        emails = []  # initialize before appending (otherwise UnboundLocalError below)
        db = utils.get_local_db()
        for collection in db.collection_names():
            if collection != 'negatives_clean':
                for record in db.get_collection(collection).find():
                    emails.append([collection] + [record['Text']])

    # find features for each email
    email_data = []
    for email_set in emails:
        email = email_set[1]
        fields = features[email_set[0]]

        # extract named entities
        tokenized_email = nltk.word_tokenize(email)
        tagged_email =  nltk.pos_tag(tokenized_email)
        named_entity_email = nltk.ne_chunk(tagged_email)
        entities = []

        # concatenate multi-word entities
        for branch in named_entity_email:
            if isinstance(branch, nltk.tree.Tree):
                entity = ''
                for sub_entity in branch:
                    entity += (sub_entity[0] + ' ')
                if [branch.label(), entity.strip()] not in entities:
                    entities.append([branch.label(), entity.strip()])

        # use entities to fill in fields
        matches = []
        for field in fields:
            field_matches = []
            for entity in entities:
                # compute semantic distance and threshold
                dist = 0
                description = describe(entity[1])
                if description:
                    for word in description.split():
                        a = wn.synsets(field[1])
                        b = wn.synsets(word)
                        if a and b:
                            a = a[0]
                            b = b[0]
                            segment = a.path_similarity(b)
                            if segment:
                                dist += segment
                if dist > 0.1:
                    field_matches.append([dist, entity[1]])
            field_matches.sort(key=lambda x: x[0], reverse=True)
            matches.append({field[1]: field_matches})
        email_data.append([email_set[0], email, matches])
    return email_data
Project: MLAB_Intuit    Author: rykard95    | project source | file source
def extract_one(email):
    # use random-forest to find email category
    category = rf_categorize(email)
    if category != 'negatives_clean':
        fields = features[category]

        # extract named entities
        tokenized_email = nltk.word_tokenize(email)
        tagged_email =  nltk.pos_tag(tokenized_email)
        named_entity_email = nltk.ne_chunk(tagged_email)
        entities = []

        # concatenate multi-word entities
        for branch in named_entity_email:
            if isinstance(branch, nltk.tree.Tree):
                entity = ''
                for sub_entity in branch:
                    entity += (sub_entity[0] + ' ')
                if [branch.label(), entity.strip()] not in entities:
                    entities.append([branch.label(), entity.strip()])

        # use entities to fill in fields
        matches = []
        for field in fields:
            field_matches = []
            for entity in entities:
                # compute semantic distance and threshold
                dist = 0
                description = describe(entity[1])
                if description:
                    for word in description.split():
                        a = wn.synsets(field[1])
                        b = wn.synsets(word)
                        if a and b:
                            a = a[0]
                            b = b[0]
                            segment = a.path_similarity(b)
                            if segment:
                                dist += segment
                if dist > 0.1:
                    field_matches.append([dist, entity[1]])
            field_matches.sort(key=lambda x: x[0], reverse=True)
            matches.append({field[1]: field_matches})

        # return categorized email with field guess probabilities
        return [category, email, matches]
Project: repeat-aft    Author: ripeta    | project source | file source
def extract_org(sent):
    pos = pos_tag(nltk.tokenize.word_tokenize(sent))
    sentt = nltk.ne_chunk(pos, binary=False)
    org = []
    for subtree in sentt.subtrees(filter=lambda t: t.label() == 'GPE' or t.label() == 'ORGANIZATION'):
        for leave in subtree.leaves():
            org.append(leave)
    return org
Project: NLP_question_answering_system_project    Author: Roshrini    | project source | file source
def whoRules(question, sentenceOriginal):
    score = 0
    hasNameQuestion = False
    hasNameSentence = False
    hasnameSentence = False
    hasHumanSentence = False
    sentence = sentenceOriginal.lower()

    # for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentenceOriginal))):
    #         if type(chunk) is nltk.tree.Tree:
    #             if 'PERSON' in chunk.label() or 'ORGANIZATION' in chunk.label():
    #                 score += 10

    for item in question:
        if item in NAME:
            hasNameQuestion = True
            #break

        if item in HUMAN and item in sentence:
            score += 10

    for item in sentence.split():
        if item in NAME:
            hasNameSentence = True
        if 'name' in item:
            hasnameSentence = True
        if item in HUMAN:
            hasHumanSentence = True

    # RULE 2
    if not hasNameQuestion and hasNameSentence:
        score += 6

    # RULE 3
    if not hasNameQuestion and hasnameSentence:
        score += 4

    # RULE 4
    if hasNameSentence or hasHumanSentence:
        score += 4

    return score


# WHAT RULES