Python nltk module, tag() example source code

We extracted the following 50 code examples from open-source Python projects to show how to use nltk.tag().
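Before the project excerpts, here is a minimal usage sketch (not taken from any of the projects below) showing the two most common entry points of nltk.tag: the nltk.pos_tag() convenience function and an explicitly constructed PerceptronTagger. It assumes the standard 'punkt' and 'averaged_perceptron_tagger' NLTK data packages have already been downloaded; the sample sentence is made up.

import nltk
from nltk.tag import PerceptronTagger

# Any tokenized English text works here; this sentence is only illustrative.
tokens = nltk.word_tokenize("NLTK makes part-of-speech tagging easy.")

# Convenience wrapper around NLTK's default English perceptron tagger.
print(nltk.pos_tag(tokens))

# The same tagging, with the tagger instantiated explicitly.
tagger = PerceptronTagger()
print(tagger.tag(tokens))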

Project: Price-Comparator    Author: Thejas-1    | project source | file source
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | project source | file source
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
Project: hate-to-hugs    Author: sdoran35    | project source | file source
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
Project: Automatic-Question-Generation    Author: bwanglzu    | project source | file source
def _pos1_gram_after_answer(self, row, flag):
        """The first POS tag following the answer span is [FLAG]
        - Args:
            row(pandas.dataframe): input pandas dataframe
            flag(string): symbol to match first tagger
        - Returns:
            binary(int): 1 match, 0 not match
        """
        question = row.Question
        if question:
            first_tagger = self._first_tagger_after_answer_span(question)
            if first_tagger == flag:
                return 1
            else:
                return 0
        else:
            return 0
Project: Automatic-Question-Generation    Author: bwanglzu    | project source | file source
def _pos1_gram_before_answer(self, row, flag):
        """The first POS tag before the answer span is [FLAG]
        - Args:
            row(pandas.dataframe): input pandas dataframe
            flag(string): symbol to match first tagger
        - Returns:
            binary(int): 1 match, 0 not match
        """
        question = row.Question
        if question:
            first_tagger = self._first_tagger_before_answer_span(question)
            if first_tagger == flag:
                return 1
            else:
                return 0
        else:
            return 0
Project: Price-Comparator    Author: Thejas-1    | project source | file source
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']]
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | project source | file source
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']]
Project: neighborhood_mood_aws    Author: jarrellmark    | project source | file source
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
Project: neighborhood_mood_aws    Author: jarrellmark    | project source | file source
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']]
Project: hate-to-hugs    Author: sdoran35    | project source | file source
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']]
Project: FancyWord    Author: EastonLee    | project source | file source
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
Project: FancyWord    Author: EastonLee    | project source | file source
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']]
Project: beepboop    Author: nicolehe    | project source | file source
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
Project: beepboop    Author: nicolehe    | project source | file source
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']]
Project: kind2anki    Author: prz3m    | project source | file source
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
Project: kind2anki    Author: prz3m    | project source | file source
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']]
Project: but_sentiment    Author: MixedEmotions    | project source | file source
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']]
Project: phrasemachine    Author: slanglab    | project source | file source
def extract_JK(pos_seq):
    """The 'JK' method in Handler et al. 2016.  
    Returns token positions of valid ngrams."""

    def find_ngrams(input_list, num_):
        '''get ngrams of len n from input list'''
        return zip(*[input_list[i:] for i in range(num_)])

    # copied from M and S chp 5
    patterns = set(['AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'])
    pos_seq = [tag2coarse.get(tag,'O') for tag in pos_seq]
    pos_seq = [(i, p) for i, p in enumerate(pos_seq)]
    ngrams = [ngram for n in range(1, 4) for ngram in find_ngrams(pos_seq, n)]
    def stringify(s):
        return "".join(a[1] for a in s)
    def positionify(s):
        return tuple(a[0] for a in s)
    ngrams = filter(lambda x: stringify(x) in patterns, ngrams)
    return [set(positionify(n)) for n in ngrams]

########
Project: phrasemachine    Author: slanglab    | project source | file source
def __init__(self):
        import nltk
        from nltk.tag import PerceptronTagger
        from nltk.tokenize import TreebankWordTokenizer
        tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
        tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
        # Load the tagger
        self.tagger = PerceptronTagger(load=False)
        self.tagger.load(tagger_fn)

        # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
        #       Calling the TreebankWordTokenizer like this allows skipping the downloader.
        #       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
        #       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
        self.tokenize = TreebankWordTokenizer().tokenize
        self.sent_detector = nltk.data.load(tokenizer_fn)


    # http://www.nltk.org/book/ch05.html
Project: SocialNPHS    Author: SocialNPHS    | project source | file source
def get_tweet_tags(tweet):
    """ Break up a tweet into individual word parts """
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    # replace handles with real names
    for n, tok in enumerate(tokens):
        if tok.startswith('@'):
            handle = tok.strip("@")
            if handle in user.students:
                # If we have a database entry for the mentioned user, we can
                # easily substitute a full name.
                usr = user.NPUser(handle)
                tokens[n] = usr.fullname
            else:
                # If there is no database entry, we use the user's alias. While
                # this is the full name in many cases, it is often not reliable
                usr = api.get_user(handle)
                tokens[n] = usr.name
    tagged = nltk.pos_tag(tokens)
    # In nltk, if a teacher's name is written with a period after an
    # abbreviated prefix, it is awkwardly broken up into 3 tags
    for n, tag in enumerate(tagged):
        # If there is the weird period after the prefix,
        if tag[1] == '.':
            # and it is in fact splitting up a person's name,
            if tagged[n - 1][1] == 'NNP' and tagged[n + 1][1] == 'NNP':
                if tagged[n - 1][0] in ['Mr', 'Ms', 'Mrs', 'Mx']:
                    # combine it into the actual name,
                    tagged[n - 1] = ('{}. {}'.format(tagged[n - 1][0],
                                                     tagged[n + 1][0]), 'NNP')
                    # and then remove the extra tags.
                    del tagged[n + 1]
                    del tagged[n]
    return tagged
Project: seq2seq-keyphrase    Author: memray    | project source | file source
def load_xml(self, xmldir):
        '''
        for KDD/WWW/UMD only
        :return: doclist
        '''
        for filename in os.listdir(xmldir):
            with open(xmldir+filename) as textfile:
                doc = Document()
                doc.name = filename[:filename.find('.xml')]

                import string
                printable = set(string.printable)

                # print((filename))
                try:
                    lines = textfile.readlines()
                    xml = ''.join([filter(lambda x: x in printable, l) for l in lines])
                    root = ET.fromstring(xml)

                    doc.title = root.findall("title")[0].text
                    doc.abstract = root.findall("abstract")[0].text
                    doc.phrases = [n.text for n in root.findall("*/tag")]

                    self.doclist.append(doc)

                except UnicodeDecodeError:
                    print('UnicodeDecodeError detected! %s' % filename )
Project: seq2seq-keyphrase    Author: memray    | project source | file source
def get_postag_with_record(records, pairs):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path)-10)+1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx, (record, pair) in enumerate(zip(records, pairs)):  # len(test_data_plain)
        print('*' * 100)
        print('File: '  + record['name'])
        print('Input: ' + str(pair[0]))
        text = pos_tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' % (idx, len(records) , len(pair[0]), str(text)))
        tagged_source.append(text)

    return tagged_source
Project: seq2seq-keyphrase    Author: memray    | project source | file source
def get_postag_with_index(sources, idx2word, word2idx):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path)-10)+1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx in xrange(len(sources)):  # len(test_data_plain)
        test_s_o = sources[idx]
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)
        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, len(sources), str(text)))

        tagged_source.append(text)

    return tagged_source
Project: Natural-Language-Processing-Python-and-NLTK    Author: PacktPublishing    | project source | file source
def conll_tag_chunks(chunk_sents):
    '''Convert each chunked sentence to list of (tag, chunk_tag) tuples,
    so the final result is a list of lists of (tag, chunk_tag) tuples.
    >>> from nltk.tree import Tree
    >>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])])
    >>> conll_tag_chunks([t])
    [[('DT', 'B-NP'), ('NN', 'I-NP')]]
    '''
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
Project: Natural-Language-Processing-Python-and-NLTK    Author: PacktPublishing    | project source | file source
def parse(self, tagged_sent):
        '''Parsed tagged tokens into parse Tree of chunks'''
        if not tagged_sent: return None
        (words, tags) = zip(*tagged_sent)
        chunks = self.tagger.tag(tags)
        # create conll str for tree parsing
        wtc = zip(words, chunks)
        return conlltags2tree([(w,t,c) for (w,(t,c)) in wtc])
Project: Natural-Language-Processing-Python-and-NLTK    Author: PacktPublishing    | project source | file source
def parse(self, tagged_sent):
        if not tagged_sent: return None
        chunks = self.tagger.tag(tagged_sent)
        return conlltags2tree([(w,t,c) for ((w,t),c) in chunks])
Project: Natural-Language-Processing-Python-and-NLTK    Author: PacktPublishing    | project source | file source
def parse(self, tagged_sent):
        iobs = []
        in_person = False

        for word, tag in tagged_sent:
            if word in self.name_set and in_person:
                iobs.append((word, tag, 'I-PERSON'))
            elif word in self.name_set:
                iobs.append((word, tag, 'B-PERSON'))
                in_person = True
            else:
                iobs.append((word, tag, 'O'))
                in_person = False

        return conlltags2tree(iobs)
Project: Natural-Language-Processing-Python-and-NLTK    Author: PacktPublishing    | project source | file source
def iob_locations(self, tagged_sent):
        i = 0
        l = len(tagged_sent)
        inside = False

        while i < l:
            word, tag = tagged_sent[i]
            j = i + 1
            k = j + self.lookahead
            nextwords, nexttags = [], []
            loc = False
            # lookahead in the sentence to find multi-word locations
            while j < k:
                if ' '.join([word] + nextwords) in self.locations:
                    # combine multiple separate locations into single location chunk
                    if inside:
                        yield word, tag, 'I-LOCATION'
                    else:
                        yield word, tag, 'B-LOCATION'
                    # every next word is inside the location chunk
                    for nword, ntag in zip(nextwords, nexttags):
                        yield nword, ntag, 'I-LOCATION'
                    # found a location, so we're inside a chunk
                    loc, inside = True, True
                    # move forward to the next word since the current words
                    # are already chunked
                    i = j
                    break

                if j < l:
                    nextword, nexttag = tagged_sent[j]
                    nextwords.append(nextword)
                    nexttags.append(nexttag)
                    j += 1
                else:
                    break
            # if no location found, then we're outside the location chunk
            if not loc:
                inside = False
                i += 1
                yield word, tag, 'O'
Project: Natural-Language-Processing-Python-and-NLTK    Author: PacktPublishing    | project source | file source
def ieer_chunked_sents(tag=nltk.tag.pos_tag):
    for doc in ieer.parsed_docs():
        tagged = ieertree2conlltags(doc.text, tag)
        yield conlltags2tree(tagged)
Project: textfool    Author: bogdan-kulynych    | project source | file source
def _get_wordnet_pos(spacy_token):
    '''Wordnet POS tag'''
    pos = spacy_token.tag_[0].lower()
    if pos in ['a', 'n', 'v']:
        return pos
Project: textfool    Author: bogdan-kulynych    | project source | file source
def _synonym_prefilter_fn(token, synonym):
    '''
    Similarity heuristics go here
    '''
    if  (len(synonym.text.split()) > 2) or \
        (synonym.lemma == token.lemma) or \
        (synonym.tag != token.tag) or \
        (token.text.lower() == 'be'):
        return False
    else:
        return True
Project: Price-Comparator    Author: Thejas-1    | project source | file source
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst)
Project: polyglot-quiz    Author: erkanay    | project source | file source
def map_words(self, _text):
        mapping = defaultdict(list)
        tagged_words = pos_tag(set(self.get_words(_text)))
        for word, tag in tagged_words:
            mapping[tag].append(word)
        return mapping
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | project source | file source
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst)
Project: neighborhood_mood_aws    Author: jarrellmark    | project source | file source
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst)
Project: hate-to-hugs    Author: sdoran35    | project source | file source
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst)
Project: FancyWord    Author: EastonLee    | project source | file source
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst)
Project: beepboop    Author: nicolehe    | project source | file source
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst)
Project: kind2anki    Author: prz3m    | project source | file source
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst)
Project: but_sentiment    Author: MixedEmotions    | project source | file source
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst)
Project: but_sentiment    Author: MixedEmotions    | project source | file source
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
Project: Intelligent-Phone-Salesman    Author: ShruthiChari    | project source | file source
def postagger(sent):
    text = nltk.word_tokenize(sent)
    posTagged = pos_tag(text)
    #simplifiedTags = [map_tag('en-ptb', 'universal', tag) for word, tag in posTagged]
    return posTagged
Project: pythainlp    Author: PyThaiNLP    | project source | file source
def tag(text):
    """
    Takes a ''list'' of words and returns a ''list'' of (word, tag) tuples, e.g. [('word', 'tag')]."""
    tagger = nltk.tag.UnigramTagger(model=data())# backoff=default_tagger)
    return tagger.tag(text)
Project: phrasemachine    Author: slanglab    | project source | file source
def logmsg(s):
    # would be better to use python logger
    print>>sys.stderr, "[phrasemachine] %s" % s

############## SimpleNP
## Uses a five-tag coarse grammar.
## tagset: A D P N O

# Requires conversion from PTB or Petrov/Gimpel tags to our system.
# "Coarse*" indicates petrov/gimpel
# Grammar change from the FST version: can't repeat NUM in both adj and noun.
Project: phrasemachine    Author: slanglab    | project source | file source
def coarse_tag_str(pos_seq):
    """Convert POS sequence to our coarse system, formatted as a string."""
    global tag2coarse
    tags = [tag2coarse.get(tag,'O') for tag in pos_seq]
    return ''.join(tags)

# POS extraction assuming list of POS tags as input.
# >>> pyre.extract_finditer(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 4)]
# >>> pyre.extract_ngram_filter(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
Project: phrasemachine    Author: slanglab    | project source | file source
def tag_text(self, text):
        '''take input text and return tokens w/ part of speech tags using NLTK'''
        # putting import here instead of top of file b.c. not all will have nltk installed

        sents = self.sent_detector.tokenize(text)    # TODO: this will fail on some unicode chars. I think assumes ascii
        word_pos_pairs = []

        all_tokens = []
        for sent in sents:
            tokens = self.tokenize(sent)
            all_tokens = all_tokens + tokens
            word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
        return {'tokens': all_tokens, 'pos': [tag for (w,tag) in word_pos_pairs]}
Project: Automatic-Question-Generation    Author: bwanglzu    | project source | file source
def _ner_features(self, row):
        """Name entity recognition features
        - Args:
            row(pandas.dataframe): dataframe of current row
        - Returns:
            row(pandas.dataframe): result a pandas dataframe with new feature
        """
        answer = row.Answer
        question = row.Question
        if answer is not None and question is not None:
            sentence_len = len(row.Sentence.split())
            ners_answer = self.st.tag(answer.split())
            ners_question = self.st.tag(question.split())
            ner_values_answer = [v for k, v in ners_answer if v in [
                'PERSON', 'ORGANIZATION', 'LOCATION']]
            ner_values_question = [v for k, v in ners_question if v in [
                'PERSON', 'ORGANIZATION', 'LOCATION']]
        else:
            return None
        # NER IN ANSWER
        if 'PERSON' in ner_values_answer:
            row['NAMED_ENTITY_IN_ANSWER_COUNT_PERS'] = 1
        else:
            row['NAMED_ENTITY_IN_ANSWER_COUNT_PERS'] = 0
        if 'ORGANIZATION' in ner_values_answer:
            row['NAMED_ENTITY_IN_ANSWER_COUNT_ORG'] = 1
        else:
            row['NAMED_ENTITY_IN_ANSWER_COUNT_ORG'] = 0
        if 'LOCATION' in ner_values_answer:
            row['NAMED_ENTITY_IN_ANSWER_COUNT_LOC'] = 1
        else:
            row['NAMED_ENTITY_IN_ANSWER_COUNT_LOC'] = 0
        # NER IN QUESTION
        if 'PERSON' in ner_values_question:
            row['NAMED_ENTITY_OUT_ANSWER_COUNT_PERS'] = 1
        else:
            row['NAMED_ENTITY_OUT_ANSWER_COUNT_PERS'] = 0
        if 'ORGANIZATION' in ner_values_question:
            row['NAMED_ENTITY_OUT_ANSWER_COUNT_ORG'] = 1
        else:
            row['NAMED_ENTITY_OUT_ANSWER_COUNT_ORG'] = 0
        if 'LOCATION' in ner_values_question:
            row['NAMED_ENTITY_OUT_ANSWER_COUNT_LOC'] = 1
        else:
            row['NAMED_ENTITY_OUT_ANSWER_COUNT_LOC'] = 0
        row['NUM_NAMED_ENTITIES_IN_ANSWER'] = len(ner_values_answer)
        row['NUM_NAMED_ENTITIES_OUT_ANSWER'] = len(ner_values_question)
        row['ANSWER_NAMED_ENTITY_DENSITY'] = float(
            len(ner_values_answer)) / sentence_len
        row['QUESTION_NAMED_ENTITY_DENSITY'] = float(
            len(ner_values_question)) / sentence_len
        return row
Project: seq2seq-keyphrase    Author: memray    | project source | file source
def check_postag(config):
    train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])

    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path)-10)+1] + 'stanford-postagger/'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)

    for dataset_name in config['testing_datasets']:
        # override the original test_set
        # test_set = load_testing_data(dataset_name, kwargs=dict(basedir=config['path']))(idx2word, word2idx, config['preprocess_type'])

        test_sets = load_additional_testing_data(config['testing_datasets'], idx2word, word2idx, config)
        test_set = test_sets[dataset_name]

        # print(dataset_name)
        # print('Avg length=%d, Max length=%d' % (np.average([len(s) for s in test_set['source']]), np.max([len(s) for s in test_set['source']])))
        test_data_plain = zip(*(test_set['source'], test_set['target']))

        test_size = len(test_data_plain)

        # Alternatively to setting the CLASSPATH add the jar and model via their path:
        jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
        # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
        model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'
        pos_tagger = StanfordPOSTagger(model, jar)

        for idx in xrange(len(test_data_plain)):  # len(test_data_plain)
            test_s_o, test_t_o = test_data_plain[idx]

            source = keyphrase_utils.cut_zero(test_s_o, idx2word)

            print(source)

            # Add other jars from Stanford directory
            stanford_dir = jar.rpartition('/')[0]
            stanford_jars = find_jars_within_path(stanford_dir)
            pos_tagger._stanford_jar = ':'.join(stanford_jars)

            text = pos_tagger.tag(source)
            print(text)
Project: Intelligent-Phone-Salesman    Author: ShruthiChari    | project source | file source
def build_history(data_list, supported_tags_phones,supported_tags):
    history_list = [] # list of all histories
    sents = []
    count = 0
    expected = []

    for data in data_list: # data is the inputs entered by a given student
        data1 = data['data']

        #data1 is for every sentence entered by user
        for rec in data1:
            updates = rec['updates']
            sent = rec['sentence']  
            relatedTags=[]
            relations=[]
            if "rels" in rec.keys():
                relatedEntities = rec['rels']   
                expected.append(relatedEntities)            
                for i in relatedEntities:
                    relations.append(i.keys())
                    for j in i[i.keys()[0]]:
                        relatedTags.append(j)
            words = []
            posTaggedSent = postagger(sent)
            #chunkPhrases = chunker(sent)


            if len(updates) == len(posTaggedSent):
                for i in range(len(updates)):               
                    words.append({"word":updates[i]['word'],"pos":posTaggedSent[i],"tag":updates[i]['tag']})
                    #------------------------------------------------------------------------------------------------
                    # NOTE: below code is a temporary hack to build the MAxEnt for just 2 tags - we will change this later
                    if (updates[i]['tag'] not in supported_tags_phones):
                        if updates[i]['tag'] == "Model":
                            updates[i]['tag'] = "Version"
                        else:
                            updates[i]['tag'] = "Other"                
                    #------------------------------------------------------------------------------------------------

            sents.append(words)
            history={}
            history['sentence'] = words
            history['i'] = count+1
            #history['phrases'] = chunkPhrases
            history['relatedTags'] = relatedTags
            if len(relations) > 0:
                history_list.append((history,relations[0][0],))
            else:
                history_list.append((history,"None",))
            count += 1


    return (history_list,sents,expected)
Project: Intelligent-Phone-Salesman    Author: ShruthiChari    | project source | file source
def chunker(sent):

#a = [("I","PRP"),("hear","VBP"),("Jerusalem","NNP"),("bells","NNS"),("ringing","VBG")]
#input_sent = " Rockwell said the agreement calls for it to supply 200 addititonal so-called shipsets for the planes."
    input_sent = sent 
    text = nltk.word_tokenize(input_sent)
    a = nltk.pos_tag(text)
    phrases = []

    tup = ()
    '''test_sents = conll2000.chunked_sents('test.txt', chunk_types=['VP'])
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['VP'])
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])'''
    NP_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    VP_sents = conll2000.chunked_sents('train.txt', chunk_types=['VP'])
    class ChunkParser(nltk.ChunkParserI):
        def __init__(self, train_sents):
            train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)] for sent in train_sents]
            self.tagger = nltk.TrigramTagger(train_data)
        def parse(self, sentence):
            pos_tags = [pos for (word,pos) in sentence]
            tagged_pos_tags = self.tagger.tag(pos_tags)
            chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
            conlltags = [(word, pos, chunktag) for ((word,pos),chunktag) in zip(sentence, chunktags)]
            return nltk.chunk.util.conlltags2tree(conlltags)

    NPChunker = ChunkParser(NP_sents)
    VPChunker = ChunkParser(VP_sents)
    #print (NPChunker.parse("I hear Jerusalem bells ringing"))
    parsed_sent = NPChunker.parse(a)
    for i in parsed_sent:
        if (type(i)!=type(tup)):
            l=[]
            for t in tuple(i):
                l.append(t[0])
            phrases.append({"NP":" ".join(l)})
    parsed_sent = VPChunker.parse(a)
    for i in parsed_sent:
            if (type(i)!=type(tup)):
                l=[]
                for t in tuple(i):
                    l.append(t[0])
                phrases.append({"VP":" ".join(l)})
    return phrases