Python nltk module: sent_tokenize() code examples

We have extracted the following 50 code examples from open-source Python projects to illustrate how to use nltk.sent_tokenize().
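For reference, a minimal usage sketch of the function itself (it relies on the Punkt sentence model, so nltk.download('punkt') must have been run once; the sample text below is only illustrative):

import nltk

# nltk.download('punkt')  # one-time download of the Punkt sentence model

text = "Dr. Smith went to Washington. He arrived on Friday and gave two talks."
sentences = nltk.sent_tokenize(text)                 # list of sentence strings
tokens = [nltk.word_tokenize(s) for s in sentences]  # list of word lists, one per sentence
print(sentences)
print(tokens)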

Project: linkedin_recommend    Author: duggalr2    | project source | file source
def tokenize_and_stem(text):
    """
    First tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    """
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            if 'intern' == token:
                token = ''
            if 'student' == token:
                token = ''
            if 'and' == token:
                token = ''
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens if len(t) > 0]
    return stems
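A hypothetical call of the function above; the imports and the module-level stemmer are not part of the snippet, so a SnowballStemmer is assumed here:

import re
import nltk
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")  # assumption: the original stemmer object is not shown in the snippet

print(tokenize_and_stem("The intern and the student wrote Python code."))
# 'intern', 'student' and 'and' are blanked out before stemming, so only the remaining stems are returned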
Project: Natural-Language-Processing-Python-and-NLTK    Author: PacktPublishing    | project source | file source
def preprocessing(text):
    text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]

    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]

    # lower capitalization
    tokens = [word.lower() for word in tokens]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text= ' '.join(tokens)

    return preprocessed_text
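The snippet above assumes several imports that are not shown; a plausible set is sketched below (note that text.decode("utf8") means the function expects bytes, so the example passes a bytes literal; the stopwords and wordnet corpora must be downloaded):

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

print(preprocessing(b"The cats were sitting on the mats near the doors."))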
Project: YelpDataChallenge    Author: fujunswufe    | project source | file source
def get_sentence_tokens(text):
    '''
    Given a text (review), return the list of tokens for each sentence
    :param text:
    :return:
    '''
    sentences = sent_tokenize(text)

    sent_tokens = []
    for sentence in sentences:
        sent_token = word_tokenize(sentence)
        sent_token = [token for token in sent_token if ((not token.strip()=='') and (not token in stopwords))]
        sent_tokens.append(sent_token)
    # remove stop words and short tokens

    # stemming: experiments showed that stemming did not help, so it is left commented out
    # if (stemming):
    #     stemmer = PorterStemmer()
    #     texts = [[ stemmer.stem(token) for token in text] for text in texts]
    return sent_tokens
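The snippet relies on sent_tokenize, word_tokenize and a module-level stopwords collection that are not shown; one plausible setup:

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords as nltk_stopwords

stopwords = set(nltk_stopwords.words('english'))  # assumption: the original stopword list is not part of the snippet

print(get_sentence_tokens("The food was amazing. The service, however, was slow."))
# one token list per sentence, with stopwords and empty tokens removed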
Project: atap    Author: foxbook    | project source | file source
def parse_gender(text):

    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]

    sents, words = count_gender(sentences)
    total = sum(words.values())

    for gender, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[gender]

        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        )
Project: skills-ml    Author: workforce-data-initiative    | project source | file source
def ie_preprocess(self, document):
        """This function takes raw text and chops and then connects the process to break
           it down into sentences"""

        # Pre-processing
        # e.g.","exempli gratia"
        document = document.replace("e.g.", "exempli gratia")

        # Split on newlines and asterisks (bullet markers) before sentence tokenizing
        split = re.split('\n|\*', document)

        # Sentence tokenizer
        sentences = []
        for sent in split:
            sents = nltk.sent_tokenize(sent)
            length = len(sents)
            if length == 0:
                continue
            elif length == 1:
                sentences.append(sents[0])
            else:
                for i in range(length):
                    sentences.append(sents[i])
        return sentences
Project: reuters-docsim    Author: sujitpal    | project source | file source
def maybe_build_sentences(text_filename, sent_filename):
    sents = []
    if os.path.exists(sent_filename):
        fsent = open(sent_filename, "rb")
        for line in fsent:
            docid, sent_id, sent = line.strip().split("\t")
            sents.append(sent)
        fsent.close()
    else:
        ftext = open(text_filename, "rb")
        fsent = open(sent_filename, "wb")
        for line in ftext:
            docid, text = line.strip().split("\t")
            sent_id = 1
            for sent in nltk.sent_tokenize(text):
                sents.append(sent)
                fsent.write("{:d}\t{:d}\t{:s}\n"
                    .format(int(docid), sent_id, sent))
                sent_id += 1
        fsent.close()
        ftext.close()
    return sents
Project: YelpDataChallenge    Author: fujunswufe    | project source | file source
def get_review_sentences():
    '''
    Read the Yelp reviews and return the sentences after sentence segmentation
    :return:
    '''
    review_file = io.open(FULL_YELP_REVIEW_PATH, 'r', encoding='utf-8')
    count_sentence = 0
    sentences = []

    for line in review_file:
        json_review = json.loads(line.strip())
        text = json_review.get("text").replace('\n','').lower()

        raw_sentences = sent_tokenize(text)
        for raw_sentence in raw_sentences:
            if len(raw_sentence.strip()) > 0:
                sent_tokens = word_tokenize(raw_sentence)
                sentences.append(sent_tokens)
    return sentences
Project: NLP-Keyword-Extraction-Ensemble-Method    Author: Ashwin-Ravi    | project source | file source
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda (word,pos,chunk): chunk != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
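Note that lambda (word, pos, chunk): chunk != 'O' uses Python 2 tuple-parameter unpacking, which is a syntax error on Python 3. Below is a sketch of the same function adapted to Python 3, changing only the lambda; the name extract_candidate_chunks_py3 is only used to distinguish the adapted version, and the punkt, averaged_perceptron_tagger and stopwords resources are assumed to be downloaded:

import itertools, string
import nltk

def extract_candidate_chunks_py3(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # index into the (word, pos, chunk) triple instead of unpacking it inside the lambda
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda tok: tok[2] != 'O') if key]
    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]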
Project: Medical_NER    Author: murhafh    | project source | file source
def print_symptoms_from_page(url = '', model = '', stanford_jar = ''):
    html_reader = HTMLReader(url)
    cleaned_text = html_reader.get_text_from_page()
    symptoms = set()

    st = NERTagger(model, stanford_jar, encoding='utf-8')
    sentences = nltk.sent_tokenize(cleaned_text)
    for sentence in sentences:
        tags = st.tag(nltk.word_tokenize(sentence))
        tag_index = 0
        while tag_index < len(tags):
            if tags[tag_index][1] == 'SYMP':
                symptom = []
                while tag_index < len(tags) and tags[tag_index][1] != 'O':
                    symptom.append(tags[tag_index][0])
                    tag_index += 1
                symptoms.add(' '.join(symptom))
            else:
                tag_index += 1
    print "Found %d symptoms:" % len(symptoms)
    for symptom in symptoms:
        print symptom
Project: keyphrase-extraction    Author: sagarchaturvedi1    | project source | file source
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    ''' This function will extract text of a specific POS sequence rather than just Noun Phrase '''

    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group)
                  for key, group in itertools.groupby(all_chunks, lambda (word,pos,chunk): chunk != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
Project: ai-chatbot-framework    Author: alfredfrancis    | project source | file source
def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                # if token in self.stopwords:
                #     continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma
Project: memex-dossier-open    Author: dossier    | project source | file source
def process(self, fc, context=None):
        text_source = self.config.get('text_source')
        if text_source and text_source in fc:
            text = fc[text_source]
        else:
            return fc
        names = defaultdict(StringCounter)
        for sent in nltk.sent_tokenize(text):
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label'):
                    label = chunk.label()
                    name = ' '.join(c[0] for c in chunk.leaves())
                    if not isinstance(name, unicode):
                        name = unicode(name, 'utf-8')
                    name = cleanse(name)
                    #print chunk.node, name
                    names[label][name] += 1
        for entity_type, name_counts in names.items():
            fc[entity_type] = name_counts
        return fc
Project: dialog_research    Author: wjbianjason    | project source | file source
def generate_vocab(filename,min_fre=5,prefix=""):
    vf = open("../data/"+prefix+"vocab_generate.txt",'w')
    word = {}
    for line in file(filename):
      line = line.strip()
      try:
        sentencesToken = nltk.sent_tokenize(line)
      except:
        continue
      for i in range(len(sentencesToken)):
          tokens = nltk.word_tokenize(sentencesToken[i])
          for token in tokens:
              word.setdefault(token,0)
              word[token] += 1
    for char,num in sorted(word.items(),key=lambda x:x[1],reverse=True):
      if num < min_fre:
        break
      vf.write(char+" "+str(num)+"\n")
Project: kpex    Author: christophfeinauer    | project source | file source
def extract_chunks(text_string,max_words=3,lemmatize=False):

    # Any number of adjectives followed by any number of nouns and (optionally) again
    # any number of adjectives followed by any number of nouns
    grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'

    # Makes chunks using grammar regex
    chunker = nltk.RegexpParser(grammar)

    # Get grammatical functions of words
    # What this is doing: tag(sentence -> words)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text_string))

    # Make chunks from the sentences, using grammar. Output in IOB.
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                        for tagged_sent in tagged_sents))
    # Join phrases based on IOB syntax.
    candidates = [' '.join(w[0] for w in group).lower() for key, group in itertools.groupby(all_chunks, lambda l: l[2] != 'O') if key]

    # Filter by maximum keyphrase length
    candidates = list(filter(lambda l: len(l.split()) <= max_words, candidates))

    # Filter phrases consisting of punctuation or stopwords
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    candidates = list(filter(lambda l: l not in stop_words and not all(c in punct for c in l),candidates))

    # lemmatize
    if lemmatize:
        lemmatizer = nltk.stem.WordNetLemmatizer().lemmatize
        candidates =  [lemmatizer(x) for x in candidates]

    return candidates
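A hypothetical call of extract_chunks; the imports below are not part of the snippet (the surrounding module presumably imports them), and the Punkt, POS-tagger and stopwords resources must be available:

import itertools, string
import nltk

chunks = extract_chunks("Deep convolutional neural networks dominate modern image recognition benchmarks.")
print(chunks)  # lowercased noun-phrase keyphrases of at most three words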
Project: Python-NLTKWebApp    Author: alibolek    | project source | file source
def tokenizer():
    if len(request.vars)!=0:
        user_input=request.vars
        import sys
        reload(sys)
        sys.setdefaultencoding('utf-8')
        if user_input.parameter=="sentence":

            our_output=nltk.sent_tokenize(user_input.input,"english")
            print user_input
            if request.vars.filename!='' and len(request.vars.filename.value)!="":
                file_input=user_input.filename.value
                file_output=nltk.word_tokenize(file_input,"english")
            print our_output
        else:
            our_output=nltk.word_tokenize(user_input.input,"english")
            if request.vars.filename!='' and len(request.vars.filename.value)!=None:
                file_input=user_input.filename.value
                file_output=nltk.word_tokenize(file_input,"english")


        user_input.output=our_output


    return locals()
Project: repeat-aft    Author: ripeta    | project source | file source
def extract(text, paper=None, logger=logger):

    search_any = functools.partial(re_util.search_any, logger=logger)
    if not text and paper:
        try:
            text, _ = paper.get_text()
        except pdfutil.pdfutil.MalformedPDF as e:
            return None
    filters = [r'data documentation.*?shared']
    for sentence in nltk.sent_tokenize(text):
        match = search_any(filters, sentence)
        if match:
            source_type = "extracted"
            source_detail = "nltk search v1"
            value_text = sentence
            value_result = "Yes"
            return (value_text, value_result, source_type, source_detail)
    #if no match found:
    source_type = "extracted"
    source_detail = "nltk search v1"
    value_text = "Not Found"
    value_result = "No"
    return (value_text, value_result, source_type, source_detail)
Project: repeat-aft    Author: ripeta    | project source | file source
def extract(text, paper=None, logger=logger):

    search_any = functools.partial(re_util.search_any, logger=logger)
    if not text and paper:
        try:
            text, _ = paper.get_text()
        except pdfutil.pdfutil.MalformedPDF as e:
            return None
    for sentence in nltk.sent_tokenize(text):
        if search_any([r'data mine.*?source', r'text mine.*?shared'], sentence):
            # yapf: disable
            match = search_any([
                "data mine.*?(\w*\d[\w\d/-]*)",
                "text mine.*?(\w*\d[\w\d/-]*)"
            ], sentence)
            # yapf: enable
            source_type = "extracted"
            source_detail = "nltk search v1"
            value_text = sentence
            try:
                value_result = match.group(1).strip()
                return (value_text, value_result, source_type, source_detail)
            except AttributeError:  # no match was found
                return None
    return None
Project: repeat-aft    Author: ripeta    | project source | file source
def extract(text, paper=None, logger=logger):

    search_any = functools.partial(re_util.search_any, logger=logger)
    if not text and paper:
        try:
            text, _ = paper.get_text()
        except pdfutil.pdfutil.MalformedPDF as e:
            return None
    filters = [r'analys(is|es)']
    for sentence in nltk.sent_tokenize(text):
        match = search_any(filters, sentence)
        if match and search_any([r'algorithm', r'summary', r'outline', r'statistic', r'table|graph', r'following'], sentence):
            source_type = "extracted"
            source_detail = "nltk search v1"
            value_text = sentence
            value_result = "Yes"
            return (value_text, value_result, source_type, source_detail)
    #if no match found:
    source_type = "extracted"
    source_detail = "nltk search v1"
    value_text = "Not Found"
    value_result = "No"
    return (value_text, value_result, source_type, source_detail)
Project: dl-models-for-qa    Author: sujitpal    | project source | file source
def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore")
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples
Project: Deep-Learning-with-Keras    Author: PacktPublishing    | project source | file source
def maybe_build_vocab(reuters_dir, vocab_file):
    vocab = collections.defaultdict(int)
    if os.path.exists(vocab_file):
        fvoc = open(vocab_file, "rb")
        for line in fvoc:
            word, idx = line.strip().split("\t")
            vocab[word] = int(idx)
        fvoc.close()
    else:
        counter = collections.Counter()
        num_docs_read = 0
        for doc in stream_reuters_documents(reuters_dir):
            if num_docs_read % 100 == 0:
                print("building vocab from {:d} docs"
                    .format(num_docs_read))
            topics = doc["topics"]
            if len(topics) == 0:
                continue
            title = doc["title"]
            body = doc["body"]
            title_body = ". ".join([title, body]).lower()
            for sent in nltk.sent_tokenize(title_body):
                for word in nltk.word_tokenize(sent):
                    counter[word] += 1
            for i, c in enumerate(counter.most_common(VOCAB_SIZE)):
                vocab[c[0]] = i + 1
            num_docs_read += 1
        print("vocab built from {:d} docs, complete"
            .format(num_docs_read))
        fvoc = open(vocab_file, "wb")
        for k in vocab.keys():
            fvoc.write("{:s}\t{:d}\n".format(k, vocab[k]))
        fvoc.close()
    return vocab
Project: Deep-Learning-with-Keras    Author: PacktPublishing    | project source | file source
def build_numeric_text(vocab, text):
    wids = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            wids.append(vocab[word])
    return ",".join([str(x) for x in wids])


##################### main ######################
Project: linkedin_recommend    Author: duggalr2    | project source | file source
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
Project: linkedin_recommend    Author: duggalr2    | project source | file source
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            if 'and' == token:
                token = ''
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens if len(t) > 0]
    return stems
Project: linkedin_recommend    Author: duggalr2    | project source | file source
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
Project: NLP    Author: Deamon5550    | project source | file source
def split_sentences(text):
    """
    Returns a list of the sentences in the text that is passed in.
    """
    return sent_tokenize(text)
Project: kaggle_redefining_cancer_treatment    Author: jorgemf    | project source | file source
def tokenize_documents(documents):
    for document in documents:
        text = document.text
        tokenized_doc = []
        for sent in nltk.sent_tokenize(text):
            tokenized_doc += nltk.word_tokenize(sent)
        document.text = tokenized_doc
Project: vanilla-neural-nets    Author: cavaunpeu    | project source | file source
def _tokenize_corpus_into_list_of_tokenized_sentences(cls, corpus):
        tokenized_corpus = nltk.sent_tokenize(corpus)
        tokenized_corpus = [cls._clean_sentence(sentence) for sentence in tokenized_corpus]
        return [nltk.word_tokenize(sentence) for sentence in tokenized_corpus]
Project: resume-optimizer    Author: mhbuehler    | project source | file source
def extract(self, text, max_length=3, metric='avg', incl_scores=False):
        """Extract keywords and keyphrases from input text in descending order of score"""
        sentences = nltk.sent_tokenize(text)
        phrase_list = self._generate_candidate_keywords(sentences, max_length=max_length)
        word_scores = self._calculate_word_scores(phrase_list)
        phrase_scores = self._calculate_phrase_scores(phrase_list, word_scores, metric=metric)
        sorted_phrase_scores = sorted(phrase_scores.iteritems(), key=operator.itemgetter(1), reverse=True)
        n_phrases = len(sorted_phrase_scores)

        if incl_scores:
            return sorted_phrase_scores[0:int(n_phrases/self.top_fraction)]
        else:
            return map(lambda x: x[0], sorted_phrase_scores[0:int(n_phrases/self.top_fraction)])
Project: Deep-Learning-with-Theano    Author: PacktPublishing    | project source | file source
def parse_text(filename, vocabulary_size=9000, type="word"):
    with open(filename, 'rb') as f:
        txt = f.read()
        if type == "word":
            sentences = nltk.sent_tokenize(txt.decode('utf-8').lower().replace('\n', ' '))
            # sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
            tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
            word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
            print("Found %d unique words tokens." % len(word_freq.items()))
            vocab = word_freq.most_common(vocabulary_size-1)
            index = [sentence_start_token, sentence_end_token, unknown_token] + [x[0] for x in vocab]
            word_to_index = dict([(w,i) for i,w in enumerate(index)])
            print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))
            for i, sent in enumerate(tokenized_sentences):
                tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
            X_train = np.asarray([ [0]+[word_to_index[w] for w in sent] for sent in tokenized_sentences])
            y_train = np.asarray([ [word_to_index[w] for w in sent]+[1] for sent in tokenized_sentences])
            # X_train, y_train = [], []
            # for sent in tokenized_sentences:
            #     l = len(sent) - 1
            #     X_train.append(coo_matrix((np.ones( (l) ), ( range(l), [word_to_index[w] for w in sent[:-1]] )), shape=(l, vocabulary_size )).toarray())
            #     y_train.append( [word_to_index[w] for w in sent[1:] ] )
        else:
            sentences = nltk.sent_tokenize(txt.decode('utf-8').lower().replace('\n', ' '))
            index = ['^','$'] + list(set(txt))
            char_to_index = dict([(w,i) for i,w in enumerate(index)])
            X_train = np.asarray([ [0]+[ char_to_index[w] for w in sent]  for sent in sentences])
            y_train = np.asarray([ [ char_to_index[w] for w in sent]+[1] for sent in sentences])

    return X_train, y_train, index
Project: Chinese-QA    Author: distantJing    | project source | file source
def word_tokenize(tokens):
#   return [token.replace("''", '"').replace("``", '"') for token in jieba.lcut(tokens, cut_all=False)]
    return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]

#from my.corenlp_interface import CoreNLPInterface
#url = 'vision-server2.corp.ai2'
#port = 8000
#interface = CoreNLPInterface(url, port)
#sent_tokenize = interface.split_doc
#word_tokenize = interface.split_sent
Project: minke    Author: DistrictDataLabs    | project source | file source
def tokenize(self, fileid):
        """
        Segments, tokenizes, and tags a document in the corpus. Returns a
        generator of paragraphs, which are lists of sentences, which in turn
        are lists of part of speech tagged words.
        """
        for paragraph in self.corpus.paras(fileids=fileid):
            yield [
                nltk.pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ]
Project: texta    Author: texta-tk    | project source | file source
def get_sentences_nltk(text):
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    sentences = [s.lower() for s in nltk.sent_tokenize(text) if s]
    return sentences
Project: newsname-match    Author: bahadasx    | project source | file source
def performNameExtraction(text):
    #Returns a list of what NLTK defines as persons after processing the text passed into it.
    try:
        entity_names = []
        for sent in nltk.sent_tokenize(text):
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label') and chunk.label:
                    if chunk.label() == 'PERSON':
                        name_value = ' '.join(child[0] for child in chunk.leaves())
                        if name_value not in entity_names:
                            entity_names.append(name_value)
    except:
        print "Unexpected error:", sys.exc_info()[0]
    return entity_names
Project: FYP-AutoTextSum    Author: MrRexZ    | project source | file source
def tokenizeSentence(args):
    document = args['sentences']
    tokenized_sentences = nltk.sent_tokenize(document)
    return jsonify(tokenized_sentences)
Project: FYP-AutoTextSum    Author: MrRexZ    | project source | file source
def tokenizeWord(args):
    document = args['sentences']
    tokenized_sentences = nltk.tokenize.sent_tokenize(document)
    tokenized_words = nltk.word_tokenize(tokenized_sentences[0])
    return jsonify(tokenized_words)
Project: Hacker_News_Article_Topics    Author: reeddunkle    | project source | file source
def tokenize_individual_text(raw_text):
    '''
    Given raw_text, a string, return a list of tokens.
    '''

    return sum(map(nltk.word_tokenize, nltk.sent_tokenize(raw_text)), [])
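A quick illustration of the flattening behaviour (sum(..., []) concatenates the per-sentence token lists):

import nltk

print(tokenize_individual_text("It works. Really well."))
# -> ['It', 'works', '.', 'Really', 'well', '.']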
Project: SentiCR    Author: senticr    | project source | file source
def handle_negation(comments):
    sentences = nltk.sent_tokenize(comments)
    modified_st=[]
    for st in sentences:
        allwords = nltk.word_tokenize(st)
        modified_words=[]
        if negated(allwords):
            part_of_speech = nltk.tag.pos_tag(allwords,tagset='universal')
            chunked = chunk_parser.parse(part_of_speech)
            #print("---------------------------")
            #print(st)
            for n in chunked:
                if isinstance(n, nltk.tree.Tree):
                    words = [pair[0] for pair in n.leaves()]
                    #print(words)

                    if n.label() == 'NegP' and negated(words):
                        for i, (word, pos) in enumerate(n.leaves()):
                            if (pos=="ADV" or pos=="ADJ" or pos=="VERB") and (word!="not"):
                                modified_words.append(prepend_not(word))
                            else:
                                modified_words.append(word)
                    else:
                         modified_words.extend(words)
                else:
                    modified_words.append(n[0])
            newst =' '.join(modified_words)
            #print(newst)
            modified_st.append(newst)
        else:
            modified_st.append(st)
    return ". ".join(modified_st)
Project: atap    Author: foxbook    | project source | file source
def scored_document_phrases(documents, segmented=True):

    # If documents are not segmented and tagged, do so.
    if not segmented:
        documents = [
            nltk.sent_tokenize(document)
            for document in documents
        ]

    # Compose the documents as a list of their keyphrases
    documents = [
        list(extract_candidate_phrases(document, tagged=segmented))
        for document in documents
    ]

    # Create a lexicon of candidate phrases
    lexicon = gensim.corpora.Dictionary(documents)

    # Vectorize the documents by phrases for scoring
    vectors = [
        lexicon.doc2bow(document)
        for document in documents
    ]

    # Create the TF-IDF Model and compute the scores
    model = gensim.models.TfidfModel(vectors)
    scores = model[vectors]

    for doc in scores:
        yield [
            (lexicon[vec], score) for vec, score in doc
        ]
Project: atap    Author: foxbook    | project source | file source
def preprocess(text):
    return [
        [
            list(nltk.pos_tag(nltk.word_tokenize(sent)))
            for sent in nltk.sent_tokenize(para)
        ] for para in text.split("\n\n")
    ]
Project: Neural-Chatbot    Author: saurabhmathur96    | project source | file source
def augment(pair):
    # convert single pair into multiple pairs
    question, answer = map(sent_tokenize, pair)
    q_sents = list(reversed(question))
    for _ in range(len(q_sents)):
        a_sents = answer[:]
        for _ in range(len(a_sents)):
            yield (' '.join(reversed(q_sents)), ' '.join(a_sents))
            a_sents.pop()
        q_sents.pop()
Project: factable    Author: eliucs    | project source | file source
def factAnalysis(text):
    '''
    Goes through the text, tokenizes it by sentence, and returns
    a tuple containing: a boolean representing whether the text as a
    whole is judged real or fake, a confidence score determined by
    the number of votes against the verdict, and a list of
    (sentence, boolean) tuples indicating whether each individual
    sentence is judged real or fake.

    :return: tuple of boolean, double, and list
    '''
    text = sent_tokenize(text)
    trueCount = 0
    falseCount = 0

    sentenceLabels = []
    for sentence in text:
        features = findFeatures(sentence)
        if voteClassifier.classify(features):
            trueCount += 1
            sentenceLabels.append((sentence, True))
        else:
            falseCount += 1
            sentenceLabels.append((sentence, False))

    if not sentenceLabels:
        return False, False, False
    elif trueCount > falseCount:
        return True, 1 - falseCount/trueCount, sentenceLabels
    else:
        return False, 1 - trueCount/falseCount, sentenceLabels
Project: NLP-Preprocessing    Author: boost-starai    | project source | file source
def getSentences(corpus):
    '''tokenize the corpus into sentences'''
    sentences = nltk.sent_tokenize(corpus)
    sentences = [removePunctuations(sentence) for sentence in sentences]
    return sentences
Project: poetic-inner-join    Author: emdaniels    | project source | file source
def split_text(filename):
    with open(filename, 'rU') as f:
        reader = csv.reader(f, skipinitialspace=True)
        reader.next()
        # extra decoding to account for non UTF-8 characters
        sentences = itertools.chain(*[nltk.sent_tokenize(
            x[0].decode('latin-1').encode('utf-8').decode('utf-8').lower()) for
                                      x in reader])
    return sentences
Project: poetic-inner-join    Author: emdaniels    | project source | file source
def split_sentences(self):
        print("Reading CSV file...")
        with open(self.train_file, 'rU') as f:
            reader = csv.reader(f, skipinitialspace=True)
            reader.next()
            # extra decoding to account for non UTF-8 characters
            self.sentences = itertools.chain(*[nltk.sent_tokenize(
                x[0].decode('latin-1').encode('utf-8').decode('utf-8').lower())
                                               for x in reader])
            self.sentences = ["%s %s %s" % (
            self.sentence_start_token, x, self.sentence_end_token) for x in
                              self.sentences]
        print("Parsed %d sentences." % (len(self.sentences)))
Project: Diggly-Back-End    Author: WikiDiggly    | project source | file source
def __get_sentences(self, content, length):
        sentences = nltk.sent_tokenize(content.decode('utf-8'))
        res = " ".join(sentences[0:length])
        return res
Project: cvscan    Author: skcript    | project source | file source
def fetch_name(resume_text):
  tokenized_sentences = nltk.sent_tokenize(resume_text)
  for sentence in tokenized_sentences:
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence), tagset='universal')):
      if hasattr(chunk, 'label'):# and chunk.label() == 'PERSON':
        chunk = chunk[0]
      (name, tag) = chunk
      if tag == 'NOUN':
        return name

  return "Applicant name couldn't be processed"
Project: reuters-docsim    Author: sujitpal    | project source | file source
def maybe_build_vocab(reuters_dir, vocab_file):
    vocab = collections.defaultdict(int)
    if os.path.exists(vocab_file):
        fvoc = open(vocab_file, "rb")
        for line in fvoc:
            word, idx = line.strip().split("\t")
            vocab[word] = int(idx)
        fvoc.close()
    else:
        counter = collections.Counter()
        num_docs_read = 0
        for doc in stream_reuters_documents(reuters_dir):
            if num_docs_read % 100 == 0:
                print("building vocab from {:d} docs"
                    .format(num_docs_read))
            topics = doc["topics"]
            if len(topics) == 0:
                continue
            title = doc["title"]
            body = doc["body"]
            title_body = ". ".join([title, body]).lower()
            for sent in nltk.sent_tokenize(title_body):
                for word in nltk.word_tokenize(sent):
                    counter[word] += 1
            for i, c in enumerate(counter.most_common(VOCAB_SIZE)):
                vocab[c[0]] = i + 1
            num_docs_read += 1
        print("vocab built from {:d} docs, complete"
            .format(num_docs_read))
        fvoc = open(vocab_file, "wb")
        for k in vocab.keys():
            fvoc.write("{:s}\t{:d}\n".format(k, vocab[k]))
        fvoc.close()
    return vocab
Project: reuters-docsim    Author: sujitpal    | project source | file source
def build_numeric_text(vocab, text):
    wids = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            wids.append(vocab[word])
    return ",".join([str(x) for x in wids])


##################### main ######################