Python nltk.tokenize module: sent_tokenize() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use nltk.tokenize.sent_tokenize().
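
Before diving into the project code, here is a minimal usage sketch (assuming the pretrained Punkt model has already been downloaded, e.g. via nltk.download('punkt'); the sample text and variable names below are illustrative):

from nltk.tokenize import sent_tokenize

sample = "NLTK ships a pretrained Punkt model. sent_tokenize uses it to split raw text into sentences."
sentences = sent_tokenize(sample)            # uses the default 'english' Punkt model
# sent_tokenize(text, language='norwegian')  # other languages can be selected, as in the samnorsk example below
print(sentences)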

Project: scientific-paper-summarisation    Author: EdCo95
def read_folder(self, folder_name, number_of_files_to_read=10000):
        """
        Reads all files in a directory, splits them into sentences and puts these sentences in a list to return.
        Args:
            folder_name = the name of the folder to read files from
            number_of_files_to_read = optional parameter for how many files in a directory to read
        Returns:
            A list of all sentences from all text files in the folder
        """
        count = 0
        all_sentences = []
        for filename in os.listdir(folder_name):
            if filename.endswith(".txt") and count < number_of_files_to_read:
                main_text_to_open = folder_name + "/" + filename
                main_text = self.open_file_single_string(main_text_to_open)
                udata = main_text.decode("utf-8")
                main_text = udata.encode("ascii", "ignore")
                sentences = sent_tokenize(main_text)
                for sentence in sentences:
                    all_sentences.append(sentence)
            count += 1
        return all_sentences
Project: deeppavlov    Author: deepmipt
def create_batch(self, sentence_li):
        """Create a batch for a list of sentences."""

        embeddings_batch = []
        for sen in sentence_li:
            embeddings = []
            sent_toks = sent_tokenize(sen)
            word_toks = [word_tokenize(el) for el in sent_toks]
            tokens = [val for sublist in word_toks for val in sublist]
            tokens = [el for el in tokens if el != '']
            for tok in tokens:
                embeddings.append(self.embdict.tok2emb.get(tok))
            if len(tokens) < self.max_sequence_length:
                pads = [np.zeros(self.embedding_dim) for _ in range(self.max_sequence_length - len(tokens))]
                embeddings = pads + embeddings
            else:
                embeddings = embeddings[-self.max_sequence_length:]
            embeddings = np.asarray(embeddings)
            embeddings_batch.append(embeddings)
        embeddings_batch = np.asarray(embeddings_batch)
        return embeddings_batch
Project: samnorsk    Author: gisleyt
def article_to_pairs(arg):
    article, direction = arg
    pairs = []

    if 'text' not in article:
        return []

    sents = sent_tokenize(article['text'], language='norwegian')
    translations = translate(sents, direction)

    for sent, trans in zip(sents, translations):
        trans_tokens = tokenize(trans)
        tokens = tokenize(sent)

        pairs += compare(tokens, trans_tokens)

    del article
    del sents
    del translations

    return pairs
Project: Python-Scripts-Repo-on-Data-Science    Author: qalhata
def extractFeatures(self, article, n, customStopWords=None):
        # pass in article as a tuple ( text, title)
        text = article[0]
        # extract the text
        title = article[1]
        # extract the title
        sentences = sent_tokenize(text)
        # split text into sentences
        word_sent = [word_tokenize(a.lower()) for a in sentences]
        # split each sentence into words
        self._freq = self._compute_frequencies(word_sent, customStopWords)
        # calculate word frequencies using the member function defined above
        if n < 0:
            # n controls how many features (words) to return - a negative
            # number means no feature (word) selection, so return all features
            return nlargest(len(self._freq.keys()),
                            self._freq, key=self._freq.get)
        else:
            # if the calling function has asked for a subset, return only
            # the 'n' largest features, i.e. the most important words
            # (important == frequent, excluding stopwords)
            return nlargest(n, self._freq, key=self._freq.get)
Project: Python-Scripts-Repo-on-Data-Science    Author: qalhata
def summarize(self, article, n):
        text = article[0]
        title = article[1]
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i, sentence in enumerate(word_sent):
            for word in sentence:
                if word in self._freq:
                    ranking[i] += self._freq[word]
        sentences_index = nlargest(n, ranking, key=ranking.get)
        return [sentences[j] for j in sentences_index]

##############################################################################
# TEST
Project: medknow    Author: kbogas
def mmap_extract(text):
    """
    Function-wrapper for metamap binary. Extracts concepts
    found in text.

    !!!! REMEMBER TO START THE METAMAP TAGGER AND
        WordSense DISAMBIGUATION SERVER !!!!

    Input:
        - text: str,
        a piece of text or sentence
    Output:
        - concepts: list,
        list of metamap concepts extracted
    """

    # Tokenize into sentences
    sents = sent_tokenize(text)
    mm = MetaMap.get_instance(settings['load']['path']['metamap'])
    concepts, errors = mm.extract_concepts(sents, range(len(sents)), 
                                         word_sense_disambiguation=True)
    if errors:
        print 'Errors with extracting concepts!'
        print errors
    return concepts
Project: SocialNPHS    Author: SocialNPHS
def person_connotation(tweet, name):
    """
    Decide whether a person is talked favorably about or not, based on the
    tone of the sentences in which their name appears
    """
    twtcontent = sent_tokenize(tweet)
    overall = {'compound': 0, 'neg': 0, 'neu': 0, 'pos': 0}
    mentions = 0
    # analyze each sentence talking about `name` person
    for s in twtcontent:
        tags = get_tweet_tags(s)
        # if the name appears in the tagged sentence, get its tone
        if (name, 'NNP') in tags:
            sentence = util.untag(tags)
            scores = tweet_connotation(' '.join(sentence))
            # add it up to the overall tweet's tone
            for i, z in enumerate(scores):
                overall[z] += scores[z]
            mentions += 1
    # averaging all sentences' scores. don't wanna divide by zero now do we
    if mentions != 0:
        for v in overall:
            overall[v] = round(overall[v] / mentions, 3)
    return overall
Project: QProb    Author: quant-trade
def make_summaries():
    terms = Terms.objects.all()

    removals = ['DEFINITION', 'BREAKING DOWN', 'What is']

    for term in terms:
        try:
            summary = summarizer(term.text, settings.SUMMARIZER_SENTENCES)
            sentence_tokens = sent_tokenize(summary)
            text = ''
            for sentence in sentence_tokens:
                if not any(to_remove in sentence for to_remove in removals):
                    text += "{0} ".format(sentence.replace(r'\A[\d]\S\s', ''))

            term.summary = summarizer(text, settings.SUMMARIZER_SENTENCES)
            term.save()
        except Exception as e:
            print(colored.red("[ERROR] At terms summarizer: {0}".format(e)))
Project: QProb    Author: quant-trade
def clean_video(video):
    text = []
    try:
        if len(video.description) > 0:
            sentence_tokens = sent_tokenize(video.description)

            for sentence in sentence_tokens:
                if not ('http' in sentence):
                    text.append("{0} ".format(sentence))

        video.description = "".join("{} ".format(s) for s in text)
        video.save()
        if settings.SHOW_DEBUG:
            print(colored.green("Cleaned video description saved to db: {0}".format(video.title)))
    except Exception as e:
        print(colored.red("At clean_video {}".format(e)))
Project: topicModelling    Author: balikasg
def doc_to_ids(self, doc, training=True):
        l = []
        words = dict()
        window = 150
#        doc = doc.replace("&ndash;", " ")
#        doc = sent_tokenize(doc)
        for sentence in doc:
            miniArray = []
            for term in sentence:
                id = self.term_to_id(term, training)    
                if id != None:
                    miniArray.append(id)
                    if not id in words:
                        words[id] = 1
                        self.docfreq[id] += 1
            if not len(miniArray):
                continue
            if len(miniArray)  > window:
                l.extend([np.array(miniArray[i:i+window]) for i in xrange(0, len(miniArray), window)])
            else:
                l.append(np.array(miniArray))
        return l
Project: delbot    Author: shaildeliwala
def summarize(self, text, n):
        """
          Return a list of n sentences
          which represent the summary of text.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents)
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i,sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)
        return [sents[j] for j in sents_idx]
Project: LanguageAnalysis    Author: trideeprath
def parse_xml_language_similarity(file_read,file_write):
    count = 0
    with open(file_read,'r') as f, open(file_write,'w') as out:
        for line in f:
            count +=1
            if count %1000 == 0: print(count)
            if "row Id" in line:
                line = line.strip()
                root = xml.etree.ElementTree.fromstring(line)
                try:
                    body = remove_tags(root.get('Body'))
                    title = remove_tags(root.get('Title'))
                    body_sentences = sent_tokenize(body)
                    title_sentences = sent_tokenize(title)
                    for line in body_sentences:
                        out.write(line+"\n")
                    for line in title_sentences:
                        out.write(line+"\n")
                except:
                    continue
Project: markov_bot    Author: 18F
def train(self, chain_len = None):
        """ Trains the markov data structure by creating chains of desired length """
        if not chain_len:
            chain_len = self.CHAIN_LENGTH

        self.CHAIN_LEN = chain_len

        self.everything['corpus'] = {}
        self.corpus = self.everything['corpus']

        for f in self.everything['input']:
            for line in sent_tokenize( self.everything['input'][f] ):
                words = word_tokenize(line)

                for chain in self._make_chains(words):
                    k = " ".join( chain[:-1] ) # key is everything but last word
                    v = chain[-1] # value is last word

                    try:
                        self.corpus[k].append(v)
                    except:
                        self.corpus[k] = [v]
Project: patentdata    Author: benhoyle
def nltk_extract_claims(text):
    """
    Attempts to extract claims as a list from a large text string.
    Uses nltk sent_tokenize function in tokenize library
    param string text: string containing several claims
    """
    sent_list = sent_tokenize(text)
    # On a test string this returned a list with the claim number
    # and then the claim text as separate items
    claims_list = []
    for i in range(0, len(sent_list), 2):
        try:
            number = int(sent_list[i].split(".")[0])
        except:
            number = 0

        claims_list.append(
            (number, sent_list[i+1])
        )

    return claims_list
Project: qas    Author: kusha
def check_sentence(text):
        """
        Check, that only one sentence was provided.

        >>> QASystem.check_sentence("Example sentence.")
        >>> QASystem.check_sentence("Example sentence. Another example.")
        Traceback (most recent call last):
        core.MultipleSentences: ['Example sentence.', 'Another example.']

        Args:
            text (str): provided question/answer.

        Returns:
            None

        Raises:
            MultipleSentences: in case of more than one sentence inside
            of the text string.
        """
        sent_tokenize_list = sent_tokenize(text)  # nltk tokenize sentence
        if len(sent_tokenize_list) > 1:
            raise MultipleSentences(sent_tokenize_list)
Project: NN_sentiment    Author: hx364
def read_yelp(file_name='yelp_academic_dataset_review.json'):

    f = open(file_name)
    f = f.readlines()
    f = [eval(l.strip()) for l in f]
    stars = [i['stars'] for i in f]
    text = [i['text'] for i in f]

    df = pd.DataFrame()
    df['stars'] = stars
    df['text'] = text

    #compute the number of sentences in each doc
    l = list(df.text)
    text = [sent_tokenize(i) for i in list(df.text)]
    text_len = [len(i) for i in text]

    #2225188 in total
    #2089287 for length<=20
    #1654640 for length<=10
    #We decide to only consider length<=7 here
    df['length'] = text_len
    df['text_split'] = text
    return df
Project: DropMuse    Author: DropMuse
def get_sentiment(song):
    scores = dict([('pos', 0), ('neu', 0), ('neg', 0), ('compound', 0)])

    if not song:
        return scores

    raw_text = song
    raw_text = re.sub("\n", ". ", str(raw_text))

    # Using already trained
    sid = SentimentIntensityAnalyzer()
    sentences = tokenize.sent_tokenize(raw_text)

    scores = dict([('pos', 0), ('neu', 0), ('neg', 0), ('compound', 0)])
    for sentence in sentences:

        ss = sid.polarity_scores(sentence)

        for k in sorted(ss):
            scores[k] += ss[k]

    return scores
Project: semeval2017-scienceie    Author: UKPLab
def offset_tokenize(text):
    tail = text
    accum = 0
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    info_tokens = []
    for tok in tokens:
        scaped_tok = re.escape(tok)
        m = re.search(scaped_tok, tail)
        start, end = m.span()
        # global offsets
        gs = accum + start
        ge = accum + end
        accum += end
        # keep searching in the rest
        tail = tail[end:]
        info_tokens.append((tok, (gs, ge)))
    return info_tokens
Project: acl2017-interactive_summarizer    Author: UKPLab
def parse_xml_all(self, data_file, doc_type, language='english'):
        e = ET.parse(data_file)
        cluster_data = {}
        root = e.getroot()
        for topics in root:
            data = []
            topic_id = topics.attrib.get('id')
            for documents in topics.findall(doc_type):
                doc_id = documents.attrib.get('id')
                if doc_type == 'document':
                    title_text = documents.find('title').text
                doc_text = documents.find('text').text
                text = text_normalization(doc_text)
                doc_sents = sent_tokenize(text, language)
                data.append([doc_id, doc_sents])
            cluster_data[topic_id] = data
        return cluster_data
Project: vismooc-data-server    Author: HKUST-VISLab
def analysis(self, paragraph):
        ''' Analyze the sentiment of a given paragraph
        '''
        result = 0
        counter = 0
        sentences = tokenize.sent_tokenize(paragraph)
        for sentence in sentences:
            sentiment = self.analyzer.polarity_scores(sentence)['compound']
            if sentiment > SentimentAnalyzer.neutral_threshold[0] and \
                sentiment < SentimentAnalyzer.neutral_threshold[1]:
                continue

            counter += 1
            result += sentiment

        result = result / float(counter) if counter > 0 else 0
        return result
Project: deeppavlov    Author: deepmipt
def add_items(self, sentence_li):
        """Add new items to the tok2emb dictionary from a given text."""

        for sen in sentence_li:
            sent_toks = sent_tokenize(sen)
            word_toks = [word_tokenize(el) for el in sent_toks]
            tokens = [val for sublist in word_toks for val in sublist]
            tokens = [el for el in tokens if el != '']
            for tok in tokens:
                if self.tok2emb.get(tok) is None:
                    self.tok2emb[tok] = self.fasttext_model[tok]
Project: crypto-sentiment    Author: codingupastorm
def get_sentiment_from_paragraph(paragraph):
    sentence_list = tokenize.sent_tokenize(paragraph)
    paragraphSentiments = 0.0
    for sentence in sentence_list:
        vs = analyzer.polarity_scores(sentence)
        paragraphSentiments += vs["compound"]
    return round(paragraphSentiments/len(sentence_list), 4)
Project: nazgul    Author: TartuNLP
def pre_processing(tokenizer, truecaser, info):
    # SPLIT THE WHITESPACES
    source_file_t = re.split('([\t\n\r\f\v]+)', info['src'])

    # SENTENCE TOKENIZE
    for i in range(len(source_file_t)):
        if i % 2 == 0:
            source_file_t[i] = sent_tokenize(source_file_t[i])

    # TOKENIZATION
    if info['tok']:
        for j in range(len(source_file_t)):
            if j % 2 == 0:
                for i in range(len(source_file_t[j])):
                    try:
                        source_file_t[j][i] = str(
                            tokenizer.tokenize(source_file_t[j][i], return_str=True).encode('utf-8'))
                    except NameError:
                        source_file_t[j][i] = str(' '.join(source_file_t[j][i].split('.') + ['.']))

    # TRUECASING
    if info['tc']:
        for j in range(len(source_file_t)):
            if j % 2 == 0:
                for i in range(len(source_file_t[j])):
                    source_file_t[j][i] = str((truecasing(truecaser, source_file_t[j][i].split(' ')[0]).decode(
                        'utf-8') + " " + (' '.join(source_file_t[j][i].split(' ')[1:]).decode('utf-8'))).encode('utf-8'))
                    print source_file_t[j][i]

    # IF NEITHER
    if not (info['tc'] or info['tok']):
        for j in range(len(source_file_t)):
            if j % 2 == 0:
                for i in range(len(source_file_t[j])):
                    try:
                        source_file_t[j][i] = str(source_file_t[j][i].encode('utf-8'))
                    except NameError:
                        source_file_t[j][i] = str(' '.join(source_file_t[j][i].split('.') + ['.']))

    return source_file_t
Project: Python-Scripts-Repo-on-Data-Science    Author: qalhata
def extractRawFrequencies(self, article):
        # this method is similar to the one above but returns
        # the raw frequencies (all word counts)
        text = article[0]
        title = article[1]
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        return freq
Project: stock-eagle    Author: mtusman
def sentence(text):
    '''Break the text into sentences'''
    return sent_tokenize(text)
Project: ask_data_science    Author: AngelaVC
def getSentences(self):
        self.sentences = sent_tokenize(self.text)
Project: medknow    Author: kbogas
def metamap_wrapper(text):
    """
    Function-wrapper for metamap binary. Extracts concepts
    found in text.

    !!!! REMEMBER TO START THE METAMAP TAGGER AND
        WordSense DISAMBIGUATION SERVER !!!!

    Input:
        - text: str,
        a piece of text or sentence
    Output:
       - a dictionary with key sents and values
       a list of the concepts found
    """

    # Tokenize into sentences
    sents = sent_tokenize(text)
    # Load Metamap Instance
    mm = MetaMap.get_instance(settings['load']['path']['metamap'])
    concepts, errors = mm.extract_concepts(sents, range(len(sents)))
    # Keep the sentence ids
    ids = np.array([int(concept[0]) for concept in concepts])
    sentences = []
    for i in xrange(len(sents)):
        tmp = {'sent_id': i+1, 'entities': [], 'relations': []}
        # Wanted concepts according to sentence
        wanted = np.where(ids == i)[0].tolist()
        for w_ind in wanted:
            w_conc = concepts[w_ind]
            if hasattr(w_conc, 'cui'):
                tmp_conc = {'label': w_conc.preferred_name, 'cui': w_conc.cui, 
                            'sem_types': w_conc.semtypes, 'score': w_conc.score}
                tmp['entities'].append(tmp_conc)
        sentences.append(tmp)
    if errors:
        time_log('Errors with extracting concepts!')
        time_log(errors)
    return {'sents': sentences, 'sent_text':text}
Project: medknow    Author: kbogas
def reverb_wrapper(text, stop=None):
    """
    Function-wrapper for ReVerb binary. Extracts relations
    found in text.
    Input:
        - text: str,
        a piece of text or sentence
        - stop: list,
        list of stopwords to remove from the relations
    Output:
        - total: list,
        list of lists. Each inner list contains one relation in the form
        [subject, predicate, object]
    """
    total = []
    for sent in sent_tokenize(text):
        cmd = 'echo "' + sent + '"' "| ./reverb -q | tr '\t' '\n' | cat -n"
        reverb_dir = settings['load']['path']['reverb']
        result = runProcess(cmd, reverb_dir)
        # Extract relations from reverb output
        result = result[-3:]
        result = [row.split('\t')[1].strip('\n') for row in result]
        # Remove common stopwords from relations
        if stop:
            result = [stopw_removal(res, stop) for res in result]
        total.append(result)
    # Remove empty relations
    total = [t for t in total if t]
    return total
Project: medknow    Author: kbogas
def extract_entities(text, json_={}):
    """
    Extract entities from a given text using metamap and
    generate a json, preserving info regarding the sentence
    of each entity that was found. For the time being, we preserve
    both concepts and the entities related to them
    Input:
         - text: str,
        a piece of text or sentence
        - json_: dic,
        sometimes the json to be returned is given to us to be enriched
        Defaults to an empty json_
    Output:
        - json_: dic,
        json with fields text, sents, concepts and entities
        containing the final results
    """
    json_['text'] = text
    # Tokenize the text
    sents = sent_tokenize(text)
    json_['sents'] = [{'sent_id': i, 'sent_text': sent} for i, sent in enumerate(sents)]
    json_['concepts'], _ = mmap_extract(text)
    json_['entities'] = {}
    for i, sent in enumerate(json_['sents']):
        ents = metamap_ents(sent)
        json_['entities'][sent['sent_id']] = ents
    return json_
Project: medknow    Author: kbogas
def reverb_wrapper(text, stop=None):
    """
    Function-wrapper for ReVerb binary. Extracts relations
    found in text.
    Input:
        - text: str,
        a piece of text or sentence
        - stop: list,
        list of stopwords to remove from the relations
    Output:
        - total: list,
        list of lists. Each inner list contains one relation in the form
        [subject, predicate, object]
    """
    total = []
    for sent in sent_tokenize(text):
        cmd = 'echo "' + sent + '"' "| ./reverb -q | tr '\t' '\n' | cat -n"
        reverb_dir = settings['load']['path']['reverb']
        result = runProcess(cmd, reverb_dir)
        # Extract relations from reverb output
        result = result[-3:]
        result = [row.split('\t')[1].strip('\n') for row in result]
        # Remove common stopwords from relations
        if stop:
            result = [stopw_removal(res, stop) for res in result]
        total.append(result)
    # Remove empty relations
    total = [t for t in total if t]
    return total
Project: SocialNPHS    Author: SocialNPHS
def tweet_connotation(tweet):
    """ Decide whether a tweet is generally positive or negative """
    anlyzr = SentimentIntensityAnalyzer()
    # break the tweet up into sentences and analyze each separately
    twtcontent = sent_tokenize(tweet)
    overall = {'compound': 0, 'neg': 0, 'neu': 0, 'pos': 0}
    for s in twtcontent:
        scores = anlyzr.polarity_scores(s)
        # tally up each sentence's overall tone
        for i, z in enumerate(scores):
            overall[z] += scores[z]
    # average it all together for the tweet as a whole
    for v in overall:
        overall[v] = round(overall[v] / len(twtcontent), 3)
    return overall
Project: scrapyProject    Author: bedcode
def tokenize_into_opinion_units(text):
    output = []
    for str in sent_tokenize(text):
        for output_str in str.split(' but '):
            output.append(output_str)
    return output

#Take positive.csv and negative.csv and mix them into
#positiveandnegative.csv
#This has each unit tagged with its booking.com sentiment
#This is the data I tagged with Mechanical Turk
Project: Personal_AI_Assistant    Author: PratylenClub
def ask_confirmation(self,best_matching_action):
        alternative_formulations = sent_tokenize(self.trigger_dict[best_matching_action])
        alternative_formulation = choice(alternative_formulations)
        self.speak("Excuse me, I didn't understand your request very well. Do you want me to "+alternative_formulation)
        answer = self.active_listen()
        if "no" in answer:
            self.speak("Please reformulate your request.")
            return 0
        if "yes" in answer:
            self.speak("Very good")
            return 1
Project: Personal_AI_Assistant    Author: PratylenClub
def ask_confirmation(self,best_matching_action):
        alternative_formulations = sent_tokenize(self.trigger_dict[best_matching_action])
        alternative_formulation = choice(alternative_formulations)
        self.speak("Excuse me, I didn't understand your request very well. Do you want me to "+alternative_formulation)
        answer = self.active_listen()
        if "no" in answer:
            self.speak("Please reformulate your request.")
            return 0
        if "yes" in answer:
            self.speak("Very good")
            return 1
Project: Search-Engine    Author: SoufianEly
def stem_and_tokenize_text(text):
    sents = sent_tokenize(text)
    tokens = list(itertools.chain(*[TreebankWordTokenizer().tokenize(sent) for sent in sents]))
    terms = [Term(token) for token in tokens]
    return filter(lambda term: not term.is_punctuation(), terms)
Project: FYP-AutoTextSum    Author: MrRexZ
def convert_text2bin1(docs, writer):
        global counter
        for i, fi in enumerate(docs):
            with open(os.path.join(curdir,"input","cnn","stories",fi),'r', encoding="UTF-8") as f:
                wholetext=f.read().lower()
                wholetext=re.sub(r'[^\x00-\x7F]+','', wholetext)
                wholetext=re.sub(r"(\s?[\']\s+|\s+[\']\s?)"," ' ", wholetext)
                wholetext=re.sub(r'(\s?[\"]\s+|\s+[\"]\s?)',' " ', wholetext)
                wholetext=re.sub(r"(\'[s]\s+)"," 's ", wholetext)
                wholetext=wholetext.replace("."," . ")
                wholetext=wholetext.replace(","," , ")
                wholetext=wholetext.replace('-',' - ')
                wholetext=wholetext.replace('?',' ? ')
                wholetext=wholetext.replace('(','( ')
                wholetext=wholetext.replace(')',' )')
                data=wholetext.split("@highlight")
                news=data[0]
                highlights=data[1].replace('\n\n','')
                news=(" ".join(news.split('\n\n'))).strip()
                sentences = sent_tokenize(news)
                news = '<d> <p> ' + ' '.join(['<s> ' + sentence + ' </s>' for sentence in sentences]) + ' </p> </d>'
                highlights = '<d> <p> <s> ' + highlights + ' </s> </p> </d>'
                words = (news+" "+highlights).split()
                counter.update(words)
                tf_example = example_pb2.Example()
                tf_example.features.feature['article'].bytes_list.value.extend([(' '.join(news.split())).encode('utf-8')])
                tf_example.features.feature['abstract'].bytes_list.value.extend([(' '.join(highlights.split())).encode('utf-8')])
                tf_example_str = tf_example.SerializeToString()
                str_len = len(tf_example_str)
                writer.write(struct.pack('q', str_len))
                writer.write(struct.pack('%ds' % str_len, tf_example_str))
                if i%3000==0:
                    print(int((float(i)/ len(docs))*100), "%")
        print((float(len(docs))/ len(docs))*100, "%...." "converted\n\n")
Project: FYP-AutoTextSum    Author: MrRexZ
def convert_text2bin2(docs, writer):
        global counter
        for i, fi in enumerate(docs):
            with open(os.path.join(curdir,"input","dailymail","stories",fi),'r', encoding="UTF-8") as f:
                wholetext=f.read().lower()
                wholetext=re.sub(r'[^\x00-\x7F]+','', wholetext)
                wholetext=re.sub(r"(\s?[\']\s+|\s+[\']\s?)"," ' ", wholetext)
                wholetext=re.sub(r'(\s?[\"]\s+|\s+[\"]\s?)',' " ', wholetext)
                wholetext=re.sub(r"(\'[s]\s+)"," 's ", wholetext)
                wholetext=wholetext.replace("."," . ")
                wholetext=wholetext.replace(","," , ")
                wholetext=wholetext.replace('-',' - ')
                wholetext=wholetext.replace('?',' ? ')
                wholetext=wholetext.replace('(','( ')
                wholetext=wholetext.replace(')',' )')
                data=wholetext.split("@highlight")
                news=data[0]
                try:
                    news=news.split("updated:")[1]
                    news=news[news.find('20')+4:]
                except:
                    None
                news=(" ".join(news.split('\n'))).strip()
                highlights=data[1].replace('\n\n','')
                news=(" ".join(news.split('\n\n'))).strip()
                sentences = sent_tokenize(news)
                news = '<d> <p> ' + ' '.join(['<s> ' + sentence + ' </s>' for sentence in sentences]) + ' </p> </d>'
                highlights = '<d> <p> <s> ' + highlights + ' </s> </p> </d>'
                words = (news+" "+highlights).split()
                counter.update(words)
                tf_example = example_pb2.Example()
                tf_example.features.feature['article'].bytes_list.value.extend([(' '.join(news.split())).encode('utf-8')])
                tf_example.features.feature['abstract'].bytes_list.value.extend([(' '.join(highlights.split())).encode('utf-8')])
                tf_example_str = tf_example.SerializeToString()
                str_len = len(tf_example_str)
                writer.write(struct.pack('q', str_len))
                writer.write(struct.pack('%ds' % str_len, tf_example_str))
                if i%3000==0:
                    print(int((float(i)/ len(docs))*100), "%")
        print((float(len(docs))/ len(docs))*100, "%...." "converted\n\n")
Project: QProb    Author: quant-trade
def text_cleaner(data):
    paragraphs_ = ""
    try:
        keep_endings = ['.', '?']

        removals_ = open(join(settings.BASE_DIR, "aggregator", 'data', 'stop_sentences.txt'), 'r')
        removals = [r.replace('\n', '') for r in removals_]

        if not (data is None):
            text = data.split('\n')
            paragraphs = []
            for p in text:
                if len(p) > settings.MINIMUM_PARAGRAPH:
                    paragraphs.append(p)

            for p in paragraphs:
                sentence_tokens = sent_tokenize(p)
                paragraph = ""
                for sentence in sentence_tokens:
                    if sentence[-1] in keep_endings:
                            if len(sentence) > settings.MINIMUM_SENTENCE:
                                #should remove most of the code:
                                if sentence[0].isupper():
                                    if not any(to_remove in sentence for to_remove in removals):
                                        #eliminate some bad ending strings:
                                        if not sentence.endswith(('e.g.', 'i.e.')):
                                            paragraph += "{0} ".format(sentence)
                paragraphs_ +=  "<p>{0}</p>".format(paragraph)
    except Exception as e:
        print(colored.red("At text_cleaner {}".format(e)))

    return paragraphs_
Project: context2vec    Author: orenmel
def write_paragraph_lines(paragraph_lines):
    paragraph_str = ' '.join(paragraph_lines)
    for sent in sent_tokenize(paragraph_str):
        if lowercase:
            sent = sent.lower()
        output_file.write(' '.join(word_tokenize(sent))+'\n')
Project: context2vec    Author: orenmel
def extract_target_context(self, paragraph, isolate_target_sentence):

        if isolate_target_sentence:
            for sent in sent_tokenize(paragraph):
                words, position = self.extract_context(sent)
                if words is not None:
                    break
        else:
            words, position = self.extract_context(paragraph)
        return words, position
Project: topicModelling    Author: balikasg
def doc_to_ids(self, doc, training=True):
        l = []
        words = dict()
        doc_sents = sent_tokenize(doc)
        for sentence in doc_sents:
            miniArray = []
            for term in sentence.split():
                id = self.term_to_id(term, training)
                if id != None:
                    miniArray.append(id)
                    if not id in words:
                        words[id] = 1
                        self.docfreq[id] += 1 # It counts in how many documents a word appears. If it appears in only a few, remove it from the vocabulary using cut_low_freq()
            l.append(np.array(miniArray, dtype=np.int32))
        return l
Project: textkit    Author: learntextvis
def text2sentences(text):
    '''Tokenize text into sentence tokens.'''
    content = '\n'.join([open(f).read() for f in text])
    sentences = []
    try:
        sentences = sent_tokenize(content)
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(s.strip()) for s in sentences]
Project: markov_bot    Author: 18F
def make_phrases(self, start = 1, end = None):
        if not end: end = start + 1

        for chain_len in range(start, end): # +1 because of the way range works
            self.phrases[chain_len] = []

            for f in self.everything['input']:
                for line in sent_tokenize( self.everything['input'][f] ):
                    words = word_tokenize(line)

                    for chain in self._make_chains(words, chain_len):

                        try:
#                           print "ERROR.0:", chain
                            chain = chain[:-1] # drop last item in chain as it's "value" for markov
                            chain = [c for c in chain if c is not None] # quick clean as None is breaking join
                        except: 
                            print "ERROR.1:", chain
#                           sys.exit(-1)

#                       print chain_len, " => ", chain

                        try:
                            self.phrases[chain_len].append(" ".join(chain) )
                        except:
                            print "ERROR.2:", chain
                            sys.exit(-1)

            return Counter( self.phrases[chain_len] )
Project: Stockeye    Author: anfederico
def buildGraph(text):
    vertices = [] 
    sentences = sent_tokenize(text, language='english')
    for sentence_raw in sentences:  
        sentence_processed = sub("[^a-zA-Z ]+", '', sentence_raw).lower()          
        words = word_tokenize(sentence_processed, language='english')
        vertices.append(vertex(sentence_raw, sentence_processed, words))

    for v1 in vertices:
        for v2 in vertices:
            if v1.order != v2.order:                
                v1.scores.append(overlap(v1.words, v2.words))
        v1.averageScores()
    return vertices
Project: twitter-sentiment    Author: words-sdsc
def updateSentiment(dbLoc, tableName):
    sid = SentimentIntensityAnalyzer()
    conn = sqlite3.connect(dbLoc)

    cursor = conn.execute("SELECT * from %s" % tableName)

    # Go through every sentence
    for row in cursor:
        text = cleanTweet(row[TWEET_INDEX])
        #blob = TextBlob(text)

        sent = 0.0
        count = 0
        sentList = tokenize.sent_tokenize(text)

        # Go through each sentence in tweet
        for sentence in sentList:
            count += 1
            ss = sid.polarity_scores(sentence)
            sent += ss['compound']  # Tally up the overall sentiment

        if count != 0:
            sent = float(sent / count)

        # Update into DB
        conn.execute("UPDATE " + tableName + " set SENTIMENT = ? where ID = ?", \
                (sent, row[ID_INDEX]))


    conn.commit()
    conn.close()
Project: twitter-sentiment    Author: words-sdsc
def getSentiment(tweet):
    sid = SentimentIntensityAnalyzer()
    tweet = cleanTweet(tweet)
    sent = 0.0
    count = 0
    sentList = tokenize.sent_tokenize(tweet)

    # Go through each sentence in tweet
    for sentence in sentList:
        count += 1
        ss = sid.polarity_scores(sentence)
        sent += ss['compound']  # Tally up the overall sentiment

    if count != 0:
        sent = float(sent / count)

    return sent

# Update the sentiment
Project: PySummarizer    Author: musikalkemist
def _preprocess(self, text):
        """ Return a list of lists. Each list is a preprocessed sentence of 
            text in bag-of-words format."""

        stemmer = PorterStemmer()
        self._sents = sent_tokenize(text)
        # tokenize sentences
        word_sents = [word_tokenize(sent.lower()) for sent in self._sents]
        # remove stop-words and stem words
        word_sents = [[stemmer.stem(word) for word in sent if 
            word not in self._stopwords] for sent in word_sents]
        return word_sents
Project: BioNLP-2016    Author: cambridgeltl
def text_to_sentences(self, text, tokenizer, remove_stopwords=False ):
        print "text_to_sentence"
        #from nltk.tokenize import wordpunct_tokenize
        # Function to split a review into parsed sentences. Returns a 
        # list of sentences, where each sentence is a list of words
        #
        text=text.decode("utf8")
        from nltk.tokenize import sent_tokenize,wordpunct_tokenize
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        #raw_sentences = tokenizer.tokenize(text.strip())
        raw_sentences = sent_tokenize(text.strip())
        print "finish tokenize sentence",len(raw_sentences)
        #
        # 2. Loop over each sentence
        sentences = []
        for raw_sentence in raw_sentences:

            #print "sentence:",raw_sentence
            # If a sentence is empty, skip it
            if len(raw_sentence) > 0:
                # Otherwise, call review_to_wordlist to get a list of words
                #sentences.append( text_to_wordlist( raw_sentence, \
    #               remove_stopwords ))
                #print removePunctuation(raw_sentence).lower().split()
                print raw_sentence
                sentences.append(wordpunct_tokenize(raw_sentence))#raw_sentence.split())
                print wordpunct_tokenize(raw_sentence)
                #print  text_to_wordlist( raw_sentence, remove_stopwords )
        #    
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists
        return sentences
Project: e2e-coref    Author: kentonl
def create_example(text):
  raw_sentences = sent_tokenize(text)
  sentences = [word_tokenize(s) for s in raw_sentences]
  speakers = [["" for _ in sentence] for sentence in sentences]
  return {
    "doc_key": "nw",
    "clusters": [],
    "sentences": sentences,
    "speakers": speakers,
  }
Project: django-summarizer    Author: zsharique
def getSentences(paragraph):
    """
    Extracts sentences from a paragraph
    :param paragraph: (str) paragraph text
    :returns: list of sentences
    """
    indexed = {}
    i = 0
    sentenceList = tokenize.sent_tokenize(paragraph)
    for s in sentenceList:
        indexed[i] = s
        i += 1
    return sentenceList, indexed