Python sklearn.feature_extraction.text module: CountVectorizer() example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.feature_extraction.text.CountVectorizer().
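
Before the project-specific examples, here is a minimal, self-contained sketch of the basic CountVectorizer workflow (the toy corpus and variable names are illustrative only):

from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
]

vectorizer = CountVectorizer()         # defaults: lowercase, word tokens of 2+ characters
X = vectorizer.fit_transform(corpus)   # sparse document-term matrix of shape (2, n_terms)

print(vectorizer.get_feature_names())  # sorted vocabulary (get_feature_names_out() in newer scikit-learn)
print(X.toarray())                     # per-document term counts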

Project: newsrecommender    Author: Newsrecommender
def get_vectorizer(self, ngram_range=(1, 3), min_df=2, max_df=1.0):
        """
        Define a binary CountVectorizer (Feature Presence) using n-grams and min and max document frequency
        :param ngram_range: n-grams are created for every n within this range
        :param min_df: min document frequency of features
        :param max_df: max document frequency of features
        :return: a configured CountVectorizer (FP) or TfidfVectorizer (TF-IDF)
        """
        if self.is_weight == 'FP':  # Feature Presence
            vectorizer = CountVectorizer(ngram_range=ngram_range,
                                         tokenizer=self.tokenize,
                                         min_df=min_df,
                                         max_df=max_df,
                                         binary=True,
                                         stop_words='english')

        if self.is_weight == 'TF-IDF':  # TF-IDF weighting
            vectorizer = TfidfVectorizer(ngram_range=ngram_range,
                                         tokenizer=self.tokenize,
                                         min_df=min_df,
                                         max_df=max_df,
                                         binary=True,
                                         stop_words='english')
        return vectorizer
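
A quick illustration of what binary=True (the 'FP' branch above) does: term counts are clipped to presence/absence. The toy documents are illustrative only.

from sklearn.feature_extraction.text import CountVectorizer

docs = ["spam spam ham", "ham"]
fp = CountVectorizer(binary=True).fit_transform(docs).toarray()
# vocabulary is ['ham', 'spam']; fp == [[1, 1], [1, 0]] even though "spam" occurs twice in the first document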
Project: search_relevance    Author: rmanak
def getTFV(token_pattern = token_pattern,
           norm = tfidf__norm,
           max_df = tfidf__max_df,
           min_df = tfidf__min_df,
           ngram_range = (1, 1),
           vocabulary = None,
           stop_words = 'english'):
    tfv =TfidfVectorizer(min_df=min_df, max_df=max_df, max_features=None, 
                         strip_accents='unicode', analyzer='word', 
                         token_pattern=token_pattern,
                         ngram_range=ngram_range, use_idf=True, 
                         smooth_idf=True, sublinear_tf=True,
                         stop_words = stop_words, norm=norm, vocabulary=vocabulary)
    return tfv   


#========= CountVectorizer =========#
Project: Papyrus--simple-but-effective-text-summarization-tool    Author: RebeccaMerrett
def function_2(text):
    paragraphs = text.split('\n\n')
    count_vect = CountVectorizer()
    bow_matrix = count_vect.fit_transform(paragraphs)
    normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)
    similarity_graph = normalized_matrix * normalized_matrix.T #term frequency/inverse doc frequency applied
    similarity_graph.toarray()
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph) #TextRank applied
    ranked = sorted(((scores[i],s) for i,s in enumerate(paragraphs)), reverse=True) #Sorts all paragraphs from highest to lowest scores
    ten_percent = int(round(10.00/100.00 * len(ranked)))
    ten_percent_high_scores = ranked[0:ten_percent]
    summary = [x[1] for x in ten_percent_high_scores] #Takes top 10%, so the paragraphs with the highest scores (does not disturb the rank order)
    return "\n\n".join(summary)

#Text taken from the user's uploaded PDF or URL, cleaned and formatted.
Project: search_relevance    Author: rmanak
def getBOW(token_pattern = token_pattern,
           max_df = bow__max_df,
           min_df = bow__min_df,
           ngram_range = (1, 1),
           vocabulary = None,
           stop_words = 'english'):
    bow =CountVectorizer(min_df=min_df, max_df=max_df, max_features=None, 
                         strip_accents='unicode', analyzer='word', 
                         token_pattern=token_pattern,
                         ngram_range=ngram_range,
                         stop_words = stop_words, vocabulary=vocabulary)
    return bow     


########################################################

# --------------------------------
# Simple text cleaning using
#     - a replacement dict, or
#     - a WordReplacer object
# --------------------------------
Project: linkedin_recommend    Author: duggalr2
def predict_job(job_list):
    """Assign a classification to a url"""
    # TODO: Add case where len is 1 or 0....
    job_list = [job for j in job_list for job in j]
    new_job_list = [regex.tokenize_and_stem(i) for i in job_list]
    new_job_list = [' '.join(job) for job in new_job_list]
    vect = CountVectorizer()
    x_series = pd.Series(X)
    X_train_dtm = vect.fit_transform(x_series)
    y_train = pd.Series(y)
    job_list_series = pd.Series(new_job_list)
    job_list_dtm = vect.transform(job_list_series)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred = nb.predict(job_list_dtm)
    # for i in range(len(job_list)):
    #     print(job_list[i], y_pred[i])
    return y_pred

# print(predict_job([('Founder',), ('Founder',), ('Architect & Full-stack developer',), ('Senior Engineer',), ('Technical Consultant',)]))
Project: quoll    Author: LanguageMachines
def run(self):
        all_file_names = []
        all_labels = []

        for n, folder_name in enumerate(os.listdir(self.in_txtdir().path)):

            full_folder_name = self.in_txtdir().path+'/'+folder_name

            if os.path.isfile(full_folder_name):
                continue

            for file_name in os.listdir(full_folder_name):
                all_labels.append(n)
                all_file_names.append(full_folder_name+'/'+file_name)

        vectorizer = CountVectorizer(input='filename')
        vector = vectorizer.fit_transform(all_file_names)
        numpy.save(self.out_npy().path,vector)
        numpy.save('labels',numpy.array(all_labels)) #Where and how do we want to save this?

#This is just to test the tasks above
Project: SNAP_R    Author: zerofox-oss
def gen_lstm_status(screen_name, timeline, short_url, depth):
    # Create a vector of words and their frequency on the user's timeline.
    # Experimentation shows that requiring a word to occur at least 4 * depth
    # times to be considered gives good results.
    with open("stopwords.txt", 'r') as stopwords_file:
        stopwords = [line.strip() for line in stopwords_file]
    processed_timeline_text = [preprocess_post(post) for post in timeline]

    vectorizer = CountVectorizer(min_df=4*depth, stop_words=stopwords)
    X = vectorizer.fit_transform(processed_timeline_text)
    vocab = vectorizer.get_feature_names()
    topic = random.choice(vocab)

    # Generates a status using a helper bash script.
    proc = subprocess.Popen([NN_SAMPLE_COMMAND, topic], stdout=subprocess.PIPE)
    status = topic + " " + proc.stdout.read().split("\n")[-2].strip()
    return "@" + screen_name + " " + status + " " + short_url
Project: ModelFlow    Author: yuezPrincetechs
def count_features(self,X,verbose=False):
        '''
        For each estimator in self.estimators_, count how often every feature in self.columns
        appears on the decision path of each sample.
        X: a DataFrame whose columns must include self.columns.
        Returns a list with one DataFrame per estimator; each is indexed like X, has self.columns
        as its columns, and holds the per-sample feature counts.
        '''
        result=[]
        for i,estimator in enumerate(self.estimators_):
            tmp=pd.Series(estimator.apply(X[self.columns]))
            tmp.index=X.index
            tmp=tmp.map(lambda xx: ' '.join([yy[0] for yy in self.paths[i][xx]]))
            vect=CountVectorizer(vocabulary=self.columns,lowercase=False)
            tmp=vect.transform(tmp).toarray()
            tmp=pd.DataFrame(tmp)
            vocabulary_inverse={vect.vocabulary_[key]:key for key in vect.vocabulary_}
            tmp.columns=[vocabulary_inverse[k] for k in range(tmp.shape[1])]
            tmp.index=X.index
            tmp.index.name=X.index.name
            tmp=tmp.fillna(0)
            result.append(tmp.copy())
            if verbose:
                print('Done:',i)
        return result
Project: uci-statnlp    Author: sameersingh
def textToTokens(text):
    """Converts input string to a corpus of tokenized sentences.

    Assumes that the sentences are divided by newlines (but will ignore empty sentences).
    You can use this to try out your own datasets, but it is not needed for reading the homework data.
    """
    corpus = []
    sents = text.split("\n")
    from sklearn.feature_extraction.text import CountVectorizer
    count_vect = CountVectorizer()
    count_vect.fit(sents)
    tokenizer = count_vect.build_tokenizer()
    for s in sents:
        toks = tokenizer(s)
        if len(toks) > 0:
            corpus.append(toks)
    return corpus
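
A small usage sketch for textToTokens (the input string is made up; note that the default CountVectorizer tokenizer keeps only tokens of two or more word characters and does not lowercase them):

text = "The quick brown fox.\nJumps over the lazy dog.\n\n"
corpus = textToTokens(text)
# [['The', 'quick', 'brown', 'fox'], ['Jumps', 'over', 'the', 'lazy', 'dog']]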
Project: texta    Author: texta-tk
def _vectorize_documents(self,method='tfidf',max_features=100):
        stop_words = []

        try:
            for lexicon_id in self.params['cluster_lexicons']:
                lexicon = Lexicon.objects.get(id=int(lexicon_id))
                words = Word.objects.filter(lexicon=lexicon)
                stop_words+=[word.wrd for word in words]
        except KeyError:
            pass

        if method == 'count':
            vectorizer = CountVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)
        if method == 'tfidf':
            vectorizer = TfidfVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)

        document_vectors = vectorizer.fit_transform(self.documents)
        document_vectors = document_vectors.toarray()

        return document_vectors,vectorizer.get_feature_names()
Project: vec4ir    Author: lgalke
def __init__(self, match_fn=TermMatch, binary=True, dtype=np.bool_,
                 **cv_params):
        """initializes a Matching object

        :match_fn: A matching function of signature `docs, query`
                    -> indices of matching docs
        :binary: Store only binary term occurrences.
        :dtype: Data type of internal feature matrix
        :cv_params: Parameter for the count vectorizer such as lowercase=True

        """
        # RetrievalBase.__init__(self)

        self._match_fn = match_fn
        self._vect = CountVectorizer(binary=binary, dtype=dtype,
                                     **cv_params)
Project: vec4ir    Author: lgalke
def is_embedded(sentence, embedding, analyzer):
    """
    >>> embedding = ["a", "b", "c"]
    >>> queries =  ["a b c", "a", "b", "c", "a b c d", "d", "a b c"  ]
    >>> analyzer = lambda x: x.split()
    >>> [query for query in queries if is_embedded(query, embedding, analyzer)]
    ['a b c', 'a', 'b', 'c', 'a b c']
    >>> analyzer = CountVectorizer().build_analyzer()
    >>> [query for query in queries if is_embedded(query, embedding, analyzer)]
    ['a b c', 'a', 'b', 'c', 'a b c']
    """
    for word in analyzer(sentence):
        if word not in embedding:
            print("Dropping:", sentence, file=sys.stderr)
            return False

    return True
Project: newsrecommender    Author: Newsrecommender
def get_vectorizer(self, ngram_range=(1, 3), min_df=2, max_df=1.0):
        """
        Define a binary CountVectorizer (Feature Presence) using n-grams and min and max document frequency
        :param ngram_range: n-grams are created for every n within this range
        :param min_df: min document frequency of features
        :param max_df: max document frequency of features
        :return: a configured CountVectorizer (FP) or TfidfVectorizer (TF-IDF)
        """
        if self.is_weight == 'FP':  # Feature Presence
            vectorizer = CountVectorizer(ngram_range=ngram_range,
                                         tokenizer=self.tokenize,
                                         min_df=min_df,
                                         max_df=max_df,
                                         binary=True,
                                         stop_words='english')

        if self.is_weight == 'TF-IDF':  # TF-IDF weighting
            vectorizer = TfidfVectorizer(ngram_range=ngram_range,
                                         tokenizer=self.tokenize,
                                         min_df=min_df,
                                         max_df=max_df,
                                         binary=True,
                                         stop_words='english')
        return vectorizer
Project: BotValue-public    Author: arnauddelaunay
def train_feature_finder(self, training_db, clf):
        training_sentences = []
        c = 0
        training_classes = []
        self.class_names = []
        self.vectorizer = CountVectorizer(analyzer = "word",   \
                              tokenizer = None,    \
                              preprocessor = None, \
                              stop_words = None,   \
                              max_features = 500)
        for key, value in training_db.iteritems():
            training_sentences += value
            training_classes += [c for i in range(len(value))] 
            c+=1
            self.class_names.append(key)
        train_data_features = self.vectorizer.fit_transform(training_sentences)
        train_data_features = train_data_features.toarray()
        clf = clf.fit( train_data_features, training_classes)
        return clf
Project: nlp-chinese_text_classification    Author: iamiamn
def getDatas(dataset_dir_name):
    movie_reviews = load_files(dataset_dir_name)

    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size = 0.2, random_state = 0)

    # build_tokenizer() returns the CountVectorizer's word tokenizer, which splits raw text into word tokens
    vectorizer = CountVectorizer(binary = True, decode_error = u'ignore')
    word_tokenizer = vectorizer.build_tokenizer()


    # segment each document into a list of terms (Chinese word segmentation via getChList)
    doc_terms_list_train = list(getChList(doc_str) for doc_str in doc_str_list_train)
    doc_terms_list_test = list(getChList(doc_str) for doc_str in doc_str_list_test)


    return vectorizer, doc_str_list_train, doc_str_list_test,doc_class_list_train, doc_class_list_test, doc_terms_list_train
Project: data_programming    Author: kep1616
def run():
    py2neo.authenticate("localhost:7474","neo4j","neo4j1")
    graph = Graph("http://localhost:7474/db/data/")
    result=graph.data('''MATCH (n:Product)-[r:BELONGS_TO]->(c:Category) WITH n, rand() AS number RETURN n.name,n.description,n.catName order by number limit 3000''')
    st = ""

    for x in result:
        p=','.join(str(val).strip(string.punctuation) for (key,val) in x.items())
        st=st + p
        p=""
    vectorizer = CountVectorizer(strip_accents='ascii')
    tokenizer = vectorizer.build_tokenizer()
    preprocessor = vectorizer.build_preprocessor()

    tokens = set()


    for item in tokenizer(st):
        tokens.add(preprocessor(item))

    with codecs.open(path_config.PERSONAL_WORD_DICTIONARY_FILE, mode='wb', encoding='utf-8') as f:
        for token in tokens:
            f.write(token + '\n')
Project: magic    Author: pan-webis-de
def word_unigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    vectorizer = CountVectorizer(min_df=2,
                                 stop_words=get_stopwords(),
                                 preprocessor=preprocessor,
                                 ngram_range=(1, 1))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_unigrams', pipeline)
Project: Bayes    Author: krzjoa
def get_data():
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    vectorizer = CountVectorizer()

    categories = ['alt.atheism', 'talk.religion.misc',
                  'comp.graphics', 'sci.space']

    # Train set
    newsgroups_train = fetch_20newsgroups(subset='train',
                                          categories=categories, shuffle=True)
    X_train = vectorizer.fit_transform(newsgroups_train.data)
    y_train = newsgroups_train.target

    # Test set
    newsgroups_test = fetch_20newsgroups(subset='test',
                                         categories=categories, shuffle=True)
    X_test = vectorizer.transform(newsgroups_test.data)
    y_test = newsgroups_test.target

    return X_train, y_train, X_test, y_test
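
A usage sketch for get_data above, training a simple classifier on the vectorized newsgroups data (the classifier choice is illustrative, not part of the original project):

from sklearn.naive_bayes import MultinomialNB

X_train, y_train, X_test, y_test = get_data()
clf = MultinomialNB().fit(X_train, y_train)
print("test accuracy:", clf.score(X_test, y_test))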
Project: document_classification    Author: scotthlee
def process(self, df, x_name, y_name=None, ngrams=2, max_features=35000, method='counts', binary=True, sparse=False):
        #choosing the particular flavor of vectorizer
        if method == 'counts':
            vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace', binary=binary)
        elif method == 'tfidf':
            vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace')

        #fitting the vectorizer and converting the counts to an array
        full_fit = vectorizer.fit_transform(df[x_name])
        full_counts = full_fit.toarray()
        self.vocabulary_ = vectorizer.vocabulary_

        #passing the attributes up to the class instance
        self.data = df
        if sparse:
            full_counts = csr_matrix(full_counts)
        self.X = full_counts
        if y_name != None:
            self.y = np.array(df[y_name])
        return

    #splits the data into training and test sets; either called from process()
    #or on its own when your text is already vectorized and divided into x and y
Project: pantip-libr    Author: starcolon
def new(n_feature=128):
  vectorizer = CountVectorizer(
    encoding='utf-8',
    ngram_range=(1,1), # Unigram only
    max_features=n_feature, 
    binary=True
  )

  # Fill the gap (missing expected tags)
  # ---
  # Hypothesis: Some tags are somehow related so 
  # we smoothen the missing values with matrix factorisation.
  smoother = NMF(n_components=n_feature)

  # Binarise the vector's individual values 
  binariser = Binarizer(copy=True)

  # Count vectoriser => NMF as smoother => Binariser
  print(colored('Taghasher model created','yellow'))
  return [vectorizer,smoother,binariser]
Project: spice-hate_speech_detection    Author: futurice
def bag_of_words(messages, model=None, weighting=''):

    # TODO: Add stemming or base-form reduction here
    messages, stemmings2baseform =  texttools.stemming_messages_snowball(messages)

    # Create a new model for extracting text features if None is given
    if model is None:
        if weighting == 'tfidf':
            model = TfidfVectorizer()
        else:
            model = CountVectorizer()
        model.fit(messages)

    # Extract features
    x = model.transform(messages)

    return x
Project: scattertext    Author: JasonKessler
def test_build(self):
        newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
        count_vectorizer = CountVectorizer()
        X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
        corpus = CorpusFromScikit(
            X=X_counts,
            y=newsgroups_train.target,
            feature_vocabulary=count_vectorizer.vocabulary_,
            category_names=newsgroups_train.target_names,
            raw_texts=newsgroups_train.data
        ).build()
        self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
        self.assertEqual(corpus
                         .get_term_freq_df()
                         .assign(score=corpus.get_scaled_f_scores('alt.atheism'))
                         .sort_values(by='score', ascending=False).index.tolist()[:5],
                         ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
        self.assertGreater(len(corpus.get_texts()[0]), 5)
Project: scattertext    Author: JasonKessler
def test_build(self):
        from sklearn.datasets import fetch_20newsgroups
        from sklearn.feature_extraction.text import CountVectorizer
        newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
        count_vectorizer = CountVectorizer()
        X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
        term_doc_mat = TermDocMatrixFromScikit(
            X=X_counts,
            y=newsgroups_train.target,
            feature_vocabulary=count_vectorizer.vocabulary_,
            category_names=newsgroups_train.target_names).build()
        self.assertEqual(term_doc_mat.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
        self.assertEqual(term_doc_mat
                         .get_term_freq_df()
                         .assign(score=term_doc_mat.get_scaled_f_scores('alt.atheism'))
                         .sort_values(by='score', ascending=False).index.tolist()[:5],
                         ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
Project: tRECS    Author: TeeOhh
def make_lda(self, nt, iterations):
        # '''
        #   description: sets important attributes and creates the LDA model
        #   params:     nt: number of topics for LDA
        #               iterations: number of iterations for LDA
        #               dim: 2d or 3d graph
        #               threshold: minimum percentage of the maximum topic in a document which can be included in a "cluster"
        # '''

        self.nt = nt        


        self.cvectorizer = CountVectorizer(min_df=5, stop_words='english')
        cvz = self.cvectorizer.fit_transform(self.descriptions)

        # train an LDA model
        self.lda_model = lda.LDA(n_topics=nt, n_iter=iterations)
        self.X_topics_original = self.lda_model.fit_transform(cvz)

        #initialize current stuff
        self.X_topics_current = self.X_topics_original
        self.titles_current = self.titles_original
Project: political-ad-classifier    Author: BoudhayanBanerjee
def countvectorizer(inputpath=None, text=None):
    """
    docstring
    """
    vectorizer = CountVectorizer(min_df=1)
    if inputpath:
        filenames = [os.path.join(inputpath, file) for file in os.listdir(inputpath)]
        corpus = []
        for file in filenames:
            with open(file, 'r') as f:
                data = f.read()
                corpus.append(data)
    if text:
        corpus = text

    X = vectorizer.fit_transform(corpus)
    print(X.toarray())
    print(vectorizer.get_feature_names())
Project: text-analytics-with-python    Author: dipanjanS
def build_feature_matrix(documents, feature_type='frequency'):

    feature_type = feature_type.lower().strip()  

    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=1, 
                                     ngram_range=(1, 1))
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=1, 
                                     ngram_range=(1, 1))
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=1, 
                                     ngram_range=(1, 1))
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(documents).astype(float)

    return vectorizer, feature_matrix
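
A usage sketch for build_feature_matrix (toy documents, illustrative only):

documents = ["the cat sat", "the dog sat", "the cat ran"]
vectorizer, matrix = build_feature_matrix(documents, feature_type='binary')
print(matrix.shape)                    # (3, number_of_terms)
print(vectorizer.get_feature_names())  # the learned vocabulary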
Project: text-analytics-with-python    Author: dipanjanS
def build_feature_matrix(documents, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.0, max_df=1.0):

    feature_type = feature_type.lower().strip()  

    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, 
                                     ngram_range=ngram_range)
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(documents).astype(float)

    return vectorizer, feature_matrix
Project: text-analytics-with-python    Author: dipanjanS
def build_feature_matrix(documents, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.0, max_df=1.0):

    feature_type = feature_type.lower().strip()  

    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, 
                                     ngram_range=ngram_range)
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(documents).astype(float)

    return vectorizer, feature_matrix
Project: nlp    Author: lhyxcxy
def getTFIDF():
    """

    :return:
    """
    corpus,textList=getFenCiWords();
    vectorizer=CountVectorizer()#??????????????????????a[i][j] ??j??i???????
    transformer=TfidfTransformer()#??????????tf-idf??
    tfidf=transformer.fit_transform(vectorizer.fit_transform(corpus))#???fit_transform???tf-idf????fit_transform??????????
    word=vectorizer.get_feature_names()#????????????
    weight = tfidf.toarray()  # ?tf-idf?????????a[i][j]??j??i?????tf-idf??
    print "?" + str(len(weight)) + "???" + ",?" + str(len(word)) + "??"
    return weight, textList
    # for i in range(len(weight)):#???????tf-idf????????for??????????for?????????????
    #   print u"-------?????",i,u"??????tf-idf??------"
    # for j in range(len(word)):
    # print word[j],weight[i][j]
Project: IBRel    Author: lasigeBioTM
def __init__(self, corpus, pairtype, relations, modelname="mil_classifier.model", test=False, ner="goldstandard",
                 generate=True):
        super(MILClassifier, self).__init__()
        self.modelname = modelname
        self.pairtype = pairtype
        self.pairs = {}  # (e1.normalized, e2.normalized) => (e1, e2)
        self.instances = {}  # bags of instances (e1.normalized, e2.normalized) -> all instances with these two entities
        self.labels = {} # (e1.normalized, e2.normalized) => label (-1/1)
        self.bag_labels = []  # ordered list of labels for each bag
        self.bag_pairs = []  # ordered list of pair labels (e1.normalized, e2.normalized)
        self.data = []  # ordered list of bags, each is a list of feature vectors
        self.predicted = []  # ordered list of predictions for each bag
        self.resultsfile = None
        self.examplesfile = None
        self.ner_model = ner
        self.vectorizer = CountVectorizer(min_df=0.2, ngram_range=(1, 1), token_pattern=r'\b\w+\-\w+\b')
        self.corpus = corpus

        #self.vectorizer = TfidfVectorizer(min_df=0.2, ngram_range=(1, 1), token_pattern=r'\b\w+\-\w+\b', max_features=)
        #self.classifier = misvm.MISVM(kernel='linear', C=1.0, max_iters=20)
        self.classifier = misvm.sMIL(kernel='linear', C=1)
        #self.classifier = misvm.MissSVM(kernel='linear', C=100) #, max_iters=20)
        #if generate:
        #    self.generateMILdata(test=test, pairtype=pairtype, relations=relations)
Project: IBRel    Author: lasigeBioTM
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
        super(ScikitRE, self).__init__()
        self.modelname = relationtype + "_" + modelname
        self.relationtype = relationtype
        self.pairtype = relationtype
        self.corpus = corpus
        self.pairs = []
        self.features = []
        self.labels = []
        self.pred = []
        self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
        self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
        self.generate_data(corpus, modelname, relationtype)
        self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3,20), min_df=0.0, max_df=0.7)),
                                  #('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
                                  #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                                  #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                                  #('clf', SGDClassifier())
                                  #('clf', svm.NuSVC(nu=0.01 ))
                                   #('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
                                  ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
                                  #('clf', DummyClassifier(strategy="constant", constant=True))
                                 ])
Project: TextClassification    Author: mosu027
def tfidf_feature(xtrain, xtest, stopwords_path):
    """
    tf-idf feature
    """
    xtrain = [" ".join(word) for word in xtrain]
    xtest = [" ".join(word) for word in xtest]
    stopwords = codecs.open(stopwords_path, 'r', encoding='utf-8').readlines()
    stopwords = [word.strip("\n") for word in stopwords]
    vectorizer_train = CountVectorizer(analyzer='word', stop_words=stopwords,min_df=5)
    count_train = vectorizer_train.fit_transform(xtrain)
    vectorizer_test = CountVectorizer(vocabulary=vectorizer_train.vocabulary_)
    count_test = vectorizer_test.fit_transform(xtest)

    transformer = TfidfTransformer()
    tfidf_train = transformer.fit(count_train).transform(count_train)
    tfidf_test = transformer.fit(count_test).transform(count_test)

    return tfidf_train.toarray(),tfidf_test.toarray()
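
Note on the snippet above: building a second CountVectorizer with vocabulary=vectorizer_train.vocabulary_ produces the same count matrix as reusing the fitted training vectorizer directly, which is the more common idiom (a one-line sketch):

count_test = vectorizer_train.transform(xtest)  # equivalent to the vocabulary-copying approach above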
Project: flexmatcher    Author: biggorilla-gh
def __init__(self, ngram_range=(1, 1), analyzer='word', count=True,
                 n_features=200):
        """Initializes the classifier.

        Args:
            ngram_range (tuple): Pair of ints specifying the range of ngrams.
            analyzer (string): Determines what type of analyzer to be used.
            Setting it to 'word' will consider each word as a unit of language
            and 'char' will consider each character as a unit of language.
            count (boolean): Determines if features are counts of n-grams
            versus a binary value encoding if the n-gram is present or not.
            n_features (int): Maximum number of features used.
        """
        # checking what type of vectorizer to create
        if count:
            self.vectorizer = CountVectorizer(analyzer=analyzer,
                                              ngram_range=ngram_range,
                                              max_features=n_features)
        else:
            self.vectorizer = HashingVectorizer(analyzer=analyzer,
                                                ngram_range=ngram_range,
                                                n_features=n_features)
Project: million-post-corpus    Author: OFAI
def evaluate(cat, fold, txt_train, txt_test, y_train, y_test):
    fe = CountVectorizer(
        preprocessor=normalize,
        tokenizer=micro_tokenize,
        binary=True,
    )
    predictor = NBSVM_predictor(
        kernel=conf.SVM_KERNEL,
        class_weight=conf.SVM_CLWEIGHT,
        C=conf.SVM_C,
    )
    fe.fit(txt_train)
    X = fe.transform(txt_train)
    predictor.fit(X, y_train)
    X_test = fe.transform(txt_test)
    y_pred = predictor.predict(X_test)

    return y_pred
Project: million-post-corpus    Author: OFAI
def evaluate(cat, fold, txt_train, txt_test, y_train, y_test):
    fe = CountVectorizer(
        preprocessor=normalize,
        tokenizer=micro_tokenize,
        binary=True,
    )
    predictor = SVC(
        kernel=conf.SVM_KERNEL,
        class_weight=conf.SVM_CLWEIGHT,
        C=conf.SVM_C,
        random_state=conf.SEED,
    )
    fe.fit(txt_train)
    X = fe.transform(txt_train)
    predictor.fit(X, y_train)
    X_test = fe.transform(txt_test)
    y_pred = predictor.predict(X_test)

    return y_pred
Project: LLString    Author: mitll
def compute_VwS(self,s):
        """ Compute V(w,S) as defined by Cohen et al.'s IJCAI03 paper """
        # Get term-frequency vectors and vocab list for string
        cv = CountVectorizer(min_df = 0.0, token_pattern=u'(?u)\\b\\w+\\b')
        tf = cv.fit_transform([s]); tf = tf.tocsr()
        vocab = cv.vocabulary_

        # Compute V(w,S) for string
        vprime_ws = dict()
        vprime_ws_norm = 0
        for w in vocab:
            if w in self.CORPUS_VOCAB:
                vprime_ws[w] = math.log(tf[0,vocab[w]]+1)*self.LOG_IDF[self.CORPUS_VOCAB[w]]
            else:
                vprime_ws[w] = math.log(tf[0,vocab[w]]+1)*self.OOV_IDF_VAL  # if not in the corpus vocabulary, default to OOV_IDF_VAL
            vprime_ws_norm += vprime_ws[w]**2
        vprime_ws_norm = math.sqrt(vprime_ws_norm)

        return (vocab,vprime_ws,vprime_ws_norm)
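
For reference, the quantity computed above matches the SoftTFIDF weighting described in that paper; in the notation of the code (TF is the within-string term frequency, LOG_IDF the log inverse document frequency):

    V'(w, S) = \log(\mathrm{TF}_{w,S} + 1) \cdot \log(\mathrm{IDF}_w), \qquad
    V(w, S) = \frac{V'(w, S)}{\sqrt{\sum_{w' \in S} V'(w', S)^{2}}}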
Project: SpectralLDA-MXNet    Author: Mega-DatA-Lab
def bow_to_npy(vocabulary_fname, bow_fname, npy_fname):
    ''' Vectorize bag-of-words dump and save in NumPy file

    PARAMETERS
    -----------
    vocabulary_fname: str or Path
        Vocabulary text file name, with one word on each line.
    bow_fname: str or Path
        Bag-of-words .txt.gz file name. When uncompressed,
        each line represents a document with only lower-case words
        separated by space.
    npy_fname: str or Path
        NumPy .npy file name to write the word count vectors into.
    '''
    with Path(vocabulary_fname).open('r') as vocabulary_file:
        vocabulary = [line.strip() for line in vocabulary_file]

    vectorizer = CountVectorizer(vocabulary=vocabulary)
    with gzip.open(bow_fname, 'rt') as bow_file:
        word_counts = vectorizer.transform(bow_file)

    np.save(npy_fname, word_counts)
Project: Quadflor    Author: quadflor
def test_read_files(self):
        docs = ['Lorem ipsum', 'Lorem Lorem ipsum Dolor sit AMET', 'consectetur adipisici elit']
        thesaurus = {'13542-1': {'prefLabel': ['ipsum'], 'broader': ['0b'], 'related': ['0r'],
                                 'narrower': ['0n'], 'altLabel': []},
                     '13542-4': {'prefLabel': ['dolor'], 'broader': ['1b'], 'related': ['1r'],
                                 'narrower': ['1n'], 'altLabel': ['amet']},
                     }
        vocabulary = {'13542-1': 1, '13542-4': 0}
        fnames = []
        for doc in docs:
            file = NamedTemporaryFile(mode='w', delete=False)
            fnames.append(file.name)
            print(doc, file=file)
            file.close()  # flush to disk so CountVectorizer(input='filename') can read the contents
        cf = ConceptAnalyzer(thesaurus, input='filename')
        counter = CountVectorizer(analyzer=cf.analyze, vocabulary=vocabulary, input='filename')
        res = counter.fit_transform(fnames).todense()
        np.testing.assert_array_almost_equal(res, [[0, 1], [2, 1], [0, 0]])
Project: learn-to-select-data    Author: sebastianruder
def get_topic_distributions(examples, vectorizer, lda_model):
    """
    Retrieve the topic distributions of a collection of documents.
    :param examples: a list of tokenised documents
    :param vectorizer: the CountVectorizer used for transforming the documents
    :param lda_model: the trained LDA model
    :return: an array of shape (num_examples, num_topics) containing the topic
             distribution of each example
    """
    vectorized_corpus = vectorizer.transform(examples)
    gensim_corpus = gensim.matutils.Sparse2Corpus(vectorized_corpus,
                                                  documents_columns=False)
    topic_representations = []
    for doc in gensim_corpus:
        topic_representations.append(
            [topic_prob for (topic_id, topic_prob) in
             lda_model.get_document_topics(doc, minimum_probability=0.)])
    return np.array(topic_representations)
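
For context, a hypothetical setup for the helper above, pairing a fitted CountVectorizer with a gensim LDA model (train_docs, test_docs and the parameter values are assumptions, not taken from the project):

import gensim
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(train_docs)                    # train_docs: list of raw documents (assumed)
bow_corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)
id2word = {i: w for w, i in vectorizer.vocabulary_.items()}
lda_model = gensim.models.LdaModel(bow_corpus, id2word=id2word, num_topics=50)

topic_dists = get_topic_distributions(test_docs, vectorizer, lda_model)  # shape (len(test_docs), 50)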


# PRE-TRAINED WORD EMBEDDINGS METHODS
Project: GitHub-Recommender    Author: himangshunits
def get_word_counts(input_str, limit = 100):
        input_str = PreprocessManager.remove_non_ascii(input_str)
        wordnet_lemmatizer = WordNetLemmatizer()
        snowball_stemmer = EnglishStemmer()
        tokenized_text = CountVectorizer().build_tokenizer()(input_str.lower())
        tokenized_text = [word for word in tokenized_text if len(word) > 1]  # Filter some small words
        #tokenized_text = [word for word in tokenized_text if not word.isnumeric()]
        filtered_words = [word for word in tokenized_text if word not in stopwords.words('english')]
        stemmed_list = [wordnet_lemmatizer.lemmatize(w) for w in filtered_words]
        # Calculate frequency distribution
        frequency_dist = nltk.FreqDist(stemmed_list)

        # Output top 50 words
        result = dict()
        for word, frequency in frequency_dist.most_common(limit):
            # print(u'{};{}'.format(word, frequency))
            result[word] = frequency
        return result



    # This function just splits the words and gives the words that's all!
Project: feature_engineering    Author: webeng
def getModels(self):
        with open(self.data_path + '/categories.pkl', 'rb') as f:
            categories = cPickle.load(f)

        with open(self.data_path + '/category_map.pkl', 'rb') as f:
            category_map = cPickle.load(f)

        with open(self.data_path + '/article_classifier_model.pkl', 'rb') as f:
            clf = cPickle.load(f)

        count_vect = CountVectorizer()
        with open(self.data_path + '/count_vect.pkl', 'rb') as f:
            count_vect = cPickle.load(f)

        tfidf_transformer = TfidfTransformer()
        with open(self.data_path + '/tfidf_transformer.pkl', 'rb') as f:
            tfidf_transformer = cPickle.load(f)

        with open(self.data_path + '/tree.pkl', 'rb') as f:
            tree = cPickle.load(f)

        return categories, category_map, clf, count_vect, tfidf_transformer, tree
Project: text-summarizer    Author: gaetangate
def get_topic_idf(self, sentences):
        vectorizer = CountVectorizer()
        sent_word_matrix = vectorizer.fit_transform(sentences)

        transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
        tfidf = transformer.fit_transform(sent_word_matrix)
        tfidf = tfidf.toarray()

        centroid_vector = tfidf.sum(0)
        centroid_vector = np.divide(centroid_vector, centroid_vector.max())
        # print(centroid_vector.max())

        feature_names = vectorizer.get_feature_names()
        word_list = []
        for i in range(centroid_vector.shape[0]):
            if centroid_vector[i] > self.topic_threshold:
                # print(feature_names[i], centroid_vector[i])
                word_list.append(feature_names[i])

        return word_list
Project: search_relevance    Author: rmanak
def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))



########## Stemmer + CountVectorizer wrapper #############
Project: search_relevance    Author: rmanak
def build_analyzer(self):
        analyzer = super(CountVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))
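
These build_analyzer overrides are defined inside vectorizer subclasses in the original project; a self-contained sketch of the same pattern, assuming NLTK's SnowballStemmer as the `stemmer`, could look like this:

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

stemmer = SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        # reuse the default analyzer (preprocessing + tokenization), then stem each token
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

vect = StemmedCountVectorizer()
X = vect.fit_transform(["running runs run", "runner running"])
print(vect.get_feature_names())  # stemmed vocabulary, e.g. ['run', 'runner']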


########## Defaults TF-IDF & Count Vectorizers ########


#======== TF-IDF Vectorizer =========#
Project: linkedin_recommend    Author: duggalr2
def train_test():
    """Identify accuracy via training set"""
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
    vect = CountVectorizer()
    X_train_dtm = vect.fit_transform(X_train)  # creates vocab set and dtm for each raw document!
    X_test_dtm = vect.transform(X_test)

    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)  # make class predictions for X_test_dtm
    # w = list(X_test)
    return metrics.accuracy_score(y_test, y_pred_class)

# print(train_test())
Project: geocoder-ie    Author: devgateway
def __init__(self):
        self.clf = LinearSVC()
        self.scores = []
        self.vectorizer = CountVectorizer(token_pattern=r'[A-z]+',  stop_words=english_stops,
                                          ngram_range=(1, 1))
Project: Abb1t    Author: k-freeman
def create_speech(self):
        self.speech = dict.fromkeys(self.archives,[]) 
        #blacklist=[] # ids to be ignored, not implemented yet
        self.vectorizer = dict.fromkeys(self.archives,[])
        self.mat = dict.fromkeys(self.archives,[])
        for key in self.speech:
            self.speech[key]=[[],[]] # messages / ids / (maybe timestamps?)
            self.vectorizer[key]=CountVectorizer(min_df=1)
            if key >=0:
                continue # why create dictionaries for private messages right now...
            logfile="{}.gz".format(os.path.join(self.logpath,str(key)))
            try:
                ziplines=gzip.open(logfile).read().decode("utf-8").strip("\r\n").split("\n")[-15000:]
            except IOError:
                print("{} not found".format(logfile))
                continue
            prev_id = -1
            for msg_line in ziplines:
                msg = Msg(json.loads(msg_line))
                text=msg.get_text()
                chat_id=msg.get_chat_id()
                if (key != chat_id):
                    input("Error in your logfile (key {} / chat {})!".format(key,chat_id))
                sent_id=msg.get_sent_id()
                if text and text[0] not in ["/","!"]  and msg.get_edit_date()==0 and not self.is_blacklisted(text) and (not self.find_name(text)) and chat_id and sent_id: #sadly, @like will come through
                    if sent_id == prev_id:
                        self.speech[key][0][-1]+="\n{}".format(text)
                    else:
                        self.speech[key][0].append(text)
                        self.speech[key][1].append(sent_id)
                    prev_id = sent_id
            if self.speech[key][0]:
                self.mat[key]=self.vectorizer[key].fit_transform(self.speech[key][0])
Project: LDA-REST    Author: valentinarho
def compute_tf(data, stopwords_list, language, use_lemmer=True, min_df=2, max_df=0.8):
    """
    Compute the tf matrix for the provided data
    :param language: 'en' or 'it'
    :param data:
    :param stopwords_list:
    :param use_lemmer:
    :param min_df:
    :param max_df:
    :return:
    """
    lemmer_tokenizer = None

    if use_lemmer:
        if language == 'it':
            lemmer_tokenizer = LemNormalizeIt
        else:
            lemmer_tokenizer = LemNormalize

    min_df = min_df if len(data) > min_df else 1
    max_df = max_df if max_df * len(data) >= min_df else 1.0

    # tf
    tf_vectorizer = CountVectorizer(tokenizer=lemmer_tokenizer,
                                    max_df=max_df, min_df=min_df,
                                    max_features=None,
                                    stop_words=stopwords_list,
                                    token_pattern="[a-zA-Z]{3,}")

    try:
        tf = tf_vectorizer.fit_transform(data)
        tf_features_names = tf_vectorizer.get_feature_names()
    except:
        logging.warning('The computed tf matrix is empty. Check stopwords.')
        tf = []
        tf_features_names = []

    return tf, tf_features_names
Project: scik-learn-learn-Chinese-text-classider    Author: chapzq77
def voc_count_bag(self):
        if (self.wordbag_path == "" or self.vocabulary_count_bag_name == "" or self.stopword_path ==""):
            print "wordbag_path(????????) or vocabulary_count_bag_name(?????????) or stopword_path(??????) can not be empty."
            return 
        file_obj = open(self.wordbag_path+self.trainset_name,'rb')
        self.data_set = pickle.load(file_obj)
        file_obj.close()
        # populate the attributes of vocabulary_count_bag
        self.vocabulary_count_bag.target_name = self.data_set.target_name
        self.vocabulary_count_bag.label =self.data_set.label
        self.vocabulary_count_bag.filenames =self.data_set.filenames
        corpus = self.data_set.content
        stopword_list = self.getstopword(self.stopword_path)
        # build the term-count matrix, removing stop words and limiting document frequency and vocabulary size
        vectorizer = CountVectorizer(stop_words=stopword_list, max_df=500, min_df=1,max_features=10000)
        y = vectorizer.fit_transform(corpus)
        self.vocabulary_count_bag.vcm = y
        self.vocabulary_count_bag.vcm_sum = y.toarray().sum(axis=0)
        self.vocabulary_count_bag.vocabulary = vectorizer.get_feature_names()
        if not os.path.exists(self.wordbag_path):
            os.makedirs(self.wordbag_path)
        file_obj1 = open(self.wordbag_path+self.vocabulary_count_bag_name,'wb')
        pickle.dump(self.vocabulary_count_bag,file_obj1)
        file_obj1.close()
        print "????????vocabulary_count_bag???wordbag_path???????vocabulary_count_bag_name??????"
        print "#######################################"

Project: textar    Author: datosgobar
def __init__(self, texts, ids, vocabulary=None, encoding='utf-8'):
        """Definido en la declaracion de la clase.

        Attributes:
            texts (list of str): Textos a clasificar.
            ids (list of str): Identificadores únicos para cada texto (debe
                tener la misma longitud que `texts`).
            vocabulary (list): Opcional. Vocabulario a tener en cuenta para la
                vectorización de los textos. Default: usa todas las palabras
                presentes en los textos, salvo los ES_stopwords.txt.
            encoding (str): Codificación de los textos en `texts` y en `ids`.
        """
        this_dir, this_filename = os.path.split(__file__)
        es_stopwords = pd.read_csv(os.path.join(this_dir, 'ES_stopwords.txt'),
                                   header=None, encoding='utf-8')
        es_stopwords = list(np.squeeze(es_stopwords.values))
        self._check_id_length(ids)
        self.vectorizer = CountVectorizer(
            input='content', encoding=encoding, decode_error='strict',
            strip_accents='ascii', lowercase=True, preprocessor=None,
            tokenizer=None, stop_words=es_stopwords, ngram_range=(1, 1),
            analyzer='word', max_df=0.8, min_df=1, max_features=None,
            vocabulary=vocabulary, binary=False)

        self.transformer = TfidfTransformer()
        self.ids = None  # Keeps an ordered list of text ids.
        self.term_mat = None  # Matrix of term counts per text.
        self.tfidf_mat = None  # Matrix of term relevance (tf-idf) weights.
        self.reload_texts(texts, ids)