Python sklearn.feature_extraction.text module: TfidfTransformer() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use sklearn.feature_extraction.text.TfidfTransformer().
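
For orientation before the project excerpts, here is a minimal, self-contained sketch of the usual CountVectorizer + TfidfTransformer workflow; the toy corpus and variable names are illustrative and not taken from any project below:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

corpus = ["the cat sat on the mat", "the dog sat on the log"]

counts = CountVectorizer().fit_transform(corpus)  # sparse term-count matrix
tfidf = TfidfTransformer().fit_transform(counts)  # re-weight the counts by tf-idf
print(tfidf.toarray())                            # dense (2, n_terms) array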

Project: Papyrus--simple-but-effective-text-summarization-tool    Author: RebeccaMerrett
def function_2(text):
    paragraphs = text.split('\n\n')
    count_vect = CountVectorizer()
    bow_matrix = count_vect.fit_transform(paragraphs)
    normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)
    similarity_graph = normalized_matrix * normalized_matrix.T #term frequency/inverse doc frequency applied
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph) #TextRank applied
    ranked = sorted(((scores[i],s) for i,s in enumerate(paragraphs)), reverse=True) #Sorts all paragraphs from highest to lowest scores
    ten_percent = int(round(10.00/100.00 * len(ranked)))
    ten_percent_high_scores = ranked[0:ten_percent]
    summary = [x[1] for x in ten_percent_high_scores] #Takes top 10%, so the paragraphs with the highest scores (does not disturb the rank order)
    return "\n\n".join(summary)

#Text taken from the user's uploaded PDF or URL, cleaned and formatted.
Project: AbTextSumm    Author: StevenLOL
def removeSimilarSentences(generatedSentences, originalSentences, stopwords, threshold=0.80):
    docs=[]
    for sent, sim in generatedSentences:
        docs.append(sent)
    docs.extend(originalSentences)

    bow_matrix = StemmedTfidfVectorizer(stop_words=stopwords).fit_transform(docs)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    #simMatrix = (normalized[0:] * normalized[0:].T).A
    simindices=[]
    #print 'Num original, ', len(originalSentences)
    for i in xrange(len(generatedSentences)):
        simGeneratedScores = linear_kernel(normalized[i], normalized[len(generatedSentences):]).flatten()
        if(max(simGeneratedScores) >= threshold):
            simindices.append(i)

    #print simindices
    finalGen=[sentence for k,sentence in enumerate(generatedSentences) if k not in simindices]
    #print len(generatedSentences), len(finalGen)
    return finalGen
Project: DataScience-And-MachineLearning-Handbook-For-Coders    Author: wxyyxc1992
def extract_feature(self):
        """
        Extract features from the training data.
        """

        # Build the document-term matrix of the training set
        self.train_dtm = self.count_vect.fit_transform(self.data['train'].data)

        # Compute plain TF features
        tf_transformer = TfidfTransformer(use_idf=False).fit(self.train_dtm)
        self.train_tf = tf_transformer.transform(self.train_dtm)

        # Compute TF-IDF features
        tfidf_transformer = TfidfTransformer().fit(self.train_dtm)
        self.train_tfidf = tfidf_transformer.transform(self.train_dtm)
Project: Emotion-Identification    Author: saopayne
def feature(terms):
    dataMatrix = np.genfromtxt(finaltest, delimiter='|', dtype=None, skip_header=True)
    n = dataMatrix.size
    l = len(terms)
    occurence = np.zeros((n, l), dtype=np.int)
    d = 0
    for row in dataMatrix:
        temp = row[0].lower().decode('UTF-8').split(' ')
        for i in range(l):
            if terms[i] in temp:
                occurence[d][i] += 1
        d += 1
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(occurence)
    occurence = tfidf.toarray()
    np.savetxt('occurencetest.csv',occurence,delimiter=',')

    return occurence, dataMatrix
Project: magic    Author: pan-webis-de
def word_unigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    vectorizer = CountVectorizer(min_df=2,
                                 stop_words=get_stopwords(),
                                 preprocessor=preprocessor,
                                 ngram_range=(1, 1))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_unigrams', pipeline)
Project: Graduation-design    Author: Baichenjia
def Training_model():
    # Load the training data (word-count matrix)
    f = open("f://emotion/mysite/weibo_emotion/emotion_file/data_count.txt")
    f.readline()   # skip the header line
    data = np.loadtxt(f)
    # Load the class labels
    f1 = open("f://emotion/mysite/weibo_emotion/emotion_file/data_jixing.txt")
    leibie = np.loadtxt(f1)
    f.close()
    f1.close()

    # TF-IDF transformation
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(data)
    data1 = tfidf.toarray()

    # Train the SVM classifier
    clf = svm.SVC()   # class
    clf.fit(data1, leibie)    # training the svc model
    return clf
Project: scattertext    Author: JasonKessler
def test_main(self):
        categories, documents = get_docs_categories()
        clean_function = lambda text: '' if text.startswith('[') else text
        entity_types = set(['GPE'])
        term_doc_mat = (
            TermDocMatrixFactory(
                category_text_iter=zip(categories, documents),
                clean_function=clean_function,
                nlp=_testing_nlp,
                feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
            ).build()
        )
        clf = PassiveAggressiveClassifier(n_iter=5, C=0.5, n_jobs=-1, random_state=0)
        fdc = FeatsFromDoc(term_doc_mat._term_idx_store,
                           clean_function=clean_function,
                           feats_from_spacy_doc=FeatsFromSpacyDoc(
                               entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
        tfidf = TfidfTransformer(norm='l1')
        X = tfidf.fit_transform(term_doc_mat._X)
        clf.fit(X, term_doc_mat._y)
        X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
        pred = clf.predict(tfidf.transform(X_to_predict))
        dec = clf.decision_function(X_to_predict)
Project: scattertext    Author: JasonKessler
def get_logistic_regression_coefs_l2(self, category,
                                         clf=RidgeClassifierCV()):
        ''' Computes l2-penalized logistic regression score.
        Parameters
        ----------
        category : str
            category name to score

        Returns
        -------
            (coefficient array, accuracy, majority class baseline accuracy)
        '''
        from sklearn.cross_validation import cross_val_predict
        y = self._get_mask_from_category(category)
        X = TfidfTransformer().fit_transform(self._X)
        clf.fit(X, y)
        y_hat = cross_val_predict(clf, X, y)
        acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
        return clf.coef_[0], acc, baseline
Project: scattertext    Author: JasonKessler
def get_logistic_regression_coefs_l1(self, category,
                                         clf=LassoCV(alphas=[0.1, 0.001],
                                                     max_iter=10000,
                                                     n_jobs=-1)):
        ''' Computes l1-penalized logistic regression score.
        Parameters
        ----------
        category : str
            category name to score

        Returns
        -------
            (coefficient array, accuracy, majority class baseline accuracy)
        '''
        from sklearn.cross_validation import cross_val_predict
        y = self._get_mask_from_category(category)
        y_continuous = self._get_continuous_version_boolean_y(y)
        # X = TfidfTransformer().fit_transform(self._X)
        X = self._X

        clf.fit(X, y_continuous)
        y_hat = (cross_val_predict(clf, X, y_continuous) > 0)
        acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
        return clf.coef_, acc, baseline
Project: nlp    Author: lhyxcxy
def getTFIDF():
    """
    Compute the TF-IDF weight matrix of the segmented corpus.
    :return: the weight matrix and the list of texts
    """
    corpus, textList = getFenCiWords()
    vectorizer = CountVectorizer()  # converts the corpus into a term-count matrix; a[i][j] is the count of word j in document i
    transformer = TfidfTransformer()  # computes the tf-idf weight of every word
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  # the outer fit_transform computes tf-idf, the inner one builds the count matrix
    word = vectorizer.get_feature_names()  # all words in the bag-of-words model
    weight = tfidf.toarray()  # the tf-idf matrix; a[i][j] is the tf-idf weight of word j in document i
    print "%d documents, %d words" % (len(weight), len(word))
    return weight, textList
    # to inspect the weights:
    # for i in range(len(weight)):
    #     print u"------- tf-idf weights of the words in document", i, u"------"
    #     for j in range(len(word)):
    #         print word[j], weight[i][j]
Project: IBRel    Author: lasigeBioTM
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
        super(ScikitRE, self).__init__()
        self.modelname = relationtype + "_" + modelname
        self.relationtype = relationtype
        self.pairtype = relationtype
        self.corpus = corpus
        self.pairs = []
        self.features = []
        self.labels = []
        self.pred = []
        self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
        self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
        self.generate_data(corpus, modelname, relationtype)
        self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3,20), min_df=0.0, max_df=0.7)),
                                  #('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
                                  #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                                  #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                                  #('clf', SGDClassifier())
                                  #('clf', svm.NuSVC(nu=0.01 ))
                                   #('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
                                  ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
                                  #('clf', DummyClassifier(strategy="constant", constant=True))
                                 ])
Project: TextClassification    Author: mosu027
def tfidf_feature(xtrain, xtest, stopwords_path):
    """
    tf-idf feature
    """
    xtrain = [" ".join(word) for word in xtrain]
    xtest = [" ".join(word) for word in xtest]
    stopwords = codecs.open(stopwords_path, 'r', encoding='utf-8').readlines()
    stopwords = [word.strip("\n") for word in stopwords]
    vectorizer_train = CountVectorizer(analyzer='word', stop_words=stopwords, min_df=5)
    count_train = vectorizer_train.fit_transform(xtrain)
    vectorizer_test = CountVectorizer(vocabulary=vectorizer_train.vocabulary_)
    count_test = vectorizer_test.fit_transform(xtest)

    transformer = TfidfTransformer()
    tfidf_train = transformer.fit(count_train).transform(count_train)
    # reuse the IDF fitted on the training counts instead of refitting on the test set
    tfidf_test = transformer.transform(count_test)

    return tfidf_train.toarray(),tfidf_test.toarray()
Project: feature_engineering    Author: webeng
def getModels(self):
        with open(self.data_path + '/categories.pkl', 'rb') as f:
            categories = cPickle.load(f)

        with open(self.data_path + '/category_map.pkl', 'rb') as f:
            category_map = cPickle.load(f)

        with open(self.data_path + '/article_classifier_model.pkl', 'rb') as f:
            clf = cPickle.load(f)

        count_vect = CountVectorizer()
        with open(self.data_path + '/count_vect.pkl', 'rb') as f:
            count_vect = cPickle.load(f)

        tfidf_transformer = TfidfTransformer()
        with open(self.data_path + '/tfidf_transformer.pkl', 'rb') as f:
            tfidf_transformer = cPickle.load(f)

        with open(self.data_path + '/tree.pkl', 'rb') as f:
            tree = cPickle.load(f)

        return categories, category_map, clf, count_vect, tfidf_transformer, tree
Project: text-summarizer    Author: gaetangate
def get_topic_idf(self, sentences):
        vectorizer = CountVectorizer()
        sent_word_matrix = vectorizer.fit_transform(sentences)

        transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
        tfidf = transformer.fit_transform(sent_word_matrix)
        tfidf = tfidf.toarray()

        centroid_vector = tfidf.sum(0)
        centroid_vector = np.divide(centroid_vector, centroid_vector.max())
        # print(centroid_vector.max())

        feature_names = vectorizer.get_feature_names()
        word_list = []
        for i in range(centroid_vector.shape[0]):
            if centroid_vector[i] > self.topic_threshold:
                # print(feature_names[i], centroid_vector[i])
                word_list.append(feature_names[i])

        return word_list
Project: Parallel-SGD    Author: angadgill
def test_tf_idf_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # this is robust to features with only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())
Project: kindred    Author: jakelever
def _vectorize(self,corpus,fit):
        assert isinstance(corpus,kindred.Corpus)

        matrices = []
        for feature in self.chosenFeatures:
            assert feature in self.featureInfo.keys()
            featureFunction = self.featureInfo[feature]['func']
            never_tfidf = self.featureInfo[feature]['never_tfidf']
            data = featureFunction(corpus)
            notEmpty = any( len(d)>0 for d in data )
            if fit:
                if notEmpty:
                    self.dictVectorizers[feature] = DictVectorizer()
                    if self.tfidf and not never_tfidf:
                        self.tfidfTransformers[feature] = TfidfTransformer()
                        intermediate = self.dictVectorizers[feature].fit_transform(data)
                        matrices.append(self.tfidfTransformers[feature].fit_transform(intermediate))
                    else:
                        matrices.append(self.dictVectorizers[feature].fit_transform(data))
            else:
                if feature in self.dictVectorizers:
                    if self.tfidf and not never_tfidf:
                        intermediate = self.dictVectorizers[feature].transform(data)
                        matrices.append(self.tfidfTransformers[feature].transform(intermediate))
                    else:
                        matrices.append(self.dictVectorizers[feature].transform(data))

        mergedMatrix = hstack(matrices)
        return mergedMatrix
Project: textar    Author: datosgobar
def __init__(self, texts, ids, vocabulary=None, encoding='utf-8'):
        """Definido en la declaracion de la clase.

        Attributes:
            texts (list of str): Textos a clasificar.
            ids (list of str): Identificadores únicos para cada texto (debe
                tener la misma longitud que `texts`).
            vocabulary (list): Opcional. Vocabulario a tener en cuenta para la
                vectorización de los textos. Default: usa todas las palabras
                presentes en los textos, salvo los ES_stopwords.txt.
            encoding (str): Codificación de los textos en `texts` y en `ids`.
        """
        this_dir, this_filename = os.path.split(__file__)
        es_stopwords = pd.read_csv(os.path.join(this_dir, 'ES_stopwords.txt'),
                                   header=None, encoding='utf-8')
        es_stopwords = list(np.squeeze(es_stopwords.values))
        self._check_id_length(ids)
        self.vectorizer = CountVectorizer(
            input='content', encoding=encoding, decode_error='strict',
            strip_accents='ascii', lowercase=True, preprocessor=None,
            tokenizer=None, stop_words=es_stopwords, ngram_range=(1, 1),
            analyzer='word', max_df=0.8, min_df=1, max_features=None,
            vocabulary=vocabulary, binary=False)

        self.transformer = TfidfTransformer()
        self.ids = None  # Keeps an ordered list of text ids.
        self.term_mat = None  # Term-count matrix of the texts.
        self.tfidf_mat = None  # Term-relevance (tf-idf) matrix.
        self.reload_texts(texts, ids)
Project: base_function    Author: Rockyzsu
def case1():
    from sklearn import datasets
    news = datasets.fetch_20newsgroups(subset='all')
    # print len(news.data)
    # print len(news.target)

    # print '*'*10
    # print news.data[0]
    # print '*'*10
    # print news.target[0]
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    vec = CountVectorizer()
    x = vec.fit_transform(news.data)
    # print x.shape
    # print x[:2]
    print x[:10,:10].toarray()
    TFIDF = TfidfTransformer()
    x_tfidf = TFIDF.fit_transform(x)
    print x_tfidf[:10,:10].toarray()


    from sklearn.cross_validation import train_test_split
    Xtrain, Xtest, ytrain, ytest = train_test_split(x, news.target, test_size=0.3, random_state=233)

    tf_Xtrain, tf_Xtest, tf_ytrain, tf_ytest = train_test_split(x_tfidf, news.target, test_size=0.3, random_state=233)


    from sklearn.naive_bayes import MultinomialNB
    mnb = MultinomialNB()
    tf_mnb = MultinomialNB()

    mnb.fit(Xtrain, ytrain)
    tf_mnb.fit(tf_Xtrain, tf_ytrain)
Project: text-classification    Author: cahya-wirawan
def fit(self, dataset, filename):
        self.logger.debug("fit")
        self.clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
                             ])
        self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
        joblib.dump(self.clf, filename + ".pkl", compress=9)
Project: text-classification    Author: cahya-wirawan
def fit(self, dataset, filename):
        self.logger.debug("fit")
        self.clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', MultinomialNB())
                             ])
        self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
        joblib.dump(self.clf, filename + ".pkl", compress=9)
Project: AlphaPy    Author: ScottFreeLLC
def cvectorize(f, c, n):
    r"""Use the Count Vectorizer and TF-IDF Transformer.

    Parameters
    ----------
    f : pandas.DataFrame
        Dataframe containing the column ``c``.
    c : str
        Name of the text column in the dataframe ``f``.
    n : int
        The maximum size of the n-grams.

    Returns
    -------
    new_features : sparse matrix
        The transformed features.

    References
    ----------
    To use count vectorization and TF-IDF, you can find more
    information here [TFE]_.

    .. [TFE] http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

    """
    fc = f[c]
    fc.fillna(BSEP, inplace=True)
    cvect = CountVectorizer(ngram_range=[1, n], analyzer='char')
    cfeat = cvect.fit_transform(fc)
    tfidf_transformer = TfidfTransformer()
    new_features = tfidf_transformer.fit_transform(cfeat).toarray()
    return new_features
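
A hypothetical call to cvectorize, assuming a DataFrame with a text column named 'desc' and BSEP defined as a single space (both are assumptions; BSEP is not defined in this excerpt):

import pandas as pd

BSEP = ' '  # assumed: blank separator used to fill missing text
df = pd.DataFrame({'desc': ['red shoes', 'blue shoes', None]})
features = cvectorize(df, 'desc', 3)  # char 1- to 3-grams, tf-idf weighted
print(features.shape)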


#
# Function apply_treatment
#
Project: vec4ir    Author: lgalke
def __init__(self, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False, **kwargs):
        self.tfidf = TfidfTransformer(norm=norm, use_idf=use_idf,
                                      smooth_idf=smooth_idf,
                                      sublinear_tf=sublinear_tf)

        # override defaults since we need the counts here
        self.verbose = kwargs.get('verbose', 0)

        binary = kwargs.pop('binary', False)
        dtype = kwargs.pop('dtype', np.int64)

        # pass remaining args to countvectorizer
        self._init_params(name="TFIDF", binary=binary, dtype=dtype, **kwargs)
Project: DataScience-And-MachineLearning-Handbook-For-Coders    Author: wxyyxc1992
def predict(self, docs):
        """
        Predict the categories of the given documents.
        """

        X_new_counts = self.count_vect.transform(docs)

        # note: the IDF is re-fitted on the incoming documents here
        tfidf_transformer = TfidfTransformer().fit(X_new_counts)

        X_new_tfidf = tfidf_transformer.transform(X_new_counts)

        return self.clf.predict(X_new_tfidf)
Project: MachineLearningProject    Author: ymynem
def normalize(counts):
    transformer = TfidfTransformer(smooth_idf=1)
    return transformer.fit_transform(counts).toarray()
Project: semeval2016-task4    Author: aesuli
def main():
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()

    data = read_semeval_regression(args.input, encoding='windows-1252')

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    test = read_test_data(args.test, encoding='windows-1252')

    regressor = pipeline.fit(data[0], data[1])

    y = regressor.predict(test[2])

    with open('%sc%f-k%i-C.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as outfile:
        for id_, topic, rate in zip(test[0], test[1], y):
            print(id_, topic, rate, sep='\t', file=outfile)
Project: opentc    Author: cahya-wirawan
def fit(self, dataset, filename):
        self.logger.debug("fit")
        self.clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', MultinomialNB())
                             ])
        self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
        joblib.dump(self.clf, filename + ".pkl", compress=9)
Project: opentc    Author: cahya-wirawan
def fit(self, dataset, filename):
        self.logger.debug("fit")
        self.clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', SGDClassifier(loss='log', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
                             ])
        self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
        joblib.dump(self.clf, filename + ".pkl", compress=9)
Project: Emotion-Identification    Author: saopayne
def feature():
    global termcount
    dataMatrix = np.genfromtxt(finaltrial, delimiter='|', dtype=None, skip_header=True)
    terms = []
    n = dataMatrix.size
    for row in dataMatrix:
        row[0] = row[0].lower().decode('UTF-8')
        temp = row[0].decode('UTF-8').replace(' ', '+')
        temp = (get.urlopen("http://localhost:5095/parser?sentence=" + temp).read()).decode('UTF-8')
        terms.extend([x.split('/')[0] for x in temp.split(' ') if
                      x.split('/')[1] == 'JJ' or x.split('/')[1].startswith('VB')])
        tfidf(temp)
    s = sum(list(termcount.values()))
    termcount = {x: (y * 100 / s) for x, y in zip(termcount.keys(), termcount.values())}
    # terms.extend([x for x in termcount.keys()])
    terms = list(set(terms))
    stop = open('stop.csv', 'r').read().splitlines()
    terms = [x for x in terms if x not in stop]
    l = len(terms)
    occurence = np.zeros((n, l), dtype=np.int)
    d = 0
    for row in dataMatrix:
        temp = row[0].decode('UTF-8').split(' ')
        for i in range(l):
            if terms[i] in temp:
                occurence[d][i] += 1
        d += 1
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(occurence)
    occurence = tfidf.toarray()


    np.savetxt('occurence.csv',occurence,delimiter=',')
    return occurence, dataMatrix, terms
Project: magic    Author: pan-webis-de
def avg_spelling_error(lang=None):
    pipeline = Pipeline([('feature', SpellingError(language=lang)),
                         ('tfidf', TfidfTransformer(sublinear_tf=False)),
                         ('scale', Normalizer())])
    return ('avg_spelling_error', pipeline)
Project: magic    Author: pan-webis-de
def punctuation_features():
    pipeline = Pipeline([('feature', PunctuationFeatures()),
                         ('tfidf', TfidfTransformer(sublinear_tf=False)),
                         ('scale', Normalizer())])
    return ('punctuation_features', pipeline)
Project: magic    Author: pan-webis-de
def word_bigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    pipeline = Pipeline([('vect', CountVectorizer(preprocessor=preprocessor,
                                                  ngram_range=(2, 2))),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_bigrams', pipeline)
Project: magic    Author: pan-webis-de
def char_ngrams():
    vectorizer = CountVectorizer(min_df=1,
                                 preprocessor=TextCleaner(filter_urls=True,
                                                          filter_mentions=True,
                                                          filter_hashtags=True,
                                                          lowercase=False),
                                 analyzer='char_wb',
                                 ngram_range=(4, 4))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('char_ngrams', pipeline)
Project: Graduation-design    Author: Baichenjia
def TFIDF_result():
    str_handel_list = read_handel_list()   # read the 30 preprocessed texts; each is a whitespace-joined str of segmented words
    str_test = read_test_list()  # read the test text, also as a whitespace-joined str
    # Build the TF-IDF corpus
    corpus = str_handel_list[:]  # corpus for TF-IDF
    corpus.append(str_test)    # append the test text at the end of the corpus
    print "TF-IDF corpus built successfully..."
    ######################### Compute TF-IDF weights with scikit-learn
    # Converts the corpus into a term-count matrix; a[i][j] is the count of word j in document i
    vectorizer = CountVectorizer()
    # Computes the tf-idf weight of every word
    transformer = TfidfTransformer()
    # The outer fit_transform computes tf-idf, the inner one builds the count matrix
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # All words in the bag-of-words model
    word = vectorizer.get_feature_names()
    # The tf-idf matrix; a[i][j] is the tf-idf weight of word j in document i
    weight = tfidf.toarray()
    print "TF-IDF scores calculated successfully..."
    # Collect the TF-IDF weights of the test text (row 30, the last row of the corpus)
    results = []
    for j in range(len(word)):
        if word[j] == '??' or word[j] == '??' or len(word[j]) == 1:  # skip two specific words (original literals lost to encoding) and single-character terms
            continue
        results.append((word[j], weight[30][j]))  # keep the word together with its weight
    sorted_results = sorted(results, key=lambda result: result[1], reverse=True)   # sort by weight, descending
    # Write the top-100 words by TF-IDF weight to a file
    fp_tfidf_result = open("f://emotion/mysite/Label_extract/result_tfidf.txt", 'w+')
    tfidf_results = []
    for i in range(100):   # keep the 100 highest-weighted words and their weights
        tfidf_results.append((sorted_results[i][0], sorted_results[i][1]))
        fp_tfidf_result.write(sorted_results[i][0] + ' ' + str(round(sorted_results[i][1], 10)))
        fp_tfidf_result.write('\n')
    fp_tfidf_result.close()
    return tfidf_results
Project: scattertext    Author: JasonKessler
def _fit_tfidf_model(self, category, clf):
        y = self._get_mask_from_category(category)
        y_continuous = self._get_continuous_version_boolean_y(y)
        X = TfidfTransformer().fit_transform(self._X)
        clf.fit(X, y_continuous)
Project: Trendster    Author: rawanhassunah
def fit_tfidf(count_vector):
    '''
    Fits a term frequency matrix on a count vector.
    '''
    tfidf_vector = TfidfTransformer(use_idf=False).fit(count_vector)
    return tfidf_vector
Project: Trendster    Author: rawanhassunah
def fit_tfidf(count_vector):
    '''
    Transforms a count vector into a tf vector.
    TF: count vector normalized on length of docs.
    '''
    tfidf = TfidfTransformer(use_idf=False)
    tfidf_vector = tfidf.fit(count_vector)
    return tfidf_vector
Project: Trendster    Author: rawanhassunah
def fit_tfidf(count_vector):
    tfidf = TfidfTransformer(use_idf=False)
    tfidf_vector = tfidf.fit(count_vector)
    return tfidf_vector
Project: IAAT    Author: rfrugte
def train_sgdc(training_list):
    footnotes = []
    cate = []
    for i in training_list:
        footnotes.append(i[0])
        cate.append(i[1])
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42))])
    _ = text_clf.fit(footnotes,cate)
    return text_clf
Project: context_predictive_words    Author: Cogitans
def parseToBOW():
    vectorizer = CountVectorizer(min_df=1)
    texts = pickle.load(open(OUTFILE, 'rb'))[0]
    tdm = vectorizer.fit_transform(texts)
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(tdm)
    f = open(DATASET_PATH + "BOW.p", "wb")
    pickle.dump(tdm, f)
    f.close()
    f = open(DATASET_PATH + "BOW_TDIDF.p", "wb")
    pickle.dump(tfidf, f)
    f.close()
Project: text-analytics-with-python    Author: dipanjanS
def tfidf_transformer(bow_matrix):

    transformer = TfidfTransformer(norm='l2',
                                   smooth_idf=True,
                                   use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix
Project: event-cui-transfer    Author: mit-ddig
def transformTFIDF(X_train_all, X_test_all):
    """Transform bag-of-events using TF-IDF.

    Arguments
    ---------
    X_train_all: pandas DataFrame
    X_test_all: pandas DataFrame

    Returns
    -------
    X_train_t: CSR matrix
    X_test_t: CSR matrix
    """

    tfidf_t = TfidfTransformer(norm='l2',
                               use_idf=True,
                               sublinear_tf=True,
                               smooth_idf=True)
    X_train = scipy.sparse.csr_matrix(X_train_all)
    X_test = scipy.sparse.csr_matrix(X_test_all)
    # Fit TFIDF using training data.
    tfidf_t.fit(X_train)
    # Transform both training and test data.
    X_train_t = tfidf_t.transform(X_train)
    X_test_t = tfidf_t.transform(X_test)
    return X_train_t, X_test_t
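
A quick usage sketch of transformTFIDF with toy bag-of-events DataFrames (the data is illustrative only):

import pandas as pd

X_train_all = pd.DataFrame([[2, 0, 1], [0, 3, 1]])
X_test_all = pd.DataFrame([[1, 1, 0]])
X_train_t, X_test_t = transformTFIDF(X_train_all, X_test_all)
print(X_train_t.shape, X_test_t.shape)  # (2, 3) (1, 3)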
Project: django_text_classifier    Author: django-text-classifier
def get_pipeline(name):
    x = TrainingSet.objects.filter(classifier=name).values_list('body',
                                                                flat=True)
    y = TrainingSet.objects.filter(classifier=name).values_list('target',
                                                                flat=True)
    pipeline = Pipeline([
         ('vector', CountVectorizer()),
         ('transform', TfidfTransformer()),
         ('bayes', MultinomialNB())
    ])

    pipeline.fit(x, y)

    return pipeline
Project: OpinionMining728    Author: stasi009
def test_one_rf():
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print "training data loaded"
    print_label_frequency(ytrain_raw)

    ############# create the pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=lambda x:x,max_features=3000)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(n_estimators=500,
                                      max_depth=200,
                                      min_samples_split=10,
                                      oob_score=True,
                                      n_jobs=-1,verbose=1,class_weight='balanced')),
    ])

    ############# train
    pipeline.fit(Xtrain_raw,ytrain_raw)

    ############# check result
    rf = pipeline.steps[-1][1]
    print rf.oob_score_

    ############# training error
    ytrain_predict = pipeline.predict(Xtrain_raw)
    print classification_report(y_true=ytrain_raw,y_pred=ytrain_predict)
    print confusion_matrix(y_true=ytrain_raw,y_pred=ytrain_predict)

    ############# testing error
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = pipeline.predict(Xtest_raw)
    print accuracy_score(y_true=ytest_raw,y_pred=ytest_predict)
    print classification_report(y_true=ytest_raw,y_pred=ytest_predict)
Project: nlp    Author: lhyxcxy
def kmeans(class_num):
    """
    K-means clustering.
    :param class_num: number of clusters
    :return: class_list, e.g. [[sentence1, sentence2], [sentence3, sentence4]]
    """
    class_list = list()
    sentences_words, sentences = loadFile()
    vectorizer = CountVectorizer()  # converts the corpus into a term-count matrix; a[i][j] is the count of word j in sentence i
    transformer = TfidfTransformer()  # computes the tf-idf weight of every word
    # The outer fit_transform computes tf-idf, the inner one builds the count matrix.
    # sentences_words is a list of whitespace-joined, segmented sentences.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(sentences_words))

    # weight has shape [n_sentences, n_words]
    weight = tfidf.toarray()  # the tf-idf matrix; a[i][j] is the tf-idf weight of word j in sentence i
    clf = KMeans(n_clusters=class_num)
    s = clf.fit(weight)
    for i in range(class_num):
        class_list.append(list())
    print clf.labels_
    for i in range(len(clf.labels_)):  # clf.labels_ holds the cluster label of each sentence, e.g. [1,3,2,5,0,3,5,4,1]
        class_label = clf.labels_[i]
        class_list[class_label].append(sentences[i])
        #print "####### cluster " + str(clf.labels_[i]) + ": " + sentences_words[i]
    return class_list
Project: LLString    Author: mitll
def __init__(self, min_df=2, norm="l2"):
        """ Constructor """
        self.cv = CountVectorizer(min_df=min_df)
        self.tfidf = TfidfTransformer(norm=norm)

        self.LOG_IDF = None
        self.CORPUS_VOCAB = None
        self.OOV_IDF_VAL = 0 #min idf value to assign for out-of-vocabulary terms

        self.IDF_MODEL = dict()
Project: LLString    Author: mitll
def compute_query_idf(self,corpus):
        """ Compute IDF from s and t in case you have no externally computed IDF to use """
        cv = CountVectorizer(min_df=0.0)
        cv.fit_transform(corpus)
        self.logger.debug(cv.vocabulary_)
        freq_term_matrix = cv.transform(corpus)
        tfidf = TfidfTransformer(norm="l2")
        tfidf.fit(freq_term_matrix)
        log_idf = tfidf.idf_
        self.LOG_IDF = log_idf
        self.CORPUS_VOCAB = cv.vocabulary_
Project: vae_sparse    Author: rahulk90
def getTF(dataset):
    tfidf = TfidfTransformer(norm=None)
    tfidf.fit(dataset['train'])
    return tfidf.idf_
Project: vae_sparse    Author: rahulk90
def getTF(dataset):
    tfidf = TfidfTransformer(norm=None)
    tfidf.fit(dataset['train'])
    return tfidf.idf_
Project: MorphoBabushka    Author: nvanva
def tfidf_pipeline(df, ngram_range, lowercase, binary, min_df=2, max_df=1.0, caps_features=False, pos_features=False, clf=LinearSVC()):
    return Pipeline([
        ('mapper', mapper(df, ngram_range, lowercase, binary, min_df, max_df, caps_features, pos_features)),
        ('scaler', TfidfTransformer()),
        ('clf', clf),
    ])
Project: DRM    Author: JohnZhengHub
def file2mat(filename):
    transformer = TfidfTransformer()
    vectorizer = CountVectorizer(min_df=1, ngram_range=(1,1))
    data = load(filename)
    reviews = [each_data['review'] for each_data in data]
    bag_of_word = vectorizer.fit_transform(reviews)
    tfidf = transformer.fit_transform(bag_of_word)

    aspect_label = collect_aspect_label(data)
    rating_label = collect_rating_label(data)
    return tfidf, aspect_label, rating_label
