Python sklearn.feature_extraction.text module: TfidfVectorizer() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use sklearn.feature_extraction.text.TfidfVectorizer().
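Before the project examples, here is a minimal, self-contained sketch of the basic TfidfVectorizer workflow: fit the vocabulary and IDF weights on a corpus with fit_transform(), then encode new text with transform(). The toy corpus and variable names below are illustrative only and do not come from any of the projects listed.

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs can be friends",
]

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
X = vectorizer.fit_transform(corpus)      # sparse matrix of shape (n_documents, n_terms)
print(vectorizer.get_feature_names())     # learned vocabulary (get_feature_names_out() in newer scikit-learn)
print(X.shape)

# Encode unseen documents with the vocabulary learned above
X_new = vectorizer.transform(["a cat chased a dog"])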

Project: ml-projects    Author: saopayne    | Project source | File source
def represent(documents):

    train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
    test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))

    train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
    test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

    # Tokenization
    vectorizer = TfidfVectorizer(tokenizer=tokenize)

    # Learn and transform train documents
    vectorised_train_documents = vectorizer.fit_transform(train_docs)
    vectorised_test_documents = vectorizer.transform(test_docs)

    # Transform multilabel labels
    mlb = MultiLabelBinarizer()
    train_labels = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train_docs_id])
    test_labels = mlb.transform([reuters.categories(doc_id) for doc_id in test_docs_id])

    return vectorised_train_documents, train_labels, vectorised_test_documents, test_labels
Project: newsrecommender    Author: Newsrecommender    | Project source | File source
def get_vectorizer(self, ngram_range=(1, 3), min_df=2, max_df=1.0):
        """
        Define a vectorizer using n-grams and min/max document frequency: a binary CountVectorizer
        for Feature Presence ('FP'), or a TfidfVectorizer for 'TF-IDF' weighting.
        :param ngram_range: n-grams are created for all n within this range
        :param min_df: min document frequency of features
        :param max_df: max document frequency of features
        :return:
        """
        if self.is_weight == 'FP':#Feature Presence
            vectorizer = CountVectorizer(ngram_range=ngram_range,
                                         tokenizer=self.tokenize,
                                         min_df=min_df,
                                         max_df=max_df,
                                         binary=True,
                                         stop_words='english')

        if self.is_weight == 'TF-IDF':  # TF-IDF weighting
            vectorizer = TfidfVectorizer(ngram_range=ngram_range,
                                         tokenizer=self.tokenize,
                                         min_df=min_df,
                                         max_df=max_df,
                                         binary=True,
                                         stop_words='english')
        return vectorizer
Project: search_relevance    Author: rmanak    | Project source | File source
def getTFV(token_pattern = token_pattern,
           norm = tfidf__norm,
           max_df = tfidf__max_df,
           min_df = tfidf__min_df,
           ngram_range = (1, 1),
           vocabulary = None,
           stop_words = 'english'):
    tfv =TfidfVectorizer(min_df=min_df, max_df=max_df, max_features=None, 
                         strip_accents='unicode', analyzer='word', 
                         token_pattern=token_pattern,
                         ngram_range=ngram_range, use_idf=True, 
                         smooth_idf=True, sublinear_tf=True,
                         stop_words = stop_words, norm=norm, vocabulary=vocabulary)
    return tfv   


#========= CountVectorizer =========#
Project: auto_ml    Author: doordash    | Project source | File source
def __init__(self, column_descriptions=None):
        self.column_descriptions = column_descriptions
        self.text_col_indicators = set(['text', 'nlp'])

        self.text_columns = {}
        for key, val in self.column_descriptions.items():
            if val in self.text_col_indicators:
                self.text_columns[key] = TfidfVectorizer(
                    # If we have any documents that cannot be decoded properly, just ignore them and keep going as planned with everything else
                    decode_error='ignore'
                    # Try to strip accents from characters. Using unicode is slightly slower but more comprehensive than 'ascii'
                    , strip_accents='unicode'
                    # Can also choose 'char', which will likely increase accuracy, at the cost of much more space, generally
                    , analyzer='word'
                    # Remove commonly found english words ('it', 'a', 'the') which do not typically contain much signal
                    , stop_words='english'
                    # Convert all characters to lowercase
                    , lowercase=True
                    # Only consider words that appear in fewer than max_df percent of all documents
                    # In this case, ignore all words that appear in more than 90% of all documents
                    , max_df=0.9
                    # Consider only the most frequently occurring 3000 words, after taking into account all the other filtering going on
                    , max_features=3000
                )
Project: glassdoor-analysis    Author: THEdavehogue    | Project source | File source
def fit_tfidf(self, df):
        '''
        Function to fit a TF-IDF matrix to a corpus of text

        INPUT:
            df: DataFrame with a 'lemmatized_text' column to analyze
        '''
        self.tfidf = TfidfVectorizer(input='content',
                                     use_idf=True,
                                     lowercase=True,
                                     max_features=self.tfidf_max_features,
                                     max_df=self.tfidf_max_df,
                                     min_df=self.tfidf_min_df)
        self.tfidf_matrix = self.tfidf.fit_transform(
            df['lemmatized_text']).toarray()
        self.tfidf_features = np.array(self.tfidf.get_feature_names())
        self.tfidf_reverse_lookup = {
            word: idx for idx, word in enumerate(self.tfidf_features)}
Project: deeppavlov    Author: deepmipt    | Project source | File source
def create_vectorizer_selector(train_data, train_labels, model_file,
                               ngram_list=[1], max_num_features_list=[100],
                               analyzer_type_list=['word']):
    """Call creation and save of vectorizers and selectors including special cases.

    Args:
        train_data: list of train text samples
        train_labels:  list of train labels
        model_file: model filename
        ngram_list: list of ranges of n-grams
        max_num_features_list: list of maximum number of features to select
        analyzer_type_list: list of analyzer types for TfidfVectorizer 'word' or 'char'

    Returns:
        nothing
    """
    for i in range(len(ngram_list)):
        ngrams_selection(train_data, train_labels, 'general_' + str(i), model_file,
                         ngram_range_=(ngram_list[i], ngram_list[i]),
                         max_num_features=max_num_features_list[i],
                         analyzer_type=analyzer_type_list[i])
    you_are_data = ngrams_you_are(train_data)
    ngrams_selection(you_are_data, train_labels, 'special', model_file,
                     ngram_range_=(1,1), max_num_features=100)
    return
Project: sef    Author: passalis    | Project source | File source
def load_20ng_dataset_bow():
    """
    Loads the 20NG dataset
    :return:
    """

    newsgroups_train = fetch_20newsgroups(subset='train')
    newsgroups_test = fetch_20newsgroups(subset='test')

    # Convert data to tf-idf

    vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.95)
    train_data = vectorizer.fit_transform(newsgroups_train.data)
    test_data = vectorizer.transform(newsgroups_test.data)
    train_data = train_data.todense()
    test_data = test_data.todense()
    train_labels = newsgroups_train.target
    test_labels = newsgroups_test.target

    return train_data, train_labels, test_data, test_labels
Project: auto_ml    Author: doordash    | Project source | File source
def fit(self, X_df, y=None):

        # See if we should fit TfidfVectorizer or not
        for key in X_df.columns:

            if key in self.text_columns:
                X_df[key].fillna('nan', inplace=True)
                text_col = X_df[key].astype(str, raise_on_error=False)
                self.text_columns[key].fit(text_col)

                col_names = self.text_columns[key].get_feature_names()

                # Make weird characters play nice, or just ignore them :)
                for idx, word in enumerate(col_names):
                    try:
                        col_names[idx] = str(word)
                    except:
                        col_names[idx] = 'non_ascii_word_' + str(idx)

                col_names = ['nlp_' + key + '_' + str(word) for word in col_names]

                self.text_columns[key].cleaned_feature_names = col_names

        return self
Project: probablyPOTUS    Author: jjardel    | Project source | File source
def train(self, train_size=0.8, k_folds=5):

        # retrieve data from DB and pre-process
        self._get_data()

        # perform train/test split
        self._get_train_test_split(train_size=train_size)

        # define text pre-processing pipeline
        text_pipeline = Pipeline([
            ('extract_text', DFColumnExtractor(TEXT_FEATURES)),
            ('vect', TfidfVectorizer(tokenizer=twitter_tokenizer))
        ])

        # define pipeline for pre-processing of numeric features
        numeric_pipeline = Pipeline([
            ('extract_nums', DFColumnExtractor(NON_TEXT_FEATURES)),
            ('scaler', MinMaxScaler())
        ])

        # combine both steps into a single pipeline
        pipeline = Pipeline([
            ('features', FeatureUnion([
                ('text_processing', text_pipeline),
                ('num_processing', numeric_pipeline)
            ])),
            ('clf', self._estimator)
        ])

        self.logger.info('Fitting model hyperparameters with {0}-fold CV'.format(k_folds))
        gs = GridSearchCV(pipeline, self.params, n_jobs=-1, cv=k_folds)

        X = self.data.iloc[self.train_inds_, :]
        y = self.data[LABEL].values[self.train_inds_]

        gs.fit(X, y)

        self.logger.info('Validation set accuracy is {0}'.format(gs.best_score_))

        self.gs_ = gs
        self.model_ = gs.best_estimator_
Project: geomdn    Author: afshinrahimi    | Project source | File source
def tfidf(self):
        #keep both hashtags and mentions
        #token_pattern=r'(?u)@?#?\b\w\w+\b'
        #remove hashtags and mentions
        #token_pattern = r'(?u)(?<![#@])\b\w+\b'
        #just remove mentions and remove hashsign from hashtags
        #token_pattern = r'(?u)(?<![@])\b\w+\b'
        #remove mentions but keep hashtags with their sign
        #token_pattern = r'(?u)(?<![@])#?\b\w\w+\b'
        #remove multiple occurrences of a character after 2 times: yesss => yess
        #re.sub(r"(.)\1+", r"\1\1", s)
        self.vectorizer = TfidfVectorizer(tokenizer=self.tokenizer, token_pattern=self.token_pattern, use_idf=self.idf, 
                                    norm=self.norm, binary=self.btf, sublinear_tf=self.subtf, 
                                    min_df=self.mindf, max_df=self.maxdf, ngram_range=(1, 1), stop_words=self.stops, 
                                     vocabulary=self.vocab, encoding=self.encoding, dtype='float32')
        logging.info(self.vectorizer)
        self.X_train = self.vectorizer.fit_transform(self.df_train.text.values)
        self.X_dev = self.vectorizer.transform(self.df_dev.text.values)
        self.X_test = self.vectorizer.transform(self.df_test.text.values)
        logging.info("training    n_samples: %d, n_features: %d" % self.X_train.shape)
        logging.info("development n_samples: %d, n_features: %d" % self.X_dev.shape)
        logging.info("test        n_samples: %d, n_features: %d" % self.X_test.shape)
Project: PPRE    Author: MaoYuwei    | Project source | File source
def loadDataset():
    '''Load the dataset from df_vec.csv and split it into features X and labels y.'''
    df = pd.read_csv('df_vec.csv')
    # print df.shape
    X = np.array(df.iloc[:, 1:])
    y = np.array(df.iloc[:, 0])
    # print y
    # bet_list = list(df.iloc[:, 0])
    # dataset = []
    # for bet in bet_list:
    #     s, bet = bet.split(':')
    #     dataset.append(bet)

    # print dataset
    # print X
    # print y
    return X, y


# def transform(dataset, n_features=1000):
#     vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features, min_df=2, use_idf=True)
#     X = vectorizer.fit_transform(dataset)
#     print X
#     # print vectorizer
#     return X, vectorizer
Project: UrbanSearch    Author: urbansearchTUD    | Project source | File source
def get_binary(self):
        return Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
            ('feat_select', SelectPercentile(percentile=10)),
            ('clf', OneVsRestClassifier(SGDClassifier(alpha=0.0001,
                                                      average=False,
                                                      class_weight=None,
                                                      epsilon=0.1,
                                                      eta0=0.0,
                                                      fit_intercept=True,
                                                      l1_ratio=0.15,
                                                      learning_rate='optimal',
                                                      loss='log',
                                                      n_iter=10,
                                                      n_jobs=1,
                                                      penalty='l2',
                                                      power_t=0.5,
                                                      random_state=None,
                                                      shuffle=True,
                                                      verbose=0,
                                                      warm_start=False
            )))
        ])
Project: UrbanSearch    Author: urbansearchTUD    | Project source | File source
def get_sgdc(self):
        return Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
            ('feat_select', SelectPercentile(percentile=10)),
            ('clf', SGDClassifier(alpha=0.0001,
                                  average=False,
                                  class_weight=None,
                                  epsilon=0.1,
                                  eta0=0.0,
                                  fit_intercept=True,
                                  l1_ratio=0.15,
                                  learning_rate='optimal',
                                  loss='log',
                                  n_iter=10,
                                  n_jobs=1,
                                  penalty='l2',
                                  power_t=0.5,
                                  random_state=None,
                                  shuffle=True,
                                  verbose=0,
                                  warm_start=False))
        ])
Project: StrepHit    Author: Wikidata    | Project source | File source
def get_similarity_scores(verb_token, vectorizer, tf_idf_matrix):
    """ Compute the cosine similarity score of a given verb token against the input corpus TF/IDF matrix.

        :param str verb_token: Surface form of a verb, e.g., *born*
        :param sklearn.feature_extraction.text.TfidfVectorizer vectorizer: Vectorizer
         used to transform verbs into vectors
        :return: cosine similarity score
        :rtype: ndarray
    """
    verb_token_vector = vectorizer.transform([verb_token])
    # Here the linear kernel is the same as the cosine similarity, but faster
    # cf. http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
    scores = linear_kernel(verb_token_vector, tf_idf_matrix)
    logger.debug("Corpus-wide TF/IDF scores for '%s': %s" % (verb_token, scores))
    logger.debug("Average TF/IDF score for '%s': %f" % (verb_token, average(scores)))
    return scores
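The comment in get_similarity_scores() relies on the fact that TfidfVectorizer L2-normalizes its rows by default (norm='l2'), so the plain dot product computed by linear_kernel already equals the cosine similarity while skipping an extra normalization pass. A small sketch verifying this on toy documents (not part of StrepHit):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

docs = ["the dog barks", "the cat meows", "dogs and cats"]
X = TfidfVectorizer().fit_transform(docs)   # rows are L2-normalized by default

# For unit-length rows, the dot product equals the cosine similarity
assert np.allclose(linear_kernel(X, X), cosine_similarity(X, X))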
Project: topic-ensemble    Author: derekgreene    | Project source | File source
def preprocess_simple( docs, stopwords, min_df = 3, min_term_length = 2, ngram_range = (1,1), apply_tfidf = True, apply_norm = True ):
    """
    Preprocess a list containing text documents stored as strings, where the documents have already been tokenized and are separated by whitespace
    """
    token_pattern = re.compile(r"[\s\-]+", re.U)

    def custom_tokenizer( s ):
        return [x.lower() for x in token_pattern.split(s) if (len(x) >= min_term_length) ]

    # Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
    if apply_norm:
        norm_function = "l2"
    else:
        norm_function = None
    tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode", tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function, min_df = min_df, ngram_range = ngram_range) 
    X = tfidf.fit_transform(docs)
    terms = []
    # store the vocabulary map
    v = tfidf.vocabulary_
    for i in range(len(v)):
        terms.append("")
    for term in v.keys():
        terms[ v[term] ] = term
    return (X,terms)
Project: texta    Author: texta-tk    | Project source | File source
def _vectorize_documents(self,method='tfidf',max_features=100):
        stop_words = []

        try:
            for lexicon_id in self.params['cluster_lexicons']:
                lexicon = Lexicon.objects.get(id=int(lexicon_id))
                words = Word.objects.filter(lexicon=lexicon)
                stop_words+=[word.wrd for word in words]
        except KeyError:
            pass

        if method == 'count':
            vectorizer = CountVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)
        if method == 'tfidf':
            vectorizer = TfidfVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)

        document_vectors = vectorizer.fit_transform(self.documents)
        document_vectors = document_vectors.toarray()

        return document_vectors,vectorizer.get_feature_names()
Project: hugo_similar_posts    Author: elbaulp    | Project source | File source
def generateTfIdfVectorizer(data, stop='english', max_df=0.08, min_df=8):
    tokenizer = tokenizer_snowball if stop != 'english' else tokenizer_porter

    tfidf = TfidfVectorizer(strip_accents=None,
                            max_df=max_df,
                            min_df=min_df,
                            lowercase=True,
                            stop_words=stop,
                            sublinear_tf=True,
                            tokenizer=tokenizer,
                            analyzer='word',
                            max_features=16,
                            preprocessor=preprocessor)
    X = tfidf.fit_transform(data)
    print('%d Features: %s' %
          (len(tfidf.get_feature_names()), tfidf.get_feature_names()))

    return X
Project: hugo_similar_posts    Author: elbaulp    | Project source | File source
def gridSearch(data, params, true_k):

    tfidf = TfidfVectorizer(strip_accents=None,
                            lowercase=True,
                            sublinear_tf=True,
                            analyzer='word')

    lr_tfidf = Pipeline([('vect', tfidf),
                         ('clf', KMeans(init='k-means++',
                                        n_jobs=-1,
                                        random_state=0,
                                        verbose=0))])
    gsTfIdf = GridSearchCV(
        lr_tfidf, params, n_jobs=1, verbose=1)

    gsTfIdf.fit(data)
    print()
    print("Best score: %0.3f" % gsTfIdf.best_score_)
    print("Best parameters set:")
    best_parameters = gsTfIdf.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
Project: newsrecommender    Author: Newsrecommender    | Project source | File source
def get_vectorizer(self, ngram_range=(1, 3), min_df=2, max_df=1.0):
        """
        Define a vectorizer using n-grams and min/max document frequency: a binary CountVectorizer
        for Feature Presence ('FP'), or a TfidfVectorizer for 'TF-IDF' weighting.
        :param ngram_range: n-grams are created for all n within this range
        :param min_df: min document frequency of features
        :param max_df: max document frequency of features
        :return:
        """
        if self.is_weight == 'FP':#Feature Presence
            vectorizer = CountVectorizer(ngram_range=ngram_range,
                                         tokenizer=self.tokenize,
                                         min_df=min_df,
                                         max_df=max_df,
                                         binary=True,
                                         stop_words='english')

        if self.is_weight == 'TF-IDF':  # TF-IDF weighting
            vectorizer = TfidfVectorizer(ngram_range=ngram_range,
                                         tokenizer=self.tokenize,
                                         min_df=min_df,
                                         max_df=max_df,
                                         binary=True,
                                         stop_words='english')
        return vectorizer
Project: PolBotCheck    Author: codeforfrankfurt    | Project source | File source
def get_word_clouds(tweets, users, words_n=50, lang='english'):
    default_stopwords = set(nltk.corpus.stopwords.words(lang))
    stopwords_file = '../data/stopwords.txt'
    custom_stopwords = set(open(stopwords_file, 'r').read().splitlines())
    all_stopwords = default_stopwords | custom_stopwords

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=list(all_stopwords))
    X = vectorizer.fit_transform(tweets)
    terms = vectorizer.get_feature_names()

    word_cloud_per_person = {}
    for doc in range(len(tweets)):
        feature_index = X[doc, :].nonzero()[1]
        tfidf_scores = zip(feature_index, [X[doc, x] for x in feature_index])
        doc_terms = []
        for word, score in [(terms[i], score) for (i, score) in tfidf_scores]:
            doc_terms.append((word, score))
        important_terms = [(word, score) for word, score in sorted(doc_terms, key=lambda x: x[1], reverse=True)][:words_n]
        word_cloud_per_person[users[doc]] = important_terms
    return word_cloud_per_person
Project: sfsf    Author: jorisvanzundert    | Project source | File source
def delegate_create( self, top, bottom, sample_size=1000, source=sfsf_config.EPUB ):
        top_sellers, bottom_sellers = top, bottom
        if source == sfsf_config.EPUB:
            training_data_top = self.sample_epubs( top_sellers, sample_size )
            training_data_bottom = self.sample_epubs( bottom_sellers, sample_size )
        else:
            training_data_top = self.sample_txts( top_sellers, sample_size )
            training_data_bottom = self.sample_txts( bottom_sellers, sample_size )
        training_samples_top = [ sample for training_data in training_data_top for sample in training_data[1] ]
        training_samples_bottom = [ sample for training_data in training_data_bottom for sample in training_data[1] ]
        isbns = [ training_data[0] for training_data in training_data_top for sample in training_data[1] ] + [ training_data[0] for training_data in training_data_bottom for sample in training_data[1] ]
        y_narr = numpy.array( [1] * len( training_samples_top ) + [0] * len( training_samples_bottom ) )
        vect = TfidfVectorizer( tokenizer = MorePunctuationTokenizer() )
        x_tdm = vect.fit_transform( training_samples_top + training_samples_bottom )
        print( 'Created training data', ':' )
        print( 'x shape', ':', x_tdm.shape )
        print( 'y shape', ':', y_narr.shape )
        # TODO: make a nicer return structure
        return { 'x': x_tdm, 'y': y_narr, 'vectorizer': vect, 'isbns': isbns }
Project: SentiCR    Author: senticr    | Project source | File source
def create_model_from_training_data(self):
        training_comments=[]
        training_ratings=[]
        print("Training classifier model..")
        for sentidata in self.training_data:
            comments = preprocess_text(sentidata.text)
            training_comments.append(comments)
            training_ratings.append(sentidata.rating)

        # discard stopwords, apply stemming, and discard words present in less than 3 comments
        self.vectorizer = TfidfVectorizer(tokenizer=tokenize_and_stem, sublinear_tf=True, max_df=0.5,
                                     stop_words=mystop_words, min_df=3)
        X_train = self.vectorizer.fit_transform(training_comments).toarray()
        Y_train = np.array(training_ratings)

        #Apply SMOTE to improve ratio of the minority class
        smote_model = SMOTE(ratio=0.5, random_state=None, k=None, k_neighbors=15, m=None, m_neighbors=15, out_step=.0001,
                   kind='regular', svm_estimator=None, n_jobs=1)

        X_resampled, Y_resampled=smote_model.fit_sample(X_train, Y_train)

        model=self.get_classifier()
        model.fit(X_resampled, Y_resampled)

        return model
Project: atap    Author: foxbook    | Project source | File source
def create_pipeline(estimator, reduction=False):

    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        ))
    ]

    if reduction:
        steps.append((
            'reduction', TruncatedSVD(n_components=10000)
        ))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)
Project: data_programming    Author: kep1616    | Project source | File source
def construct_tf_idf_matrix(data, store=False):

    print ("TF-IDF Normalized Matrix Construction...")

    vectorizer = TfidfVectorizer(stop_words='english')
    print(data)
    training_data = vectorizer.fit_transform(data)

    print ("Done Constructing Matrix")
    print(training_data.toarray())
    if store:
        print ("Pickling Trained Transformer...")
        pickle.dump(vectorizer, open(path_config.TRANSFORMER_PICKLING_FILE, 'wb'))
        print ("Pickling Done.")

    return training_data
Project: MLAB_Intuit    Author: rykard95    | Project source | File source
def rf_categorize(email):
    # get training corpus
    emails = []
    db = utils.get_local_db()
    for collection in db.collection_names():
        for record in db.get_collection(collection).find():
            emails.append([collection] + [record['Text']])

    # vectorize corpus
    labels = [row[0] for row in emails]
    data = [row[1] for row in emails]
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data)
    X = X.toarray()

    # vectorize input
    email_vector = vectorizer.transform([email])

    # create random forest and return prediction
    forest = RandomForestClassifier(n_estimators = int(sqrt(len(X[0])))+1)
    forest.fit(X, labels)
    return forest.predict(email_vector)[0]
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition    Author: PacktPublishing    | Project source | File source
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.iteritems():
            tweet = re.sub(r, repl, tweet)

        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition    Author: PacktPublishing    | Project source | File source
def create_ngram_model(params=None):
    def preprocessor(tweet):
        global emoticons_replaced
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.iteritems():
            tweet = re.sub(r, repl, tweet)

        return tweet

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    clf = MultinomialNB()
    pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
Project: Using-machine-learning-to-detect-malicious-URLs    Author: faizann24    | Project source | File source
def TL():
    allurls = './data/data.csv' #path to our all urls file
    allurlscsv = pd.read_csv(allurls,',',error_bad_lines=False) #reading file
    allurlsdata = pd.DataFrame(allurlscsv)  #converting to a dataframe

    allurlsdata = np.array(allurlsdata) #converting it into an array
    random.shuffle(allurlsdata) #shuffling

    y = [d[1] for d in allurlsdata] #all labels 
    corpus = [d[0] for d in allurlsdata]    #all urls corresponding to a label (either good or bad)
    vectorizer = TfidfVectorizer(tokenizer=getTokens)   #get a vector for each url but use our customized tokenizer
    X = vectorizer.fit_transform(corpus)    #get the X vector

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)   #split into training and testing set 80/20 ratio

    lgs = LogisticRegression()  #using logistic regression
    lgs.fit(X_train, y_train)
    print(lgs.score(X_test, y_test))    #print the score. It comes out to be 98%
    return vectorizer, lgs
Project: document_classification    Author: scotthlee    | Project source | File source
def process(self, df, x_name, y_name=None, ngrams=2, max_features=35000, method='counts', binary=True, sparse=False):
        #choosing the particular flavor of vectorizer
        if method == 'counts':
            vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace', binary=binary)
        elif method == 'tfidf':
            vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace')

        #fitting the vectorizer and converting the counts to an array
        full_fit = vectorizer.fit_transform(df[x_name])
        full_counts = full_fit.toarray()
        self.vocabulary_ = vectorizer.vocabulary_

        #passing the attributes up to the class instance
        self.data = df
        if sparse:
            full_counts = csr_matrix(full_counts)
        self.X = full_counts
        if y_name != None:
            self.y = np.array(df[y_name])
        return

    #splits the data into training and test sets; either called from process()
    #or on its own when your text is already vectorized and divided into x and y
Project: spice-hate_speech_detection    Author: futurice    | Project source | File source
def bag_of_words(messages, model=None, weighting=''):

    # TODO: Add stemming or baseform extraction here
    messages, stemmings2baseform =  texttools.stemming_messages_snowball(messages)

    # Create new model for extrating text features if None is given
    if model is None:
        if weighting == 'tfidf':
            model = TfidfVectorizer()
        else:
            model = CountVectorizer()
        model.fit(messages)

    # Extract features
    x = model.transform(messages)

    return x
Project: TPs    Author: DataMiningP7    | Project source | File source
def spams_count(texts):
    """ Returns the number of spams from a list of (type, text) tuples.

    Args:
        texts: a list of (type, text) tuples.
    Returns:
        an integer representing the number of spams.
    """
    spams_count = 0
    for t, _ in texts:
        # t=1 if it's a spam, 0 if not
        spams_count += t

    return spams_count


# Ex 1.3
# See http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# for the parameters
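The documentation page referenced above lists the constructor parameters; the configuration below is only an illustrative sketch of commonly tuned ones (the values are arbitrary and not prescribed by the exercise).

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    stop_words='english',   # drop very common English words
    lowercase=True,         # fold case before tokenizing
    ngram_range=(1, 2),     # unigrams and bigrams
    min_df=2,               # ignore terms appearing in fewer than 2 documents
    max_df=0.95,            # ignore terms appearing in more than 95% of documents
    sublinear_tf=True,      # use 1 + log(tf) instead of raw term frequency
)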
Project: TPs    Author: DataMiningP7    | Project source | File source
def transform_text(pairs):
    """ Transforms the pair data into a matrix X containing tf-idf values
     for the messages and a vector y containing 0s and 1s (for hams and spams
     respectively).
     Row i in X corresponds to the i-th element of y.

    Args:
        pairs: a list of (type, message) tuples.
    Returns:
        X: a sparse TF-IDF matrix where each row represents a message and each
        column represents a word.
        Y: a vector whose i-th element is 0 if the i-th message is a ham, else
        1.
    """
    tfidf = TfidfVectorizer(stop_words="english")
    types, texts = zip(*pairs)

    X = tfidf.fit_transform(texts)
    # Convert the list to a Numpy array because some sklearn objects don't
    # accept lists.
    y = np.array(types)

    return X, y

# Ex 2
Project: document-qa    Author: allenai    | Project source | File source
def prune(self, question, paragraphs: List[ExtractedParagraph]):
        if not self.filter_dist_one and len(paragraphs) == 1:
            return paragraphs

        tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=self.stop.words)
        text = []
        for para in paragraphs:
            text.append(" ".join(" ".join(s) for s in para.text))
        try:
            para_features = tfidf.fit_transform(text)
            q_features = tfidf.transform([" ".join(question)])
        except ValueError:
            return []

        dists = pairwise_distances(q_features, para_features, "cosine").ravel()
        sorted_ix = np.lexsort(([x.start for x in paragraphs], dists))  # in case of ties, use the earlier paragraph

        if self.filter_dist_one:
            return [paragraphs[i] for i in sorted_ix[:self.n_to_select] if dists[i] < 1.0]
        else:
            return [paragraphs[i] for i in sorted_ix[:self.n_to_select]]
Project: document-qa    Author: allenai    | Project source | File source
def dists(self, question, paragraphs: List[ExtractedParagraph]):
        tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=self.stop.words)
        text = []
        for para in paragraphs:
            text.append(" ".join(" ".join(s) for s in para.text))
        try:
            para_features = tfidf.fit_transform(text)
            q_features = tfidf.transform([" ".join(question)])
        except ValueError:
            return []

        dists = pairwise_distances(q_features, para_features, "cosine").ravel()
        sorted_ix = np.lexsort(([x.start for x in paragraphs], dists))  # in case of ties, use the earlier paragraph

        if self.filter_dist_one:
            return [(paragraphs[i], dists[i]) for i in sorted_ix[:self.n_to_select] if dists[i] < 1.0]
        else:
            return [(paragraphs[i], dists[i]) for i in sorted_ix[:self.n_to_select]]
Project: KDDCUP2016    Author: hugochan    | Project source | File source
def texts_tfidf(ids, important_texts, citations_texts) :
    '''
    Generates tf-idf vectors for each text collection; cosine similarity between the vectors can then be calculated from the returned matrices.
    '''

    tfidf = TfidfVectorizer(strip_accents='ascii',
                                                    stop_words='english', 
                                                    ngram_range=(1,2),
                                                    min_df=2)

    freqs1 = tfidf.fit_transform(important_texts)
    terms1 = tfidf.get_feature_names()

    freqs2 = tfidf.fit_transform(citations_texts)
    terms2 = tfidf.get_feature_names()

    return terms1, terms2, freqs1, freqs2
Project: job-salary-prediction    Author: soton-data-mining    | Project source | File source
def precomputed_similarity(self):
        # calculate similarity between train an testset job descriptions
        # this is of high order complexity - test it on a subset of the data
        corpus_list = pandas_vector_to_list(self.description_train_data)
        queries_list = pandas_vector_to_list(self.description_test_data)
        self.free_memory()
        print('{}: starting to vectorize description'.format(self.__class__.__name__))
        # use a custom vectorizer to cut off terms with very low/high document frequency since they carry little information
        vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, min_df=0.05,
                                     max_df=0.99)
        vectorizer, corpus_vector, queries_vector = tfidf_vectorize(corpus_list,
                                                                    queries_list,
                                                                    tfidf_vectorizer=vectorizer)
        print("vocabulary size: {}".format(len(vectorizer.get_feature_names())))

        self.store_precomputed_data(corpus_vector, queries_vector,
                                    self.y_train, self.y_test)
        self.load_precomputed_data()
Project: job-salary-prediction    Author: soton-data-mining    | Project source | File source
def tfidf_vectorize(documents, queries=[''],
                    tfidf_vectorizer=TfidfVectorizer(stop_words='english', lowercase=True)):
    """
    vectorize job_descriptions using tfidf

    :param documents: list of text (training_data
    :param queries: list of text (test data) - can be empty [''] (default)
        in case we just want to vectorize a single corpus
    :param tfidf_vectorizer: to overwrite with an existing/trained vectorizer
        or different parameters
    :return: (tfidf_vectorizer, document_vector, queries_vector)
    """

    # easier to test with smaller data set
    # use this to overwrite the incoming corpus/queries
    # documents = ['aaa bbb', 'ccc eee', 'aaa ddd', 'ddd ddd', 'ccc aaa']
    # queries = ['aaa bbb', 'ddd ddd']

    tfidf_vectorizer.fit(documents, queries)
    document_vector = tfidf_vectorizer.transform(documents)
    queries_vector = tfidf_vectorizer.transform(queries)
    return tfidf_vectorizer, document_vector, queries_vector
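A hypothetical usage sketch of the helper above, reusing the toy corpus suggested in its own comments; the returned shapes are (n_documents, n_terms) and (n_queries, n_terms).

docs = ['aaa bbb', 'ccc eee', 'aaa ddd', 'ddd ddd', 'ccc aaa']
queries = ['aaa bbb', 'ddd ddd']

vectorizer, document_vector, queries_vector = tfidf_vectorize(docs, queries)
print(document_vector.shape, queries_vector.shape)   # e.g. (5, 5) and (2, 5)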
Project: text-analytics-with-python    Author: dipanjanS    | Project source | File source
def build_feature_matrix(documents, feature_type='frequency'):

    feature_type = feature_type.lower().strip()  

    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=1, 
                                     ngram_range=(1, 1))
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=1, 
                                     ngram_range=(1, 1))
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=1, 
                                     ngram_range=(1, 1))
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(documents).astype(float)

    return vectorizer, feature_matrix
Project: text-analytics-with-python    Author: dipanjanS    | Project source | File source
def build_feature_matrix(documents, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.0, max_df=1.0):

    feature_type = feature_type.lower().strip()  

    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, 
                                     ngram_range=ngram_range)
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(documents).astype(float)

    return vectorizer, feature_matrix
Project: text-analytics-with-python    Author: dipanjanS    | Project source | File source
def build_feature_matrix(documents, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.0, max_df=1.0):

    feature_type = feature_type.lower().strip()  

    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, 
                                     ngram_range=ngram_range)
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(documents).astype(float)

    return vectorizer, feature_matrix
Project: tensorflow-quorakaggle    Author: ram1988    | Project source | File source
def __getTFIDFVectors(self, question1, question2):

        question1 = question1.lower()
        question2 = question2.lower()

        question1 = question1 if question1 != "nan" else ""
        question2 = question2 if question2 != "nan" else ""

        question1 = re.sub('\W+', ' ', question1)
        question2 = re.sub('\W+', ' ', question2)

        question1_tokens = question1.split()
        question2_tokens = question2.split()

        vocabulary = question1_tokens + question2_tokens
        vocabulary = list(set(vocabulary))

        vectorizer = TfidfVectorizer(analyzer='word', vocabulary=vocabulary)
        vectorized_q1 = vectorizer.fit_transform([question1])
        vectorized_q2 = vectorizer.transform([question2])

        return vectorized_q1, vectorized_q2
Project: novelRS    Author: nladuo    | Project source | File source
def run(self):
        contents = [self.__read_file(novel['_id'])
                        for novel in self.novels]
        vectorizer = TfidfVectorizer(input="file", stop_words=stop_words, max_features=50000)

        print("start vectorizing...")
        t0 = time()
        # Vectorize the corpus
        X = vectorizer.fit_transform(contents)
        print("done in %0.3fs" % (time() - t0))
        with open("dataset.pickle", "w") as f:
            print("saving dataset.....")
            pickle.dump(X, f, pickle.HIGHEST_PROTOCOL)


        # Save the vectorizer model
        with open("vectorizer.pickle", "w") as f:
            print("saving vectorizer model.....")
            pickle.dump(vectorizer, f)

        # Clean up resources
        self.__close()
        print("Finished!! All documents have been vectorized.")
Project: IBRel    Author: lasigeBioTM    | Project source | File source
def __init__(self, corpus, pairtype, relations, modelname="mil_classifier.model", test=False, ner="goldstandard",
                 generate=True):
        super(MILClassifier, self).__init__()
        self.modelname = modelname
        self.pairtype = pairtype
        self.pairs = {}  # (e1.normalized, e2.normalized) => (e1, e2)
        self.instances = {}  # bags of instances (e1.normalized, e2.normalized) -> all instances with these two entities
        self.labels = {} # (e1.normalized, e2.normalized) => label (-1/1)
        self.bag_labels = []  # ordered list of labels for each bag
        self.bag_pairs = []  # ordered list of pair labels (e1.normalized, e2.normalized)
        self.data = []  # ordered list of bags, each is a list of feature vectors
        self.predicted = []  # ordered list of predictions for each bag
        self.resultsfile = None
        self.examplesfile = None
        self.ner_model = ner
        self.vectorizer = CountVectorizer(min_df=0.2, ngram_range=(1, 1), token_pattern=r'\b\w+\-\w+\b')
        self.corpus = corpus

        #self.vectorizer = TfidfVectorizer(min_df=0.2, ngram_range=(1, 1), token_pattern=r'\b\w+\-\w+\b', max_features=)
        #self.classifier = misvm.MISVM(kernel='linear', C=1.0, max_iters=20)
        self.classifier = misvm.sMIL(kernel='linear', C=1)
        #self.classifier = misvm.MissSVM(kernel='linear', C=100) #, max_iters=20)
        #if generate:
        #    self.generateMILdata(test=test, pairtype=pairtype, relations=relations)
Project: Kaggle_HomeDepot    Author: ChenglongChen    | Project source | File source
def _init_word_ngram_tfidf(self, ngram, vocabulary=None):
        tfidf = TfidfVectorizer(min_df=3,
                                max_df=0.75,                                
                                max_features=None,
                                norm="l2",
                                strip_accents="unicode",
                                analyzer="word",
                                token_pattern=r"\w{1,}",
                                ngram_range=(1, ngram),
                                use_idf=1,
                                smooth_idf=1,
                                sublinear_tf=1,
                                # stop_words="english",
                                vocabulary=vocabulary)
        return tfidf

    ## char based
Project: Kaggle_HomeDepot    Author: ChenglongChen    | Project source | File source
def _init_char_ngram_tfidf(self, ngram, vocabulary=None):
        tfidf = TfidfVectorizer(min_df=3,
                                max_df=0.75, 
                                max_features=None, 
                                norm="l2",
                                strip_accents="unicode", 
                                analyzer="char",
                                token_pattern=r"\w{1,}",
                                ngram_range=(1, ngram), 
                                use_idf=1,
                                smooth_idf=1,
                                sublinear_tf=1, 
                                # stop_words="english",
                                vocabulary=vocabulary)
        return tfidf


# ------------------------ LSA -------------------------------
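The separator above announces an LSA step that is not included in this excerpt. As a rough sketch of the usual scikit-learn recipe (component count and toy documents chosen arbitrarily here), LSA is obtained by running TruncatedSVD on the TF-IDF matrix and re-normalizing:

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

docs = ["toy document one", "toy document two", "another toy text"]

lsa = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2)),
    TruncatedSVD(n_components=2, random_state=0),
    Normalizer(copy=False),       # restore unit length so cosine similarity stays meaningful
)
X_lsa = lsa.fit_transform(docs)
print(X_lsa.shape)                # (3, 2)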
Project: Machine-Learning-Projects    Author: poke19962008    | Project source | File source
def extractDoc(ext):
    root = 'data'
    data = []
    for f in os.listdir(os.path.join(root, ext))[:5]:
        with open(os.path.join(root, ext, f), 'r') as sc:
            sc = clean(sc.read(), 'cpp')
            data.append(sc)
            print "[SUCCESS] Read", os.path.join(root, ext, f)


    vectorizer = TfidfVectorizer(tokenizer=tokenize, ngram_range=(1,2))
    X = vectorizer.fit_transform(data)
    del data

    features_by_gram = defaultdict(list)
    for f, w in zip(vectorizer.get_feature_names(), vectorizer.idf_):
        features_by_gram[len(f.split(' '))].append((f, w))
    top_n = 50

    for gram, features in features_by_gram.iteritems():
        top_features = sorted(features, key=lambda x: x[1], reverse=True)[:top_n]
        top_features = [f[0] for f in top_features]
        print '{}-gram top:'.format(gram), top_features
Project: kpex    Author: christophfeinauer    | Project source | File source
def extract_terms_with_corpus_sklearn(text_files, number_of_terms=10, max_features=20, max_words=3, lemmatize=True, train_on_script = True):

    # tokenizer
    analyzer = lambda s: extract_chunks(read_txt(s),lemmatize=lemmatize,max_words=max_words)

    # All-in-one object for tfidf calculation
    tfidf_vectorizer = TfidfVectorizer(input='filename', analyzer = analyzer, max_features=max_features)

    # fit training data & get tfidf matrix
    if train_on_script:
        tfidf_mat = tfidf_vectorizer.fit(text_files[0:])
    else: 
        tfidf_mat = tfidf_vectorizer.fit(text_files[1:])

    # transform first file
    tfidf_script = tfidf_vectorizer.transform([text_files[0]])

    # get map between id and term
    id2term = tfidf_vectorizer.get_feature_names()

    return [(id2term[i],tfidf_script[0,i]) for i in tfidf_script.toarray()[0,:].argsort()[::-1][0:number_of_terms]]
Project: Informed-Finance-Canary    Author: Darthone    | Project source | File source
def tfidf(corpus, corpusKeys):
    #TODO clean this up
    #discard any stop words - saves on processing
    stopset = list(stopwords.words('english'))
    stopset.append('000')
    stopset.extend([str(x) for x in range(9999)])
    vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(2,3))

    #matrix of input set
    X = (vectorizer.fit_transform(corpus)).toarray()
    size_matrix = X.shape[0] 
    lsa = TruncatedSVD(n_components=size_matrix, n_iter=100)
    terms = vectorizer.get_feature_names()
    records = []
    for i, comp in enumerate(X):
        termsInComp = zip(terms, comp)
        sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]

        #List with all the terms gathered from the tfidf vectorizer
        termList = [term[0] + '.' for term in sortedTerms]

        # List with Article ID and list of tfidf terms
        records.append((vader(corpusKeys[i], termList), termList))
    return records
Project: 2016CCF-SouGou    Author: AbnerYang    | Project source | File source
def GaussianNBPredictModel(localTrainLabel, config):
    train = pd.read_csv('../feature/trainQlist.csv', header = 0, sep = ",")
    test = pd.read_csv('../feature/testQlist.csv', header = 0, sep = ",")
    print "Train tf-idf vector Model..."
    encode = TfidfVectorizer(decode_error = 'ignore', norm = "l2", binary = False, sublinear_tf = True, min_df = 50)
    localTrainFeature = encode.fit_transform(train['qlist'].values)
    localTestFeature = encode.transform(test['qlist'].values)

    print localTrainFeature.shape, localTestFeature.shape

    print 'train...'
    model = GaussianNB()
    model.fit(X = localTrainFeature.toarray(), y = localTrainLabel)
    print 'predict...'
    if config['prob'] == False:
        return model.predict(localTestFeature.toarray()), test['uid'].values
    else:
        return model.predict_log_proba(localTestFeature.toarray()), test['uid'].values

#-- Multinomial Naive Bayes cross validation model frame
Project: 2016CCF-SouGou    Author: AbnerYang    | Project source | File source
def MultinomialNBPredictModel(localTrainLabel, config):
    train = pd.read_csv('../feature/trainQlist.csv', header = 0, sep = ",")
    test = pd.read_csv('../feature/testQlist.csv', header = 0, sep = ",")
    print "Train tf-idf vector Model..."    
    encode = TfidfVectorizer(decode_error = 'ignore', norm = "l2", binary = False, sublinear_tf = True, min_df = 50)
    localTrainFeature = encode.fit_transform(train['qlist'].values)
    localTestFeature = encode.transform(test['qlist'].values)

    print localTrainFeature.shape, localTestFeature.shape

    print 'train...'
    model = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
    model.fit(X = localTrainFeature, y = localTrainLabel)
    print 'predict...'
    if config['prob'] == False:
        return model.predict(localTestFeature), test['uid'].values
    else:
        return model.predict_log_proba(localTestFeature), test['uid'].values

#-- xgboost local cross validation model frame