Python sklearn.metrics module: make_scorer() code examples

We extracted the following 33 code examples from open-source Python projects to illustrate how sklearn.metrics.make_scorer() is used.
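
Before the project snippets, a quick orientation: make_scorer() wraps an ordinary metric with the signature (y_true, y_pred) into a scorer object with the (estimator, X, y) signature that cross_val_score and GridSearchCV expect. A minimal, self-contained sketch (the data and classifier here are illustrative, not taken from any project below):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=200, random_state=0)

# Extra keyword arguments are stored on the scorer and forwarded
# to the metric every time it is evaluated.
f1_scorer = make_scorer(f1_score, average='binary')

scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5,
                         scoring=f1_scorer)
print(scores.mean())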

Project: playground    Author: Pennsy
def learn_decision_tree(data):
    DT = tree.DecisionTreeClassifier(max_depth=7)
    scorer = make_scorer(matthews_corrcoef)
    for i in range(5):
        scores = cross_val_score(DT, data.X_train, data.y_train, cv=10, scoring=scorer)
        print("iteration",i, "dt mean:", scores.mean())
        scores = list(scores)
        print("Decision Tree train scores:\n", scores)
    return DT
    # DT = DT.fit(train_data[:, :-1], train_data[:, -1])
    # predictionsDT = DT.predict(validation_data[:, :-1])

    # validating predictions
    # dtError = 0
    # for i in range(0, len(validation_data)):
    #         if(validation_data[i][20] != predictionsDT[i]):
    #                 dtError = dtError + 1
    # print("DT Error : ", float(dtError)/len(validation_data)*100.0)
Project: Parkinsons-Vocal-Analysis-Model    Author: WilliamY97
def fit_model(X, y):

    classifier = svm.SVC()

    parameters = {'kernel':['poly', 'rbf', 'sigmoid'], 'degree':[1, 2, 3], 'C':[0.1, 1, 10]}


    f1_scorer = make_scorer(performance_metric,
                                   greater_is_better=True)

    clf = GridSearchCV(classifier,
                       param_grid=parameters,
                       scoring=f1_scorer)

    clf.fit(X, y)

    return clf


Project: SMAC3    Author: automl
def rf_from_cfg(cfg, seed):
    """
        Creates a random forest regressor from sklearn and fits the given data on it.
        This is the function-call we try to optimize. Chosen values are stored in
        the configuration (cfg).

        Parameters:
        -----------
        cfg: Configuration
            configuration chosen by smac
        seed: int or RandomState
            used to initialize the rf's random generator

        Returns:
        -----------
        np.mean(rmses): float
            mean of root mean square errors of random-forest test predictions
            per cv-fold
    """
    rfr = RandomForestRegressor(
        n_estimators=cfg["num_trees"],
        criterion=cfg["criterion"],
        min_samples_split=cfg["min_samples_to_split"],
        min_samples_leaf=cfg["min_samples_in_leaf"],
        min_weight_fraction_leaf=cfg["min_weight_frac_leaf"],
        max_features=cfg["max_features"],
        max_leaf_nodes=cfg["max_leaf_nodes"],
        bootstrap=cfg["do_bootstrapping"],
        random_state=seed)

    def rmse(y, y_pred):
        return np.sqrt(np.mean((y_pred - y)**2))
    # Create a root-mean-square-error scorer for sklearn's cross-validation
    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    score = cross_val_score(rfr, boston.data, boston.target, cv=11, scoring=rmse_scorer)
    return -1 * np.mean(score)  # Because cross_validation sign-flips the score
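
One detail worth spelling out from the snippet above: with greater_is_better=False, make_scorer negates the metric so that larger is always better during model selection, which is why the caller flips the sign of the cross-validation mean to recover a positive RMSE. A minimal sketch of the same mechanics (Ridge and make_regression are illustrative stand-ins, not part of the SMAC3 example):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

def rmse(y, y_pred):
    return np.sqrt(np.mean((y_pred - y) ** 2))

X, y = make_regression(n_samples=100, noise=10, random_state=0)
scorer = make_scorer(rmse, greater_is_better=False)
scores = cross_val_score(Ridge(), X, y, cv=5, scoring=scorer)  # all values <= 0
print(-np.mean(scores))  # negate once more to report the positive RMSE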
Project: IBRel    Author: lasigeBioTM
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
        super(ScikitRE, self).__init__()
        self.modelname = relationtype + "_" + modelname
        self.relationtype = relationtype
        self.pairtype = relationtype
        self.corpus = corpus
        self.pairs = []
        self.features = []
        self.labels = []
        self.pred = []
        self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
        self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
        self.generate_data(corpus, modelname, relationtype)
        self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3,20), min_df=0.0, max_df=0.7)),
                                  #('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
                                  #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                                  #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                                  #('clf', SGDClassifier())
                                  #('clf', svm.NuSVC(nu=0.01 ))
                                   #('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
                                  ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
                                  #('clf', DummyClassifier(strategy="constant", constant=True))
                                 ])
Project: crime_prediction    Author: livenb
def build_grid_search(X, y):
    parameters = {
        "estimator__criterion": ['gini', 'entropy'],
        "estimator__max_depth": [10, 15, 20, 25, None],
        "estimator__max_features": ['auto', 'sqrt', 'log2', None]
    }
    ovr = OneVsRestClassifier(RandomForestClassifier(n_estimators=1000,
                                    oob_score=True, n_jobs=-1, verbose=1))
    model_tuning = GridSearchCV(ovr, param_grid=parameters, verbose=1,
                                n_jobs=-1, cv=10,
                                scoring=make_scorer(f1_score))
    model_tuning.fit(X, y)
    test_score = model_tuning.best_score_
    print 'The best test score: ', test_score
    # X_test is defined elsewhere in the original project (not shown here)
    y_score = model_tuning.predict_proba(X_test)
    multiclass_roc(y_score, 'grid_search_02')
    return model_tuning
Project: MetaHeuristic    Author: gonzalesMK
def __init__(self, name, classifier=None, number_gen=20,
                 verbose=0, repeat=1, parallel=False,
                 make_logbook=False, random_state=None,
                 cv_metric_function=make_scorer(matthews_corrcoef),
                 features_metric_function=None):

        self._name = name
        self.estimator = SVC(kernel='linear', max_iter=10000) if classifier is None else clone(classifier)
        self.number_gen = number_gen
        self.verbose = verbose
        self.repeat = repeat
        self.parallel = parallel
        self.make_logbook = make_logbook
        self.random_state = random_state
        self.cv_metric_function = cv_metric_function
        self.features_metric_function = features_metric_function
        self._random_object = check_random_state(self.random_state)
        random.seed(self.random_state)
Project: Parallel-SGD    Author: angadgill
def test_cross_val_score_with_score_func_regression():
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()

    # Default score of the Ridge regression estimator
    scores = cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_scores = cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    mse_scores = cross_val_score(reg, X, y, cv=5, scoring="mean_squared_error")
    expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(mse_scores, expected_mse, 2)

    # Explained variance
    scoring = make_scorer(explained_variance_score)
    ev_scores = cross_val_score(reg, X, y, cv=5, scoring=scoring)
    assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
Project: Parallel-SGD    Author: angadgill
def test_cross_val_score_with_score_func_regression():
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()

    # Default score of the Ridge regression estimator
    scores = cval.cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    mse_scores = cval.cross_val_score(reg, X, y, cv=5,
                                      scoring="mean_squared_error")
    expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(mse_scores, expected_mse, 2)

    # Explained variance
    scoring = make_scorer(explained_variance_score)
    ev_scores = cval.cross_val_score(reg, X, y, cv=5, scoring=scoring)
    assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
Project: Parallel-SGD    Author: angadgill
def test_cross_val_score_multilabel():
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    scoring_micro = make_scorer(precision_score, average='micro')
    scoring_macro = make_scorer(precision_score, average='macro')
    scoring_samples = make_scorer(precision_score, average='samples')
    score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
    score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
    score_samples = cval.cross_val_score(clf, X, y,
                                         scoring=scoring_samples, cv=5)
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
Project: TGIF-Release    Author: raingo
def main():

    import sys
    import numpy as np
    from sklearn import cross_validation
    from sklearn import svm
    import cPickle

    data_dir = sys.argv[1]

    fet_list = load_list(osp.join(data_dir, 'c3d.list'))
    pos_list = load_list(osp.join(data_dir, 'pos.urls'))

    features = np.load(osp.join(data_dir, 'c3d.npy'))
    fet_set = set(fet_list)

    pos_idx = [fet_list.index(i) for i in pos_list if i in fet_set]

    y = np.zeros(features.shape[0])
    y[pos_idx] = 1

    print 'n_pos', np.sum(y), 'n_neg', np.sum(1 - y)

    params = {'n_estimators':[2, 4, 5, 6, 8, 10, 30]}
    #params = {'n_estimators':[50, 70, 100, 120, 150, 200]}
    clf = grid_search.GridSearchCV(
        RandomForestClassifier(n_estimators=2, n_jobs=4), params,
        scoring=metrics.make_scorer(lambda yt, yp: metrics.f1_score(yt, yp, pos_label=0)),
        cv=5)
    clf.fit(features, y)
    print clf.best_score_
    print clf.best_estimator_
    cPickle.dump(clf.best_estimator_, open(osp.join(data_dir, 'c3d-models-rfc.pkl'), 'w'))
Project: machine-learning    Author: cinserra
def opt_classifier(clf, params, features_train, labels_train, optimize=True):
    '''
    GridSearchCV to find optimal parameters of the classifier.
    '''

    if optimize:
        scorer = make_scorer(f1_score)
        clf = GridSearchCV(clf, params, scoring=scorer)
        clf = clf.fit(features_train, labels_train)
        clf = clf.best_estimator_
    else:
        clf = clf.fit(features_train, labels_train)

    return clf
Project: machine-learning-nanodegree-program-capstone    Author: harrylippy
def cross_validate(self):
        clf = self._clf[self._learner]
        (X_train, y_train) = self._train_data

        print " + Cross-validating classifier (learner = %s)..." \
            % self._learner,; stdout.flush()
        scores = cross_val_score(
                        self._clf[self._learner],
                        X_train, y_train,
                        scoring=make_scorer(roc_auc_score),
                        cv=3)
        print "done.\n   * Scores: %r" % scores
Project: Quadflor    Author: quadflor
def hierarchical_f_measure_scorer(graph):
    measure = partial(hierarchical_f_measure, graph)
    return make_scorer(measure)
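
The snippet above binds the fixed graph argument with functools.partial before wrapping the metric, because hierarchical_f_measure takes the graph as a leading positional argument. A self-contained analogue of the same pattern, with fbeta_score standing in for the project's hierarchical_f_measure:

from functools import partial
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import cross_val_score

# Bind the fixed argument first, then wrap the bound metric as a scorer.
f2_scorer = make_scorer(partial(fbeta_score, beta=2))

X, y = make_classification(random_state=0)
print(cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5,
                      scoring=f2_scorer))

For a keyword argument, passing beta=2 directly to make_scorer would work just as well; partial is the tool of choice when the bound argument is positional.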
Project: jamespy_py3    Author: jskDr
def make_scoring(scoring):
    """
    Score is reversed if greater_is_better is False.
    """
    if scoring == 'r2':
        return metrics.make_scorer(metrics.r2_score)
    elif scoring == 'mean_absolute_error':
        return metrics.make_scorer(metrics.mean_absolute_error, greater_is_better=False)
    elif scoring == 'mean_squared_error':
        return metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)
    elif scoring == 'median_absolute_error':
        return metrics.make_scorer(metrics.median_absolute_error, greater_is_better=False)
    else:
        raise ValueError("Not supported scoring")
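
A brief usage sketch for this helper (assuming the from sklearn import metrics statement of its module is in scope): metrics built with greater_is_better=False come back negated from cross-validation, so the sign must be flipped once more for reporting.

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

X, y = make_regression(n_samples=100, noise=10, random_state=0)
r2 = cross_val_score(Ridge(), X, y, cv=5, scoring=make_scoring('r2'))
mse = cross_val_score(Ridge(), X, y, cv=5, scoring=make_scoring('mean_squared_error'))
print(r2.mean())    # positive: higher is better
print(-mse.mean())  # negate to recover the usual positive MSE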
Project: jamespy_py3    Author: jskDr
def make_scoring(scoring):
    if scoring == 'r2':
        return make_scorer(metrics.r2_score)
    elif scoring == 'mean_absolute_error':
        return make_scorer(metrics.mean_absolute_error, greater_is_better=False)
    elif scoring == 'mean_squared_error':
        return make_scorer(metrics.mean_squared_error, greater_is_better=False)
    elif scoring == 'median_absolute_error':
        return make_scorer(metrics.median_absolute_error, greater_is_better=False)
    else:
        raise ValueError("Not supported scoring")
Project: jamespy_py3    Author: jskDr
def _make_scoring_r0(scoring):
    if scoring == 'r2':
        return metrics.make_scorer(metrics.r2_score)
    elif scoring == 'mean_absolute_error':
        return metrics.make_scorer(metrics.mean_absolute_error, greater_is_better=False)
    elif scoring == 'mean_squared_error':
        return metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)
    elif scoring == 'median_absolute_error':
        return metrics.make_scorer(metrics.median_absolute_error, greater_is_better=False)
    else:
        raise ValueError("Not supported scoring")
Project: Machine-and-Deep-Learning-Code-Notes    Author: Dvshah13
def my_custom_log_loss_func(ground_truth, p_predictions, penalty=list(), eps=1e-15):
    # As a general rule, the first parameter of your function should be the
    # actual answer (ground_truth) and the second should be the predictions
    # or the predicted probabilities (p_predictions).
    adj_p = np.clip(p_predictions, eps, 1 - eps)
    lb = LabelBinarizer()
    g = lb.fit_transform(ground_truth)
    if g.shape[1] == 1:
        g = np.append(1 - g, g, axis=1)
    if penalty:
        g[:, penalty] = g[:, penalty] * 2
    summation = np.sum(g * np.log(adj_p))
    return summation * (-1.0 / len(ground_truth))

# my_custom_scorer = make_scorer(my_custom_log_loss_func, greater_is_better=False,
#                                needs_proba=True, penalty=[4, 9])
# Here we set the penalty on the highly confusable digits 4 and 9 (change it, or
# leave it empty to check that the resulting loss matches the previous experiment
# with the sklearn.metrics.log_loss function). This new loss function will double
# log_loss when evaluating the results for the classes of digits 4 and 9.
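
Since the scorer construction is only shown in the comment above, here is a runnable version of the same idea, assuming my_custom_log_loss_func and its numpy/LabelBinarizer imports are in scope: needs_proba=True makes the scorer feed predict_proba output to the metric, and the extra penalty keyword is stored by make_scorer and forwarded on every call.

from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

my_custom_scorer = make_scorer(my_custom_log_loss_func, greater_is_better=False,
                               needs_proba=True, penalty=[4, 9])

X, y = load_digits(return_X_y=True)
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y,
                         cv=3, scoring=my_custom_scorer)
print(scores.mean())  # negative, because the loss is sign-flipped by the scorer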
Project: CIKM-AnalytiCup-2017-    Author: BILLBEATTHEPEAT
def search(X,y):
    rmse = make_scorer(RMSE, greater_is_better = False)

    param_test1 = {'n_estimators':range(150,401,50)}
    gsearch1 = GridSearchCV(estimator = RandomForestRegressor(min_samples_split=30,
                            min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10), 
                            param_grid = param_test1, scoring=rmse,cv=5)
    gsearch1.fit(X,y)
    print gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
Project: CIKM-AnalytiCup-2017-    Author: BILLBEATTHEPEAT
def crossV(model, X, y, folds = 5):

    rmse = make_scorer(RMSE, greater_is_better = False)

    scores = cross_val_score(model, X, y, cv = folds, scoring=rmse, n_jobs = 1)

    print scores
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Project: Parallel-SGD    Author: angadgill
def test_grid_search_sparse_scoring():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
    # Smoke test the score
    # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]),
    #                            cv.score(X_[:180], y[:180]))

    # test loss where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)
    F1Loss = make_scorer(f1_loss, greater_is_better=False)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss)
    cv.fit(X_[:180], y_[:180])
    y_pred3 = cv.predict(X_[180:])
    C3 = cv.best_estimator_.C

    assert_equal(C, C3)
    assert_array_equal(y_pred, y_pred3)
Project: Parallel-SGD    Author: angadgill
def test_cross_val_score_score_func():
    clf = MockClassifier()
    _score_func_args = []

    def score_func(y_test, y_predict):
        _score_func_args.append((y_test, y_predict))
        return 1.0

    with warnings.catch_warnings(record=True):
        scoring = make_scorer(score_func)
        score = cross_val_score(clf, X, y, scoring=scoring)
    assert_array_equal(score, [1.0, 1.0, 1.0])
    assert len(_score_func_args) == 3
Project: Parallel-SGD    Author: angadgill
def test_cross_val_score_multilabel():
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    scoring_micro = make_scorer(precision_score, average='micro')
    scoring_macro = make_scorer(precision_score, average='macro')
    scoring_samples = make_scorer(precision_score, average='samples')
    score_micro = cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
    score_macro = cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
    score_samples = cross_val_score(clf, X, y, scoring=scoring_samples, cv=5)
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
Project: Parallel-SGD    Author: angadgill
def test_make_scorer():
    # Sanity check on the make_scorer factory function.
    f = lambda *args: 0
    assert_raises(ValueError, make_scorer, f, needs_threshold=True,
                  needs_proba=True)
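
The test only exercises the invalid combination; each flag on its own is legitimate. For reference, a short sketch of the valid variants (illustrative, not from the test suite):

from sklearn.metrics import log_loss, make_scorer, roc_auc_score

# needs_proba=True: the metric receives predict_proba output.
log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

# needs_threshold=True: the metric receives continuous decision values
# (decision_function output, or probabilities), as ROC AUC expects.
auc_scorer = make_scorer(roc_auc_score, needs_threshold=True)

# Requesting both at once is ambiguous, hence the ValueError checked above.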
Project: Parallel-SGD    Author: angadgill
def test_raises_on_score_list():
    # Test that when a list of scores is returned, we raise proper errors.
    X, y = make_blobs(random_state=0)
    f1_scorer_no_average = make_scorer(f1_score, average=None)
    clf = DecisionTreeClassifier()
    assert_raises(ValueError, cross_val_score, clf, X, y,
                  scoring=f1_scorer_no_average)
    grid_search = GridSearchCV(clf, scoring=f1_scorer_no_average,
                               param_grid={'max_depth': [1, 2]})
    assert_raises(ValueError, grid_search.fit, X, y)
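
The root cause is that a scorer must return a single float, while f1_score with average=None returns one value per class. A minimal working alternative is to pick an aggregating average (illustrative, not part of the test):

from sklearn.datasets import make_blobs
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X, y = make_blobs(random_state=0)
f1_macro = make_scorer(f1_score, average='macro')  # one scalar per fold
print(cross_val_score(DecisionTreeClassifier(), X, y, cv=3, scoring=f1_macro))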
Project: dstk    Author: jotterbach
def fit_cv(self, data, labels, cv_params, epochs=10, **kwargs):

        n_jobs = kwargs.get('n_jobs', 1)
        iid = kwargs.get('iid', True)
        refit = kwargs.get('refit', True)
        cv = kwargs.get('cv', None)
        verbose = kwargs.get('verbose', 0)
        pre_dispatch = kwargs.get('pre_dispatch', '2*n_jobs')
        error_score = kwargs.get('error_score', 'raise')
        return_train_score = kwargs.get('return_train_score', True)

        param_dct = self.get_params()
        param_dct.update({'bootstrap_fraction': 1.0})

        rscv = GridSearchCV(SGDBolasso(**param_dct),
                            scoring=make_scorer(accuracy_score),
                            verbose=verbose,
                            param_grid=cv_params,
                            fit_params={'epochs': 1, 'verbose': 0},
                            cv=cv,
                            return_train_score=return_train_score,
                            n_jobs=n_jobs,
                            iid=iid,
                            refit=refit,
                            pre_dispatch=pre_dispatch,
                            error_score=error_score)

        rscv.fit(data, labels)

        param_dct = rscv.best_params_.copy()
        param_dct.update({'bootstrap_fraction': self.bootstrap_fraction})
        best_estim = SGDBolasso(**param_dct)

        best_estim.fit(data, labels, epochs=epochs)
        return best_estim, rscv
Project: TextCategorization    Author: Y-oHr-N
def __grid_search_model(self, clf_factory, documents, labels, pos_label):
        boolndarr        = labels.values == pos_label
        n                = documents.size
        n_pos            = labels[boolndarr].size
        n_neg            = n - n_pos

        param_grid       = {
            'vect__binary'      : [False, True],
            'vect__min_df'      : [1, 2],
            'vect__ngram_range' : [(1, 1), (1, 2), (1, 3)],
            'vect__smooth_idf'  : [False, True],
            'vect__stop_words'  : [None, 'english'],
            'vect__sublinear_tf': [False, True],
            'vect__use_idf'     : [False, True],
            'clf__alpha'        : [0, 0.01, 0.05, 0.1, 0.5, 1]
        }

        k                = 5
        cv               = ShuffleSplit(
            n,
            n_iter       = k,
            test_size    = 1 / k,
            random_state = 0
        )

        pos_weight       = n_neg / n_pos
        sample_weight    = np.ones(n)
        sample_weight[boolndarr] *= pos_weight
        fit_params       = {'clf__sample_weight': sample_weight}

        f1_scorer        = make_scorer(f1_score, pos_label=pos_label)

        grid_search      = GridSearchCV(
            clf_factory,
            param_grid,
            cv           = cv,
            fit_params   = fit_params,
            n_jobs       = -1,
            scoring      = f1_scorer
        )

        grid_search.fit(documents, labels)
        best_estimator   = grid_search.best_estimator_
        best_score       = grid_search.best_score_
        best_params      = grid_search.best_params_

        print("Best F1 score: {0:04.3f}".format(best_score))
        print("Parameters: {0}".format(best_params))

        return best_estimator
Project: DiscourseSenser    Author: WladimirSidorenko
def train(self, a_train_data, a_dev_data=None, a_n_y=-1,
              a_i=-1, a_train_out=None, a_dev_out=None):
        """Method for training the model.

        Args:
          a_train_data (tuple[list, dict]):
            list of training JSON data
          a_dev_data (tuple[list, dict] or None):
            list of development JSON data
          a_n_y (int):
            number of distinct classes
          a_i (int):
            row index for the output predictions
          a_train_out (np.array or None):
            predictions for the training set
          a_dev_out (np.array or None):
            predictions for the development set

        Returns:
          void:

        Note:
          updates ``a_train_out`` and ``a_dev_out`` in place

        """
        self.n_y = a_n_y
        x_train, y_train = self._generate_ts(a_train_data)
        x_dev, y_dev = self._generate_ts(a_dev_data)
        # determine cross-validation and grid-search strategy and fit the model
        if self._gs:
            if a_dev_data is None or not a_dev_data[0]:
                cv = StratifiedKFold(y_train, n_folds=NFOLDS, shuffle=True)
            else:
                cv = self._devset_cv(y_train, len(y_dev), NFOLDS)
                x_train = x_train + x_dev
                y_train = y_train + y_dev
            scorer = make_scorer(f1_score, average="macro")
            self._model = GridSearchCV(self._model, self.PARAM_GRID,
                                       scoring=scorer,
                                       cv=cv, n_jobs=self.N_JOBS, verbose=1)
        self._model.fit([el[-1] for el in x_train], y_train)
        # output best hyper-parameters
        if self._gs:
            print("Best params:", repr(self._model.best_params_),
                  file=sys.stderr)
        if a_i >= 0:
            if a_train_out is not None:
                if self._gs and a_dev_data and a_dev_data[0]:
                    x_train = x_train[:-len(x_dev)]
                for i, x_i in x_train:
                    self._predict(x_i, a_train_out[i], a_i)
            if a_dev_out is not None:
                for i, x_i in x_dev:
                    self._predict(x_i, a_dev_out[i], a_i)
Project: mlbootcamp_5    Author: ivan-filonov
def greedy_select_features(self):
        print('initial shapes:', self.train_.shape, self.test_.shape)
        saved = None if self.debug_ else self.load('chosen_features')

        if saved is None:
            g_best_score = 1e9
            g_best_features = []
            current = set()
            finished = False
        else:
            g_best_features, g_best_score, finished = saved
            current = set(g_best_features)
            print('SFS REUSE:', g_best_score, len(current), sorted(g_best_features), self.now())


        if not finished:
            col_names = self.train_.columns
            y = self.y_.ravel()
            scorer = metrics.make_scorer(metrics.log_loss)
            loop_count = len(col_names) - len(g_best_features)
            for _ in range(loop_count):
                avail = set(col_names).difference(current)
                best_score = 1e9
                best_features = None
                for f in avail:
                    newf = list(current | {f})
                    score, _ = self.ccv(linear_model.BayesianRidge(), self.train_[newf], y, scorer)
                    if best_score > score:
                        best_score = score
                        best_features = newf
                current = set(best_features)
                if g_best_score > best_score:
                    g_best_score = best_score
                    g_best_features = best_features
                    print('new best:', g_best_score, sorted(g_best_features), self.now())
                else:
                    print('no luck', len(current), self.now())
                if len(best_features) - len(g_best_features) >= 5:
                    break
                self.save('chosen_features', (g_best_features, g_best_score, False))
            # now
            self.save('chosen_features', (g_best_features, g_best_score, True))

        print('feature selection complete.', self.now())
        self.train_ = self.train_[g_best_features]
        self.test_ = self.test_[g_best_features]
Project: mlbootcamp_5    Author: ivan-filonov
def greedy_select_features(self):
        saved = None if self.debug_ else self.load('chosen_features')
        if saved is None:
            print('initial shapes:', self.train_.shape, self.test_.shape)
            num_columns = self.train_.shape[1]
            col_names = [str(c) for c in range(num_columns)]
            self.train_.columns = col_names
            self.test_.columns = col_names

            g_best_score = 1e9
            g_best_features = None

            y = self.y_.ravel()
            current = set()
            scorer = metrics.make_scorer(metrics.log_loss)
            for _ in enumerate(col_names):
                avail = set(col_names).difference(current)
                best_score = 1e9
                best_features = None
                for f in avail:
                    newf = list(current | {f})
                    cv = model_selection.cross_val_score(linear_model.BayesianRidge(),
                                                         self.train_[newf], y,
                                                         cv=self.n_fold_, n_jobs=-2,
                                                         scoring = scorer)
                    score = np.mean(cv)
                    if best_score > score:
                        best_score = score
                        best_features = newf
                current = set(best_features)
                if g_best_score > best_score:
                    g_best_score = best_score
                    g_best_features = best_features
                    print('new best:', g_best_score, g_best_features, self.now())
                if len(best_features) - len(g_best_features) > 15:
                    break
            self.save('chosen_features', (g_best_features, None))
        else:
            g_best_features, _ = saved

        print('feature selection complete.', self.now())
        self.train_ = self.train_[g_best_features]
        self.test_ = self.test_[g_best_features]
Project: mlbootcamp_5    Author: ivan-filonov
def greedy_select_features(self):
        print('initial shapes:', self.train_.shape, self.test_.shape)
        saved = None if self.debug_ else self.load('chosen_features')

        if saved is None:
            g_best_score = 1e9
            g_best_features = []
            current = set()
            finished = False
        else:
            g_best_features, g_best_score, finished = saved
            current = set(g_best_features)
            print('SFS REUSE:', g_best_score, g_best_features, self.now())

        num_columns = self.train_.shape[1]
        col_names = [str(c) for c in range(num_columns)]
        self.train_.columns = col_names
        self.test_.columns = col_names

        if not finished:
            y = self.y_.ravel()
            scorer = metrics.make_scorer(metrics.log_loss)
            loop_count = len(col_names) - len(g_best_features)
            for _ in range(loop_count):
                avail = set(col_names).difference(current)
                best_score = 1e9
                best_features = None
                for f in avail:
                    newf = list(current | {f})
                    score, _ = self.ccv(linear_model.BayesianRidge(), self.train_[newf], y, scorer)
                    if best_score > score:
                        best_score = score
                        best_features = newf
                current = set(best_features)
                if g_best_score > best_score:
                    g_best_score = best_score
                    g_best_features = best_features
                    print('new best:', g_best_score, g_best_features, self.now())
                if len(best_features) - len(g_best_features) > 5:
                    break
                self.save('chosen_features', (g_best_features, g_best_score, False))
            # now
            self.save('chosen_features', (g_best_features, g_best_score, True))

        print('feature selection complete.', self.now())
        self.train_ = self.train_[g_best_features]
        self.test_ = self.test_[g_best_features]
Project: Parallel-SGD    Author: angadgill
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = StratifiedKFold(2)

    score, scores, pvalue = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy",
        labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = StratifiedKFold(2)
    score_label, _, pvalue_label = permutation_test_score(
        svm_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse,
        scoring="accuracy", labels=np.ones(y.size), random_state=0)

    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    def custom_score(y_true, y_pred):
        return (((y_true == y_pred).sum() - (y_true != y_pred).sum())
                / y_true.shape[0])

    scorer = make_scorer(custom_score)
    score, _, pvalue = permutation_test_score(
        svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0)
    assert_almost_equal(score, .93, 2)
    assert_almost_equal(pvalue, 0.01, 3)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")

    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
Project: Parallel-SGD    Author: angadgill
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = cval.StratifiedKFold(y, 2)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy",
        labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = cval.StratifiedKFold(y, 2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse,
        scoring="accuracy", labels=np.ones(y.size), random_state=0)

    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    def custom_score(y_true, y_pred):
        return (((y_true == y_pred).sum() - (y_true != y_pred).sum())
                / y_true.shape[0])

    scorer = make_scorer(custom_score)
    score, _, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0)
    assert_almost_equal(score, .93, 2)
    assert_almost_equal(pvalue, 0.01, 3)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")

    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
Project: Parallel-SGD    Author: angadgill
def test_rfecv():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.grid_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)

    # All the noisy variables were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = get_scorer('accuracy')
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test fix on grid_scores
    def test_scorer(estimator, X, y):
        return 1.0
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))

    # Same as the first two tests, but with step=2
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    rfecv.fit(X, y)
    assert_equal(len(rfecv.grid_scores_), 6)
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)