Python sklearn.cross_validation 模块,cross_val_score() 实例源码

我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用sklearn.cross_validation.cross_val_score()

项目:Python-Machine-Learning-Cookbook    作者:PacktPublishing    | 项目源码 | 文件源码
def print_accuracy_report(classifier, X, y, num_validations=5):
    """Print cross-validated accuracy, F1, precision and recall percentages.

    Each metric is computed with ``cross_validation.cross_val_score`` using
    ``num_validations`` as the ``cv`` argument. The mean over the folds is
    multiplied by 100, rounded to two decimals and printed with a label.
    """
    # (printed prefix, sklearn scoring name), in report order.
    report_rows = (
        ("Accuracy: ", 'accuracy'),
        ("F1: ", 'f1_weighted'),
        ("Precision: ", 'precision_weighted'),
        ("Recall: ", 'recall_weighted'),
    )
    for prefix, metric in report_rows:
        fold_scores = cross_validation.cross_val_score(
            classifier, X, y, scoring=metric, cv=num_validations)
        percent = round(100*fold_scores.mean(), 2)
        print(prefix + str(percent) + "%")
项目:python_utils    作者:Jayhello    | 项目源码 | 文件源码
def rfr_feature_select():
    """Rank the Boston housing features by single-feature random-forest R^2.

    For every column of the dataset a random forest is cross-validated on
    that column alone (3 shuffle-split iterations, 30% test size); the mean
    R^2, rounded to three decimals, is paired with the feature name and the
    pairs are printed in descending order.
    """
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.cross_validation import cross_val_score, ShuffleSplit

    dataset = load_boston()
    features = dataset["data"]
    target = dataset["target"]
    feature_names = dataset["feature_names"]

    forest = RandomForestRegressor(n_estimators=20, max_depth=4)

    def single_feature_score(col):
        # Slice (rather than index) so the column stays two-dimensional.
        column = features[:, col:col + 1]
        fold_r2 = cross_val_score(forest, column, target, scoring="r2",
                                  cv=ShuffleSplit(len(features), 3, .3))
        return round(np.mean(fold_r2), 3)

    ranking = [(single_feature_score(col), feature_names[col])
               for col in range(features.shape[1])]
    ranking.sort(reverse=True)
    print(ranking)
项目:BotBoosted    作者:brityboy    | 项目源码 | 文件源码
def evaluate_model(model, X_train, y_train):
    '''
    Print the mean 10-fold cross-validation score, then fit the model.

    INPUT
         - model: estimator handed to cross_val_score and then fit
         - X_train: feature data
         - y_train: target data
    OUTPUT
         - prints the mean of the 10 fold scores
         - model: the same object, after fitting on all of the
         training data
    Returns the model
    '''
    fold_scores = cross_val_score(model, X_train, y_train,
                                  cv=10, n_jobs=-1, verbose=10)
    mean_score = np.mean(fold_scores)
    print(mean_score)
    model.fit(X_train, y_train)
    return model
项目:Python-Machine-Learning-Cookbook    作者:PacktPublishing    | 项目源码 | 文件源码
def print_accuracy_report(classifier, X, y, num_validations=5):
    """Report four cross-validated classification metrics as percentages.

    Accuracy, weighted F1, weighted precision and weighted recall are each
    scored via ``cross_validation.cross_val_score`` with ``cv`` set to
    ``num_validations``; the fold mean is printed as a rounded percentage.
    """
    def mean_percent(scoring_name):
        # Mean fold score scaled to a percentage string with two decimals.
        fold_scores = cross_validation.cross_val_score(
            classifier, X, y, scoring=scoring_name, cv=num_validations)
        return str(round(100*fold_scores.mean(), 2)) + "%"

    print("Accuracy: " + mean_percent('accuracy'))
    print("F1: " + mean_percent('f1_weighted'))
    print("Precision: " + mean_percent('precision_weighted'))
    print("Recall: " + mean_percent('recall_weighted'))
项目:ml-talks-duolingo    作者:burrsettles    | 项目源码 | 文件源码
def experiment(model_class, vectorizer, xval):
    """Fit a linear classifier, dump its weights and print CV metrics.

    ``model_class`` is an estimator instance (its class name and ``penalty``
    attribute form the run name). The model is fit on the module-level
    ``X`` and ``y``, its intercept and per-feature weights are written to
    ``weights.<name>.txt``, and accuracy, ROC AUC, precision, recall and F1
    are cross-validated with ``xval`` as the ``cv`` argument.
    """
    name = model_class.__class__.__name__ + '.' + model_class.penalty
    fitted = model_class.fit(X, y)
    # inverse_transform maps the coefficient row back to feature names.
    weights_by_feature = vectorizer.inverse_transform(fitted.coef_)[0]
    with open('weights.%s.txt' % name, 'w') as out:
        out.write('%s\t%f\n' % ('(intercept)', fitted.intercept_))
        out.writelines('%s\t%f\n' % pair for pair in weights_by_feature.items())

    # (printed tag, scoring name); None means the estimator's default score.
    metric_plan = [('acc', None), ('auc', 'roc_auc'), ('prec', 'precision'),
                   ('recall', 'recall'), ('f1', 'f1')]
    results = []
    for tag, scoring_name in metric_plan:
        extra = {} if scoring_name is None else {'scoring': scoring_name}
        fold_scores = cross_validation.cross_val_score(
            fitted, X, y, cv=xval, **extra)
        results.append((tag, fold_scores))

    print('-'*80)
    for tag, fold_scores in results:
        print('%s\t%.4f\t%s' % (tag, np.mean(fold_scores), name))
项目:static-gesture-recognition    作者:windmark    | 项目源码 | 文件源码
def trainLimited(self, featureFile, n_datapoints):
    """Print 5-fold kNN scores for growing prefixes of the training data.

    The data loaded from ``featureFile`` is passed through
    ``train_test_split`` with a test size of zero. A distance-weighted kNN
    classifier is then evaluated on the first ``n_datapoints`` rows, the
    first ``2 * n_datapoints`` rows, and so on, printing the mean
    cross-validation score for each prefix.
    """
    (label_vector, input_vector) = loadData(featureFile)

    # Only the "train" halves of the split are used below.
    trainData, testData, trainLabels, testLabels = \
        cross_validation.train_test_split(input_vector, label_vector, test_size=(0))

    n_totalrows = int((len(label_vector)/n_datapoints))
    for step in range(n_totalrows):
        upper = (step + 1) * n_datapoints
        subset_labels = trainLabels[:upper]
        subset_inputs = trainData[:upper]

        kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
        kNNClassifier.fit(subset_inputs, subset_labels)

        scores = cross_validation.cross_val_score(kNNClassifier, subset_inputs, subset_labels, cv = 5)
        mean_score = sum(scores) / len(scores)
        print('%f on %d datapoints' % (mean_score, len(subset_labels)))
项目:The_Ultimate_Student_Hunt    作者:analyticsvidhya    | 项目源码 | 文件源码
def run_model(model,dtrain,predictor_var,target,scoring_method='mean_squared_error'):
    """Cross-validate ``model`` and fit it on a year-based hold-out split.

    Runs 5-fold cross-validation over all of ``dtrain`` with
    ``scoring_method``, then fits the model on rows with ``Year`` < 2000
    and predicts rows with ``Year`` > 1999. Neither the scores nor the
    predictions are returned or printed; the reporting lines are
    commented out.
    """
    folds = KFold(len(dtrain),5)
    cv_scores = cross_val_score(model, dtrain[predictor_var], dtrain[target],
                                cv=folds, scoring=scoring_method)
    #print cv_scores, np.mean(cv_scores), np.sqrt((-1)*np.mean(cv_scores))

    # Split by year: earlier rows train, later rows validate.
    early_rows = dtrain[dtrain['Year']<2000]
    late_rows = dtrain[dtrain['Year']>1999]

    early_features = early_rows[predictor_var]
    late_features = late_rows[predictor_var]
    model.fit(early_features, early_rows[target])
    pred_for_val = model.predict(late_features)

    #print math.sqrt(mean_squared_error(dtest_for_val['Footfall'],pred_for_val))
项目:dancedeets-monorepo    作者:mikelambert    | 项目源码 | 文件源码
def eval_model(name, model, data):
    """Fit ``model``, report test metrics and cross-validation scores.

    Args:
        name: label printed in front of every progress/result line.
        model: estimator; fit here with ``sample_weight`` and returned.
        data: training features passed to ``model.fit``.

    Returns:
        Tuple ``(model, predictions)`` where ``predictions`` are the
        model's outputs for the module-level ``processed_test_data``.

    Relies on module-level names not visible in this block: ``train``,
    ``test``, ``sample_weights``, ``processed_test_data`` and
    ``grammar_processed_data``.
    """
    print '=' * 20
    print name, 'training'
    model.fit(data, train.target, sample_weight=sample_weights)
    print name, 'trained'

    predictions = model.predict(processed_test_data)
    # Fraction of test predictions equal to the true target.
    print name, 'accuracy', np.mean(predictions == test.target)

    print(metrics.classification_report(test.target, predictions))
    print metrics.confusion_matrix(test.target, predictions)

    # NOTE(review): the model is fit on `data` above but cross-validated on
    # the module-level `grammar_processed_data`; confirm this is intended.
    print name, 'f1 cross validation', cross_validation.cross_val_score(model, grammar_processed_data, train.target, scoring='f1')
    print name, 'precision cross validation', cross_validation.cross_val_score(
        model, grammar_processed_data, train.target, scoring='precision'
    )
    return model, predictions


# SVMs need balanced input features: similar ranges and variances, and so on.
项目:svm-text-classification-api    作者:viniciusbo    | 项目源码 | 文件源码
def cross_validation_report(clf, dataset):
  """Return cross-validation scores of ``clf`` on ``(text, label)`` rows.

  The first element of every row is vectorized with the module-level
  ``count_vectorizer``; the second element is used as the target.
  """
  texts = [row[0] for row in dataset]
  features = count_vectorizer.transform(texts)
  labels = [row[1] for row in dataset]
  scores = cross_validation.cross_val_score(clf, features, labels)
  return scores
项目:uda-da-p5-enron-fraud-detection    作者:watanabe8760    | 项目源码 | 文件源码
def evaluate(model, name):
    """
    Cross-validate ``model`` on F1, precision and recall and print a summary.

    Uses the module-level ``X``, ``y`` and ``splitter_``. The per-fold score
    distributions are passed to ``save_dist`` under ``name``; the mean and
    standard deviation of each metric are collected in a DataFrame that is
    printed after the model itself.
    """
    metric_names = ['f1', 'precision', 'recall']
    # One cross-validation run per metric, in the order listed above.
    fold_scores = [cross_val_score(model, X, y, scoring=metric, cv=splitter_)
                   for metric in metric_names]
    save_dist(name, *fold_scores)

    result = DataFrame(index=['f1', 'precision', 'recall'],
                       columns=['mean', 'std'])
    for metric, scores in zip(metric_names, fold_scores):
        result.loc[metric, 'mean'] = np.mean(scores)
    for metric, scores in zip(metric_names, fold_scores):
        result.loc[metric, 'std'] = np.std(scores)
    print(model)
    print(result)
项目:SMAC3    作者:automl    | 项目源码 | 文件源码
def rf_from_cfg(cfg, seed):
    """
        Build a random forest regressor from ``cfg`` and cross-validate it.

        This is the target function to be optimized; the hyperparameter
        values are read from the configuration (cfg).

        Parameters:
        -----------
        cfg: Configuration
            mapping providing the forest hyperparameters
        seed: int or RandomState
            passed to the forest as ``random_state``

        Returns:
        -----------
        float
            mean over the 11 cv-folds of the root mean square error of the
            forest's predictions on the module-level ``boston`` data
    """
    def rmse(y, y_pred):
        squared_error = (y_pred - y)**2
        return np.sqrt(np.mean(squared_error))

    forest_kwargs = dict(
        n_estimators=cfg["num_trees"],
        criterion=cfg["criterion"],
        min_samples_split=cfg["min_samples_to_split"],
        min_samples_leaf=cfg["min_samples_in_leaf"],
        min_weight_fraction_leaf=cfg["min_weight_frac_leaf"],
        max_features=cfg["max_features"],
        max_leaf_nodes=cfg["max_leaf_nodes"],
        bootstrap=cfg["do_bootstrapping"],
        random_state=seed,
    )
    rfr = RandomForestRegressor(**forest_kwargs)

    # greater_is_better=False makes the scorer report negated errors.
    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    fold_scores = cross_val_score(rfr, boston.data, boston.target,
                                  cv=11, scoring=rmse_scorer)
    # Undo the scorer's sign flip so the result is a positive error.
    return -1 * np.mean(fold_scores)
项目:Stock-Prediction-Time-Series-Analysis-Python    作者:Nekooeimehr    | 项目源码 | 文件源码
def Second_Model_KRR(Scaled_Input_Data, Output_Data):
    """Tune an RBF kernel ridge regressor and report its leave-one-out MAE.

    A grid search (5-fold, ``mean_absolute_error`` scoring) selects ``alpha``
    and ``gamma``; a fresh ``KernelRidge`` with the best parameters is then
    scored with leave-one-out cross-validation using the same scorer.

    Args:
        Scaled_Input_Data: feature data; its length is the sample count.
        Output_Data: regression targets.

    Returns:
        Tuple ``(mean_score, krr_Tuned)``: the mean of the leave-one-out
        scorer outputs exactly as returned by ``cross_val_score`` (the
        printed value is this number multiplied by -1), and the fitted
        ``GridSearchCV`` object.
    """
    T0 = time.time()
    n = len(Scaled_Input_Data)
    Grid_Dict = {"alpha": [1e0, 1e-1, 1e-2],"gamma": np.logspace(-2, 1, 3)}
    krr_Tuned = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5 ,param_grid=Grid_Dict, scoring="mean_absolute_error")
    krr_Tuned.fit(Scaled_Input_Data, Output_Data)
    best_params = krr_Tuned.best_params_
    KRR_Best = KernelRidge(kernel='rbf', alpha=best_params['alpha'], gamma=best_params['gamma'])
    # The timer covers the grid search only, not the leave-one-out run below.
    KRR_Time = time.time() - T0
    print('The computational time of Kernel Ridge Regression for ', n, ' examples is: ', KRR_Time)
    MAEs_KRR = cross_validation.cross_val_score(KRR_Best, Scaled_Input_Data, Output_Data, cv=cross_validation.LeaveOneOut(n), scoring="mean_absolute_error")
    MeanMAE_KRR = np.mean(list(MAEs_KRR))
    # The scorer is mean absolute error, so the report is labelled MAE
    # (the previous message said MSE).
    print('The average MAE of Kernel Ridge Regression for ', n, ' examples is: ', (-1*MeanMAE_KRR))
    return(MeanMAE_KRR, krr_Tuned)
项目:BotBoosted    作者:brityboy    | 项目源码 | 文件源码
def evaluate_model(model, X_train, y_train):
    """
    Print the mean 10-fold cross-validation score and return the fit model.

    Args:
        model: estimator that is cross-validated and then fit
        X_train: feature data
        y_train: target data
    Returns:
        model: the same estimator object after ``fit(X_train, y_train)``;
        the mean of the 10 fold scores is printed as a side effect
    """
    cv_scores = cross_val_score(model, X_train, y_train,
                                cv=10, n_jobs=-1, verbose=10)
    average = np.mean(cv_scores)
    print(average)
    model.fit(X_train, y_train)
    return model
项目:menrva    作者:amirziai    | 项目源码 | 文件源码
def clf_scores(clf, x_train, y_train, x_test, y_test):
    """Collect cross-validation and hold-out metrics for a classifier.

    Args:
        clf: classifier to evaluate; it is fit in place on the training data.
        x_train, y_train: training features and labels.
        x_test, y_test: hold-out features and labels.

    Returns:
        dict with keys ``runtime`` (seconds spent in ``fit``), ``accuracy``
        (minimum fold score), ``accuracy_test``, ``accuracy_folds``,
        ``confusion_matrix``, ``fpr``, ``tpr`` and ``auc``.

    Uses the module-level ``cv`` as the cross-validation strategy.
    """
    info = dict()

    # TODO: extend this to a confusion matrix per fold for more flexibility downstream (tuning)
    # TODO: calculate a set of ROC curves per fold instead of running it on test, currently introducing bias
    scores = cross_val_score(clf, x_train, y_train, cv=cv, n_jobs=-1)
    start = time()
    clf.fit(x_train, y_train)
    runtime = time() - start
    y_test_predicted = clf.predict(x_test)
    info['runtime'] = runtime
    # The worst fold score is reported, not the mean.
    info['accuracy'] = min(scores)
    info['accuracy_test'] = accuracy_score(y_test, y_test_predicted)
    info['accuracy_folds'] = scores
    info['confusion_matrix'] = confusion_matrix(y_test, y_test_predicted)
    # The ROC curve reuses the model fit above, so every hold-out metric
    # describes the same fitted model (the classifier is not refit here).
    fpr, tpr, _ = roc_curve(y_test, clf_predict_proba(clf, x_test))
    info['fpr'] = fpr
    info['tpr'] = tpr
    info['auc'] = auc(fpr, tpr)

    return info
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_cross_val_score_mask():
    """cross_val_score must give equal scores for index and boolean-mask folds."""
    # test that cross_val_score works with boolean masks
    svm = SVC(kernel="linear")
    iris = load_iris()
    X, y = iris.data, iris.target
    cv_indices = cval.KFold(len(y), 5)
    scores_indices = cval.cross_val_score(svm, X, y, cv=cv_indices)
    cv_indices = cval.KFold(len(y), 5)
    cv_masks = []
    for train, test in cv_indices:
        # Builtin ``bool`` is used as the dtype; the ``np.bool`` alias is
        # not available in every NumPy release.
        mask_train = np.zeros(len(y), dtype=bool)
        mask_test = np.zeros(len(y), dtype=bool)
        mask_train[train] = True
        mask_test[test] = True
        # Append the masks themselves; appending the index arrays would
        # leave the mask code path untested.
        cv_masks.append((mask_train, mask_test))
    scores_masks = cval.cross_val_score(svm, X, y, cv=cv_masks)
    assert_array_equal(scores_indices, scores_masks)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_cross_val_score_precomputed():
    """A precomputed linear Gram matrix must score like a linear-kernel SVC."""
    iris = load_iris()
    X, y = iris.data, iris.target

    # Scores with an explicitly precomputed linear kernel ...
    gram = np.dot(X, X.T)
    precomputed_svc = SVC(kernel="precomputed")
    score_precomputed = cval.cross_val_score(precomputed_svc, gram, y)
    # ... must equal the scores of the built-in linear kernel.
    linear_svc = SVC(kernel="linear")
    score_linear = cval.cross_val_score(linear_svc, X, y)
    assert_array_equal(score_precomputed, score_linear)

    # A non-square X must be rejected when the kernel is precomputed.
    fresh_precomputed = SVC(kernel="precomputed")
    assert_raises(ValueError, cval.cross_val_score, fresh_precomputed, X, y)

    # A precomputed kernel that is neither array-like nor sparse
    # (here: a plain nested list) must be rejected as well.
    gram_as_list = gram.tolist()
    assert_raises(ValueError, cval.cross_val_score, fresh_precomputed,
                  gram_as_list, y)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_cross_val_score_with_score_func_classification():
    """Default, accuracy and weighted-F1 scoring agree on iris with a linear SVC."""
    iris = load_iris()
    clf = SVC(kernel='linear')
    data, target = iris.data, iris.target
    # All three scorings are expected to give these fold scores (2 decimals).
    expected = [0.97, 1., 0.97, 0.97, 1.]

    # Default score (should be the accuracy score)
    default_scores = cval.cross_val_score(clf, data, target, cv=5)
    assert_array_almost_equal(default_scores, expected, 2)

    # Explicit accuracy (aka. zero / one score) - should match the
    # estimator's default score
    accuracy_scores = cval.cross_val_score(clf, data, target,
                                           scoring="accuracy", cv=5)
    assert_array_almost_equal(accuracy_scores, expected, 2)

    # Weighted F1 (classes are balanced, so it should equal the
    # zero / one score)
    weighted_f1 = cval.cross_val_score(clf, data, target,
                                       scoring="f1_weighted", cv=5)
    assert_array_almost_equal(weighted_f1, expected, 2)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_cross_val_score_with_score_func_regression():
    """Check default, R2, MSE and explained-variance scoring for Ridge."""
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()
    # Fold scores expected for the default, R2 and explained-variance runs.
    expected_r2 = [0.94, 0.97, 0.97, 0.99, 0.92]

    # Default score of the Ridge regression estimator
    default_scores = cval.cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(default_scores, expected_r2, 2)

    # Explicit R2 (aka. determination coefficient) - should match the
    # estimator's default score
    explicit_r2 = cval.cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(explicit_r2, expected_r2, 2)

    # Mean squared error is a loss, so the reported "scores" are negative
    neg_mse = cval.cross_val_score(reg, X, y, cv=5,
                                   scoring="mean_squared_error")
    expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(neg_mse, expected_mse, 2)

    # Explained variance through a custom scorer object
    ev_scorer = make_scorer(explained_variance_score)
    ev_scores = cval.cross_val_score(reg, X, y, cv=5, scoring=ev_scorer)
    assert_array_almost_equal(ev_scores, expected_r2, 2)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_cross_val_score_multilabel():
    """Micro/macro/samples-averaged precision scorers on multilabel targets."""
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)

    # Build all scorers first, then run one 5-fold CV per scorer.
    scorers = [make_scorer(precision_score, average=avg)
               for avg in ('micro', 'macro', 'samples')]
    score_micro, score_macro, score_samples = [
        cval.cross_val_score(clf, X, y, scoring=scorer, cv=5)
        for scorer in scorers]

    # NOTE(review): the fractions below need true division; presumably the
    # enclosing module enables it on Python 2 - confirm.
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
项目:2016CCF-unicom    作者:xuguanggen    | 项目源码 | 文件源码
def run():
    """Print 10-fold accuracy scores of a logistic regression on the train file.

    Both text files are loaded with ``np.loadtxt``; column 0 of the training
    data is the label and the remaining columns are the features. The test
    features are sliced out but only used by the commented-out prediction.
    """
    train_matrix = np.loadtxt('../new/TRAIN_LRFORMAT.txt')
    test_matrix = np.loadtxt('../new/TEST_LRFORMAT.txt')

    tr_x, tr_y = train_matrix[:, 1:], train_matrix[:, 0]
    te_x = test_matrix[:, 1:]

    lr_options = dict(
        solver='liblinear',
        multi_class='ovr',
        class_weight='balanced',
        penalty='l2',
        n_jobs=-1,
    )
    lr = LogisticRegression(**lr_options)
    #te_pred = lr.predict_proba(te_x)
    n_folds = 10
    scores = cross_val_score(lr, tr_x, tr_y, cv=n_folds, scoring='accuracy')
    print(str(scores))
    #np.savetxt('result/te_lr.txt',te_pred)
项目:HousePricePredictionKaggle    作者:Nuwantha    | 项目源码 | 文件源码
def rmse_cv(model, X, y):
    """Return the mean cross-validation score under the module-level ``scorer``."""
    fold_scores = cross_val_score(model, X, y, scoring=scorer)
    return fold_scores.mean()
项目:Kaggle    作者:lawlite19    | 项目源码 | 文件源码
def baseline_logisticRegression():
    """Train a baseline logistic regression and write test-set predictions.

    Reads ``data/train.csv``, preprocesses it with ``pre_processData``,
    fits a logistic regression on the selected columns (first selected
    column is the target), prints the coefficients and writes the
    predictions for ``data/test.csv`` to
    ``baseline_logisticRegression_result/prediction.csv``.
    """
    raw_train = pd.read_csv(r"data/train.csv")
    process_data = pre_processData(raw_train, 'process_train_data')
    # Keep the target column plus the model features selected by name.
    train_data = process_data.filter(
        regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    train_np = train_data.as_matrix()

    # Column 0 is the target, the remaining columns are the features.
    y = train_np[:, 0]
    X = train_np[:, 1:]
    classifier = linear_model.LogisticRegression(C=1.0, tol=1e-6)
    model = classifier.fit(X, y)
    coef_table = pd.DataFrame({"columns":list(train_data.columns)[1:],"coef_":list(model.coef_.T)})
    print(coef_table)

    # Predict on the test file using the same feature columns (no target).
    raw_test = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(raw_test, 'process_test_data')
    test_data = process_test_data.filter(
        regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    predict = model.predict(test_data.as_matrix())
    passenger_ids = process_test_data['PassengerId'].as_matrix()
    result = pd.DataFrame(data={'PassengerId':passenger_ids,'Survived':predict.astype(np.int32)})
    result.to_csv(r'baseline_logisticRegression_result/prediction.csv', index=False)
    #clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
    #print cross_validation.cross_val_score(clf, X,y,cv=5)


# Baseline SVM model -- 0.78947 (presumably the resulting score; part of the original comment was lost to an encoding error)
项目:Kaggle    作者:lawlite19    | 项目源码 | 文件源码
def baseline_logisticRegression_crossValidate():
    """Fit a logistic regression on an 80% split and log hold-out mistakes.

    The preprocessed training file is split 80/20. The model is fit on the
    larger part, its coefficients and the hold-out accuracy are printed, and
    the original rows of the misclassified hold-out passengers are written
    to ``error.csv``.
    """
    origin_train_data = pd.read_csv(r"data/train.csv")
    process_data = fe_preprocessData(origin_train_data, 'process_train_data')
    process_data_train, process_data_cv = train_test_split(process_data, test_size=0.2)

    # Same column selection for both parts: target first, then features.
    selected = 'Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'

    train_data = process_data_train.filter(regex=selected)
    train_np = train_data.as_matrix()
    y_train = train_np[:, 0]
    X_train = train_np[:, 1:]
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X_train, y_train)
    print(pd.DataFrame({'columns':list(train_data.columns[1:]),'coef_':list(model.coef_.T)}))

    cv_np = process_data_cv.filter(regex=selected).as_matrix()
    y_cv = cv_np[:, 0]
    X_cv = cv_np[:, 1:]
    predictions = model.predict(X_cv)
    # Hold-out accuracy: share of predictions equal to the true label.
    n_correct = np.float32(np.sum(predictions == y_cv))
    print(n_correct/np.float32(predictions.shape[0]))

    # Collect the misclassified hold-out rows and their original records.
    wrong_ids = process_data_cv[predictions != y_cv]['PassengerId']
    error_items = origin_train_data.loc[origin_train_data['PassengerId'].isin(wrong_ids.values)]
    predictions_item = pd.DataFrame(data=wrong_ids)
    predictions_item.columns = ['error_PassengerId']
    error_result = pd.concat([error_items, predictions_item], axis=1)
    error_result.to_csv(r'error.csv', index=False)

    #=print pd.DataFrame({"columns":list(train_data.columns)[1:],"coef_":list(model.coef_.T)})
    #=prediction = model.predict(X_test)
    #=print np.float32(np.sum(prediction == y_test))/np.float32(prediction.shape[0])

    # Disabled test-set prediction, kept as an inert string literal.
    '''test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data,'process_test_data',optimize=True)  # ?????
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId':process_test_data['PassengerId'].as_matrix(),'Survived':predict.astype(np.int32)})
    result.to_csv(r'logisticRegression_result/prediction.csv',index=False)'''
    #clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
    #print cross_validation.cross_val_score(clf, X,y,cv=5)
项目:Kaggle    作者:lawlite19    | 项目源码 | 文件源码
def optimize_logisticRegression():
    train_data = pd.read_csv(r"data/train.csv")
    print u"?????\n",train_data.info()
    print u'?????\n',train_data.describe()  
    #display_data(train_data)  # ????????
    #display_with_process(train_data) # ??????????????????,????
    process_data = fe_preprocessData(train_data,'process_train_data')  # ????????????
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # ???????????
    train_np = train_data.as_matrix()  # ????
    '''??model'''
    X = train_np[:,1:]
    y = train_np[:,0]
    #=X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
    #=model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
    model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X,y)
    print pd.DataFrame({"columns":list(train_data.columns)[1:],"coef_":list(model.coef_.T)})

    '''??????'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data,'process_test_data')  # ?????
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId':process_test_data['PassengerId'].as_matrix(),'Survived':predict.astype(np.int32)})
    result.to_csv(r'optimize_logisticRegression_result/prediction.csv',index=False)
    #clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
    #print cross_validation.cross_val_score(clf, X,y,cv=5)    
## ????????
项目:TGIF-Release    作者:raingo    | 项目源码 | 文件源码
def stump(X, y):
    """Score and fit a linear SVM, returning a threshold-style summary.

    Returns a tuple of: the mean 5-fold average-precision score, the sign
    of the first coefficient, and the first intercept divided by the
    absolute value of that coefficient.
    """
    cv_scores = cross_val_score(LinearSVC(), X, y, cv = 5, n_jobs=5, scoring = 'average_precision')

    # A separate classifier instance is fit on all the data for the weights.
    fitted = LinearSVC()
    fitted.fit(X, y)
    weight = fitted.coef_[0,0]
    bias = fitted.intercept_[0]

    direction = np.sign(weight)
    scaled_bias = bias / np.abs(weight)
    return np.mean(cv_scores), direction, scaled_bias
项目:Supply-demand-forecasting    作者:LevinJ    | 项目源码 | 文件源码
def run_croos_validation(self):
    """Cross-validate ``self.clf`` and return the negated absolute mean score.

    Features, labels and the cv strategy come from
    ``self.getFeaturesLabel()``. The absolute mean, the standard deviation
    and the absolute per-fold scores are printed.
    """
    features, labels, cv = self.getFeaturesLabel()
    scores = cross_validation.cross_val_score(
        self.clf, features, labels, cv=cv,
        scoring=mean_absolute_percentage_error_scoring, n_jobs = -1)
    mean_abs = np.absolute(scores.mean())
    print("cross validation scores: means, {}, std, {}, details,{}".format(mean_abs, scores.std(), np.absolute(scores)))
    return -mean_abs
项目:bguFinalProject    作者:liranfar    | 项目源码 | 文件源码
def build_random_forest_model(x_train, y_train):
    """Fit a random forest and print its mean 10-fold cross-validation score.

    Args:
        x_train: training features.
        y_train: training labels; must provide ``ravel()``.

    Returns:
        The ``RandomForestClassifier`` (``random_state=42``) fit on all of
        the training data.
    """
    # Flatten the labels once and use the same 1-d targets for both the
    # final fit and the cross-validation (previously only the fit did).
    targets = y_train.ravel()

    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(x_train, targets)
    print("10-fold Cross validation score is :")
    print(np.mean(cross_val_score(rf_model, x_train, targets, cv=10)))
    return rf_model
项目:-Classification-on-Chinese-Magazine-    作者:lixiaosi33    | 项目源码 | 文件源码
def evaluate_cross_validation(clf, X, y, K):
    """Print the K-fold scores of ``clf`` and their mean with standard error."""
    # Shuffled K-fold iterator with a fixed seed.
    folds = KFold(len(y), K, shuffle=True, random_state=0)
    # No scoring is given, so the estimator's own score method is used.
    scores = cross_val_score(clf, X, y, cv=folds)
    print(scores)
    summary = "Mean score: {0:.3f} (+/-{1:.3f})".format(
        np.mean(scores), sem(scores))
    print(summary)
项目:ml-projects    作者:saopayne    | 项目源码 | 文件源码
def hackathon_GBC_model(clf, train, features):
    """Fit ``clf`` on ``train`` and report training and CV performance.

    The model is fit on ``train[features]`` against ``train["Class"]``.
    Training accuracy, training AUC and 5-fold ROC-AUC statistics are
    printed, and the feature importances are shown as a bar chart.
    """
    inputs = train[features]
    labels = train["Class"]

    clf.fit(inputs, labels)
    # Second column of predict_proba is used as the score for the AUC.
    probab_of_predict = clf.predict_proba(inputs)[:, 1]
    predict_train = clf.predict(inputs)
    cv_score = cross_val_score(clf, inputs, labels, cv=5, scoring="roc_auc")

    print("----------------------Model performance-----------------------")
    print("Accuracy score: ", accuracy_score(labels.values, predict_train))
    print("AUC: ", roc_auc_score(labels, probab_of_predict))
    cv_summary = "CV score: Mean - {}, Max - {}, Min - {}, Std - {}".format(
        np.mean(cv_score), np.max(cv_score), np.min(cv_score), np.std(cv_score))
    print(cv_summary)

    importances = pd.Series(clf.feature_importances_, features)
    Relative_Feature_importance = importances.sort_values(ascending=False)
    Relative_Feature_importance.plot(kind='bar', title='Order of Feature Importance')
    plt.ylabel('Feature Importance')
    plt.show()
项目:Movie-Success-Predictor    作者:Blueteak    | 项目源码 | 文件源码
def print_metrics(clf):
    """Plot per-fold and mean ROC curves of ``clf`` and save them to a file.

    Uses the module-level ``features`` and ``labels`` with a 5-fold
    stratified split. For each fold the classifier is refit and its ROC
    curve plotted; the curves are interpolated onto a common grid and
    averaged into a mean ROC. The figure is saved as ``auc_sent.png``.
    """
    #scores = cross_validation.cross_val_score(clf,features,labels,cv=5,scoring='accuracy')
    #print 'Accuracy:',scores.mean()

    folds = cross_validation.StratifiedKFold(labels, n_folds=5)

    # Common false-positive-rate grid on which fold curves are averaged.
    fpr_grid = np.linspace(0, 1, 100)
    tpr_sum = 0.0

    for fold_no, (train_idx, test_idx) in enumerate(folds):
        fitted = clf.fit(features[train_idx], labels[train_idx])
        fold_probs = fitted.predict_proba(features[test_idx])

        fpr, tpr, thresholds = metrics.roc_curve(labels[test_idx], fold_probs[:, 1])
        tpr_sum += interp(fpr_grid, fpr, tpr)
        # Pin the accumulated curve to the origin.
        tpr_sum[0] = 0.0
        fold_auc = metrics.auc(fpr, tpr)

        plt.plot(fpr, tpr, lw=1,
                 label='ROC fold %d (area = %0.2f)' % (fold_no, fold_auc))

    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    tpr_sum /= len(folds)
    # Pin the mean curve to end at a true-positive rate of 1.
    tpr_sum[-1] = 1.0
    mean_auc = metrics.auc(fpr_grid, tpr_sum)
    plt.plot(fpr_grid, tpr_sum, 'k--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('auc_sent.png')
项目:ml-talks-duolingo    作者:burrsettles    | 项目源码 | 文件源码
def experiment(model_class, vectorizer, xval):
    """Fit a linear regressor, dump its weights and print R2 / MAE CV scores.

    ``model_class`` is an estimator instance whose class name labels the
    run. The model is fit on the module-level ``X`` and ``y``, its
    intercept and per-feature weights are written to ``weights.<name>.txt``
    and the mean cross-validated scores (``cv=xval``) are printed.
    """
    name = model_class.__class__.__name__
    fitted = model_class.fit(X, y)
    # inverse_transform maps the coefficient row back to feature names.
    weights_by_feature = vectorizer.inverse_transform(fitted.coef_)[0]
    with open('weights.%s.txt' % name, 'w') as out:
        out.write('%s\t%f\n' % ('(intercept)', fitted.intercept_))
        out.writelines('%s\t%f\n' % pair for pair in weights_by_feature.items())

    # (printed tag, sklearn scoring name), evaluated in this order.
    results = [
        (tag, cross_validation.cross_val_score(fitted, X, y, scoring=scoring_name, cv=xval))
        for tag, scoring_name in (('r2', 'r2'), ('mae', 'mean_absolute_error'))
    ]
    print('-'*80)
    for tag, fold_scores in results:
        print('%s\t%.4f\t%s' % (tag, np.mean(fold_scores), name))
项目:coursera-machine-learning-yandex    作者:dstarcev    | 项目源码 | 文件源码
def calculate(X, y):
    """Search the Minkowski power ``p`` of a distance-weighted 5-NN regressor.

    200 values of ``p`` evenly spaced over [1, 10] are tried with shuffled
    5-fold cross-validation. Returns ``(best_p, best_score)`` where the
    score of a candidate is the largest of its fold scores.
    """
    folds = KFold(len(y), n_folds=5, shuffle=True, random_state=42)
    best_p = 0
    best_score = -float('inf')
    for candidate in numpy.linspace(1, 10, num=200):
        regressor = KNeighborsRegressor(n_neighbors=5, weights='distance', p=candidate)
        fold_scores = cross_val_score(regressor, X, y, cv=folds,
                                      scoring='mean_squared_error')
        # NOTE(review): the best single fold is used here, not the fold
        # mean - confirm this is intended.
        top_fold = max(fold_scores)
        if top_fold > best_score:
            best_p, best_score = candidate, top_fold

    return best_p, best_score
项目:coursera-machine-learning-yandex    作者:dstarcev    | 项目源码 | 文件源码
def calculate(X, y):
    """Find the neighbor count (1..50) with the best mean 5-fold accuracy.

    Args:
        X: feature data.
        y: class labels; its length sets the size of the K-fold split.

    Returns:
        Tuple ``(best_k, best_score)``. The first ``k`` reaching the
        highest mean accuracy wins; ``(0, 0)`` is returned when no ``k``
        scores above zero.
    """
    # The fold count comes from the data passed in (``len(y)``), not from
    # a ``data`` name defined outside this function.
    kf = KFold(len(y), n_folds=5, shuffle=True, random_state=42)
    best_k, best_score = 0, 0
    for k in xrange(1, 51):
        knn = KNeighborsClassifier(n_neighbors=k)
        score = cross_val_score(knn, X, y, cv=kf, scoring='accuracy').mean()
        if score > best_score:
            best_score = score
            best_k = k
    return best_k, best_score
项目:coursera-machine-learning-yandex    作者:dstarcev    | 项目源码 | 文件源码
def calculate(X, y, threshold):
    """Return the smallest forest size whose mean 5-fold R2 exceeds ``threshold``.

    Args:
        X: feature data.
        y: regression targets; its length sets the size of the K-fold split.
        threshold: mean R2 value that must be strictly exceeded.

    Returns:
        The first ``n_estimators`` in 1..50 whose mean cross-validated R2
        is greater than ``threshold``, or ``None`` if none qualifies.
    """
    kf = KFold(len(y), n_folds=5, random_state=1, shuffle=True)
    for t in xrange(1, 51):
        clf = RandomForestRegressor(n_estimators=t, random_state=1)
        score = np.mean(cross_val_score(clf, X, y, cv=kf, scoring='r2'))
        if score > threshold:
            return t
    # No forest size up to 50 passed the threshold.
    return None
项目:Building-Machine-Learning-Systems-With-Python-Second-Edition    作者:PacktPublishing    | 项目源码 | 文件源码
def accuracy(features, labels):
    """Return the mean leave-one-out score of a scaled logistic regression."""
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn import cross_validation

    # Standardize the features, then classify with logistic regression
    # (chosen for speed; other classifiers can be swapped in).
    steps = [('preproc', StandardScaler()),
             ('classifier', LogisticRegression())]
    pipeline = Pipeline(steps)

    # One fold per sample.
    leave_one_out = cross_validation.LeaveOneOut(len(features))
    per_sample_scores = cross_validation.cross_val_score(
        pipeline, features, labels, cv=leave_one_out)
    return per_sample_scores.mean()
项目:mlprojects-py    作者:srinathperera    | 项目源码 | 文件源码
def regression_with_GBR(X_train, y_train, X_test, y_test, parmsFromNormalization,
                        params={'n_estimators': 500, 'max_depth': 4,
                                'min_samples_split': 1,
                                'learning_rate': 0.01, 'loss': 'ls'}):
    """Fit a gradient boosting regressor and return its test predictions.

    ``params`` is expanded into the ``GradientBoostingRegressor``
    constructor. After fitting on the training data, a model summary and
    the feature importances are printed through the project helpers.
    """
    booster = GradientBoostingRegressor(**params)
    booster.fit(X_train, y_train)
    y_pred_gbr = booster.predict(X_test)

    print_regression_model_summary("GBR", y_test, y_pred_gbr, parmsFromNormalization)
    print_feature_importance(X_test, y_test, booster.feature_importances_)

    # Cross-validation is left disabled (the author was unsure it makes
    # sense for regression):
    #http://scikit-learn.org/stable/modules/cross_validation.html
    #gfr = GradientBoostingRegressor(**params)
    #scores = cross_validation.cross_val_score(gfr, X_train, y_train, cv=5)
    #print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    return y_pred_gbr
项目:static-gesture-recognition    作者:windmark    | 项目源码 | 文件源码
def crossValidateModel(self):
    """Cross-validate a distance-weighted kNN classifier and print the scores.

    ``loadData(self.featureFile)`` supplies the labels (first element)
    and the inputs (second element). The classifier is built with
    ``self.n_neighbors`` neighbours and scored with ``cv=5``. The
    individual scores and their arithmetic mean are printed; nothing is
    returned.
    """
    labels, inputs = loadData(self.featureFile)
    fold_count = 5

    classifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
    scores = cross_validation.cross_val_score(
        classifier, inputs, labels, cv=fold_count)

    print("\n----- k-fold Cross Validation -----")
    print(scores)
    average = sum(scores) / len(scores)
    print("Average: ", average)
项目:KDDCUP2016    作者:hugochan    | 项目源码 | 文件源码
def cv(self, estimator_params):
    """Coerce ``estimator_params``, apply them to the estimator and evaluate it.

    The parameter dict is rewritten in place in three steps. Each step is
    skipped when the corresponding attribute is ``None``:

    1. ``self.ptypes``: the string ``'int'`` casts every value to ``int``;
       any other value is treated as a mapping from parameter name to a
       callable that converts that parameter's value.
    2. ``self.pfixed``: a mapping whose entries override (or add to) the
       parameters.
    3. ``self.plist``: a mapping from parameter name to a sequence; the
       parameter's current value is read as a 1-based index into it.

    Args:
        estimator_params: dict of parameter name to raw value. It is
            mutated in place.

    Returns:
        Whatever ``self.estimator.evaluate(self.cv_params['X'])`` returns.
    """
    # ``is not None`` is an identity test; ``!= None`` would go through
    # the object's own comparison method.
    if self.ptypes is not None:
        if self.ptypes == 'int':
            # Only existing keys are reassigned, so iterating the dict
            # while updating it is safe.
            for key in estimator_params:
                estimator_params[key] = int(estimator_params[key])
        else:
            for key, convert in self.ptypes.items():
                estimator_params[key] = convert(estimator_params[key])

    if self.pfixed is not None:
        estimator_params.update(self.pfixed)

    if self.plist is not None:
        for key, choices in self.plist.items():
            # The raw value is a 1-based position within ``choices``.
            estimator_params[key] = choices[int(estimator_params[key]) - 1]

    self.estimator.set_params(**estimator_params)
    return self.estimator.evaluate(self.cv_params['X'])

#               self.cv_params['estimator'] = estim


#               cvscore = cross_val_score(**self.cv_params)
#               return numpy.mean(cvscore)

        # --------------------------------------------- // --------------------------------------------- #
项目:pines    作者:dmitru    | 项目源码 | 文件源码
def test_iris(self):
    """Cross-validate the tree on iris (``cv=10``) and require a mean score above 0.8.

    The mean score is printed before the assertion runs.
    """
    iris = load_iris()
    tree = DecisionTreeClassifier(tree_type=self.tree_type)
    fold_scores = cross_val_score(tree, iris.data, iris.target, cv=10)
    score = np.mean(fold_scores)
    print('iris: tree_type: {}, score = {}'.format(self.tree_type, score))
    self.assertTrue(score > 0.8)
项目:pines    作者:dmitru    | 项目源码 | 文件源码
def test_breast_cancer(self):
    """Cross-validate the tree on breast_cancer (``cv=10``) and require a mean score above 0.8.

    The mean score is printed before the assertion runs.
    """
    cancer = load_breast_cancer()
    tree = DecisionTreeClassifier(tree_type=self.tree_type)
    fold_scores = cross_val_score(tree, cancer.data, cancer.target, cv=10)
    score = np.mean(fold_scores)
    print('breast_cancer: tree_type: {}, score = {}'.format(self.tree_type, score))
    self.assertTrue(score > 0.8)
项目:pines    作者:dmitru    | 项目源码 | 文件源码
def test_iris(self):
    """Cross-validate the tree on iris (``cv=10``) and require a mean score above 0.8.

    The assertion runs first, so the score line is printed only when the
    check passes.
    """
    iris = load_iris()
    tree = DecisionTreeClassifier(tree_type=self.tree_type)
    fold_scores = cross_val_score(tree, iris.data, iris.target, cv=10)
    score = np.mean(fold_scores)
    self.assertTrue(score > 0.8)
    print('iris: tree_type: {}, score = {}'.format(self.tree_type, score))
项目:pines    作者:dmitru    | 项目源码 | 文件源码
def test_breast_cancer(self):
    """Cross-validate the tree on breast_cancer (``cv=10``) and require a mean score above 0.8.

    The assertion runs first, so the score line is printed only when the
    check passes.
    """
    cancer = load_breast_cancer()
    tree = DecisionTreeClassifier(tree_type=self.tree_type)
    fold_scores = cross_val_score(tree, cancer.data, cancer.target, cv=10)
    score = np.mean(fold_scores)
    self.assertTrue(score > 0.8)
    print('breast_cancer: tree_type: {}, score = {}'.format(self.tree_type, score))
项目:digit-ocr    作者:Nozdi    | 项目源码 | 文件源码
def cv(model, X, y, n_iter=5, test_size=0.3):
    """Score ``model`` with ``scoring='accuracy'`` over shuffle splits of ``X``.

    A ``ShuffleSplit`` over ``len(X)`` samples is created with the given
    ``n_iter`` and ``test_size`` and handed to ``cross_val_score``, which
    is called with ``n_jobs=-1``. The result of ``cross_val_score`` is
    returned unchanged.
    """
    sample_count = len(X)
    splitter = cross_validation.ShuffleSplit(
        sample_count, n_iter=n_iter, test_size=test_size)
    scores = cross_validation.cross_val_score(
        model, X, y, cv=splitter, scoring='accuracy', n_jobs=-1)
    return scores
项目:sentiment-analysis    作者:saber1988    | 项目源码 | 文件源码
def random_forest_classify(my_train_data, my_train_label, my_test_data, estimators):
    """Cross-validate, fit and apply a random forest, saving the test predictions.

    The forest (``n_estimators=estimators``) is first scored with
    ``cv=5`` on the training data and the mean and twice the standard
    deviation of the scores are printed. It is then fitted on the full
    training data, and its predictions for ``my_test_data`` are handed to
    ``save_data`` together with the name ``random_forest_<estimators>.csv``.
    """
    forest = RandomForestClassifier(n_estimators=estimators)
    cv_scores = cross_validation.cross_val_score(
        forest, my_train_data, my_train_label, cv=5)
    mean_score = cv_scores.mean()
    spread = cv_scores.std() * 2
    print("random forest(%d) accuracy: %0.3f (+/- %0.3f)" % (estimators, mean_score, spread))

    forest.fit(my_train_data, my_train_label)
    predictions = forest.predict(my_test_data)
    save_data(predictions, "random_forest_%d.csv" % estimators)
项目:sentiment-analysis    作者:saber1988    | 项目源码 | 文件源码
def gradient_boosting_classify(my_train_data, my_train_label, my_test_data, estimators):
    """Cross-validate, fit and apply a gradient boosting classifier, saving the test predictions.

    The classifier (``n_estimators=estimators``) is first scored with
    ``cv=5`` on the training data and the mean and twice the standard
    deviation of the scores are printed. It is then fitted on the full
    training data, and its predictions for ``my_test_data`` are handed to
    ``save_data`` together with the name
    ``gradient_boosting_<estimators>.csv``.
    """
    booster = GradientBoostingClassifier(n_estimators=estimators)
    cv_scores = cross_validation.cross_val_score(
        booster, my_train_data, my_train_label, cv=5)
    mean_score = cv_scores.mean()
    spread = cv_scores.std() * 2
    print("gradient boosting(%d) accuracy: %0.3f (+/- %0.3f)" % (estimators, mean_score, spread))

    booster.fit(my_train_data, my_train_label)
    predictions = booster.predict(my_test_data)
    save_data(predictions, "gradient_boosting_%d.csv" % estimators)
项目:sentiment-analysis    作者:saber1988    | 项目源码 | 文件源码
def svc_classify(my_train_data, my_train_label, my_test_data, svc_c):
    """Cross-validate, fit and apply an SVC, saving the test predictions.

    The classifier (``C=svc_c``) is first scored with ``cv=5`` on the
    training data and the mean and twice the standard deviation of the
    scores are printed. It is then fitted on the full training data, and
    its predictions for ``my_test_data`` are handed to ``save_data``
    together with a file name built from ``svc_c`` with one decimal place.
    """
    # Alternative that was tried: svm.SVC(C=svc_c, kernel='poly')
    classifier = svm.SVC(C=svc_c)
    cv_scores = cross_validation.cross_val_score(
        classifier, my_train_data, my_train_label, cv=5)
    mean_score = cv_scores.mean()
    spread = cv_scores.std() * 2
    print("svc(C=%.1f) accuracy: %0.3f (+/- %0.3f)" % (svc_c, mean_score, spread))

    classifier.fit(my_train_data, my_train_label)
    predictions = classifier.predict(my_test_data)
    save_data(predictions, "svc_%.1f.csv" % svc_c)
项目:machine-learning-nanodegree-program-capstone    作者:harrylippy    | 项目源码 | 文件源码
def cross_validate(self):
        clf = self._clf[self._learner]
        (X_train, y_train) = self._train_data

        print " + Cross-validating classifier (learner = %s)..." \
            % self._learner,; stdout.flush()
        scores = cross_val_score(
                        self._clf[self._learner],
                        X_train, y_train,
                        scoring=make_scorer(roc_auc_score),
                        cv=3)
        print "done.\n   * Scores: %r" % scores
项目:Stock-Prediction-Time-Series-Analysis-Python    作者:Nekooeimehr    | 项目源码 | 文件源码
def First_Model_SVR(Scaled_Input_Data, Output_Data):
    """Grid-search an RBF SVR, then score the selected settings with leave-one-out CV.

    A ``GridSearchCV`` over ``C`` and ``gamma`` (``cv=5``) is fitted on
    the data. A fresh ``SVR`` using the best ``C`` and ``gamma`` is then
    scored with ``LeaveOneOut`` over all ``n`` samples. The elapsed time
    (measured up to just after the tuned ``SVR`` is constructed) and the
    negated mean score are printed.

    Note: the printed label and the local names say "MSE", but the
    scoring string passed in both places is ``"mean_absolute_error"``.

    Returns:
        Tuple ``(mean_score, grid_search)``: the un-negated mean of the
        leave-one-out scores and the fitted ``GridSearchCV`` object.
    """
    started = time.time()
    n = len(Scaled_Input_Data)

    param_grid = {
        "C": [1e-2, 1e-1, 1e0, 1e1, 1e2],
        "gamma": np.logspace(-4, 2, 6),
    }
    base_svr = SVR(kernel='rbf', gamma=0.1, tol=0.005)
    svr_Tuned = GridSearchCV(base_svr, cv=5, param_grid=param_grid,
                             scoring="mean_absolute_error")
    svr_Tuned.fit(Scaled_Input_Data, Output_Data)

    best = svr_Tuned.best_params_
    tuned_svr = SVR(kernel='rbf', C=best['C'], gamma=best['gamma'], tol=0.01)
    elapsed = time.time() - started
    print('The computational time of Radial based Support Vector Regression for ', n, ' examples is: ', elapsed)

    leave_one_out = cross_validation.LeaveOneOut(n)
    loo_scores = cross_validation.cross_val_score(
        tuned_svr, Scaled_Input_Data, Output_Data,
        cv=leave_one_out, scoring="mean_absolute_error")
    mean_score = np.mean(list(loo_scores))
    # The mean is printed with its sign flipped; the un-negated value is
    # what gets returned.
    print('The average MSE of Radial based Support Vector Regression for ', n, ' examples is: ', (-1*mean_score))
    return (mean_score, svr_Tuned)
项目:Stock-Prediction-Time-Series-Analysis-Python    作者:Nekooeimehr    | 项目源码 | 文件源码
def RF_Model(Scaled_Input_Data, Output_Data):
    """Fit a default random forest regressor and score it with leave-one-out CV.

    The regressor is fitted on the full data and the time taken is
    printed. The same estimator object is then passed to
    ``cross_val_score`` with ``LeaveOneOut`` over all ``n`` samples, and
    the negated mean score is printed.

    Note: the printed label and the local names say "MSE", but the
    scoring string passed is ``"mean_absolute_error"``.

    Returns:
        Tuple ``(mean_score, model)``: the un-negated mean of the
        leave-one-out scores and the regressor fitted on the full data.
    """
    started = time.time()
    n = len(Scaled_Input_Data)

    RFModel = RandomForestRegressor()
    RFModel.fit(Scaled_Input_Data, Output_Data)
    elapsed = time.time() - started
    print('The computational time of Random Forest Regression for ', n, ' examples is: ', elapsed)

    leave_one_out = cross_validation.LeaveOneOut(n)
    loo_scores = cross_validation.cross_val_score(
        RFModel, Scaled_Input_Data, Output_Data,
        cv=leave_one_out, scoring="mean_absolute_error")
    mean_score = np.mean(list(loo_scores))
    # The mean is printed with its sign flipped; the un-negated value is
    # what gets returned.
    print('The average MSE of Random Forest Regression for ', n, ' examples is: ', (-1*mean_score))
    return (mean_score, RFModel)
项目:neural_reaction_fingerprint    作者:jnwei    | 项目源码 | 文件源码
def hyperopt_train_test(params):
    """Build an estimator from ``params`` and return its mean cross-validation score.

    Args:
        params: indexable with at least three entries. The first two are
            cast to ``np.float32`` and the third to ``int`` before being
            passed, together with the module-level ``other_param_dict``,
            to ``rxn_estimator``.

    Returns:
        The mean of ``cross_val_score(clf, X, y, cv=3)``, where ``X`` and
        ``y`` are module-level names.
    """
    # ``np.int`` was only an alias of the builtin ``int`` and is no longer
    # available in current NumPy releases, so the builtin is used directly.
    clf = rxn_estimator(np.float32(params[0]), np.float32(params[1]), int(params[2]), other_param_dict)
    return cross_val_score(clf, X, y, cv=3).mean()