Python sklearn.cross_validation 模块,KFold() 实例源码

我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用sklearn.cross_validation.KFold()

项目:jamespy_py3    作者:jskDr    | 项目源码 | 文件源码
def _cv_r0( method, xM, yV, alpha, n_folds = 5, n_jobs = -1, grid_std = None, graph = True):
    """
    Produce cross-validated predictions for every sample with a linear model.

    method: name of an estimator class looked up on ``linear_model``
        (the original notes mention 'Ridge' and 'Lasso').
    xM, yV: feature matrix and targets handed to ``cross_val_predict``.
    alpha: passed to the estimator constructor as ``alpha``.
    n_folds: number of shuffled K-fold splits.
    n_jobs: forwarded to ``cross_val_predict``.
    grid_std: forwarded to ``jutil.cv_show`` when ``graph`` is true.
    graph: when true, print a caption and call ``jutil.cv_show``.

    Returns the predictions produced by ``cross_val_predict``.
    """
    print(xM.shape, yV.shape)

    estimator_cls = getattr(linear_model, method)
    estimator = estimator_cls(alpha=alpha)
    folds = cross_validation.KFold(xM.shape[0], n_folds=n_folds, shuffle=True)
    predictions = cross_validation.cross_val_predict(
        estimator, xM, yV, cv=folds, n_jobs=n_jobs)

    if not graph:
        return predictions

    print('The prediction output using cross-validation is given by:')
    jutil.cv_show(yV, predictions, grid_std=grid_std)
    return predictions
项目:aueb.twitter.sentiment    作者:nlpaueb    | 项目源码 | 文件源码
def getConfidenceScores(features_train, labels_train, C):
    """
    Compute out-of-fold SVM confidence scores for the training data.

    The rows are split with a 10-fold ``KFold``; for each fold a linear SVM is
    trained on the remaining rows (via ``SVM.train``) and the
    ``decision_function`` values of the held-out rows are collected.

    Returns a numpy array holding the collected values, fold after fold.
    """
    collected = []

    for fit_idx, holdout_idx in KFold(features_train.shape[0], n_folds=10):
        fit_X = features_train[fit_idx]
        holdout_X = features_train[holdout_idx]

        # train on everything outside the current fold
        model = SVM.train(fit_X, labels_train[fit_idx], c=C, k="linear")

        # score the held-out rows and keep the values in fold order
        collected.extend(model.decision_function(holdout_X))

    return np.array(collected)

#save pos scores
项目:pyglmnet    作者:glm-tools    | 项目源码 | 文件源码
def test_cv():
    """Smoke-test GLM together with cross_val_score and GridSearchCV."""
    # XXX: don't use scikit-learn for tests.
    X, y = make_regression()
    n_splits = 5
    folds = KFold(X.shape[0], n_splits)

    estimator = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1)
    # one score per fold is expected
    fold_scores = cross_val_score(estimator, X, y, cv=folds)
    assert_equal(len(fold_scores), n_splits)

    # two separate grids: one over alpha, one over reg_lambda spaced on a
    # natural-log scale between 0.5 and 0.01
    alpha_grid = {'alpha': np.linspace(0.01, 0.99, 2)}
    lambda_grid = {'reg_lambda': np.logspace(np.log(0.5), np.log(0.01),
                                             10, base=np.exp(1))}
    search = GridSearchCV(estimator, [alpha_grid, lambda_grid], cv=folds)
    search.fit(X, y)
项目:AutoML-Challenge    作者:postech-mlg-exbrain    | 项目源码 | 文件源码
def _calculate(self, X, y, categorical, metafeatures, helpers):
    """
    Estimate the 10-fold cross-validated accuracy of an LDA classifier.

    Folds are stratified when ``y`` is one-dimensional or has a single
    column; otherwise a plain ``KFold`` is used and the classifier is wrapped
    in ``OneVsRestClassifier``. ``categorical``, ``metafeatures`` and
    ``helpers`` are not used by this body.

    Returns the summed per-fold accuracy divided by 10, or NaN (after logging
    a warning) when fitting/predicting raises ``LinAlgError`` or
    ``ValueError``.
    """
    import sklearn.lda

    # The target layout does not change between folds, so decide once.
    single_label = len(y.shape) == 1 or y.shape[1] == 1
    if single_label:
        kf = cross_validation.StratifiedKFold(y, n_folds=10)
    else:
        kf = cross_validation.KFold(y.shape[0], n_folds=10)

    accuracy = 0.
    try:
        for train, test in kf:
            lda = sklearn.lda.LDA()
            if not single_label:
                lda = OneVsRestClassifier(lda)
            lda.fit(X[train], y[train])

            predictions = lda.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / 10
    except (LinAlgError, ValueError) as e:
        # The message now matches the value that is actually returned.
        self.logger.warning("LDA failed: %s Returned NaN instead!" % e)
        # np.nan is the spelling that still exists in NumPy 2.x.
        return np.nan
项目:AutoML-Challenge    作者:postech-mlg-exbrain    | 项目源码 | 文件源码
def _calculate(self, X, y, categorical, metafeatures, helpers):
    """
    Estimate the 10-fold cross-validated accuracy of Gaussian naive Bayes.

    Folds are stratified when ``y`` is one-dimensional or has a single
    column; otherwise a plain ``KFold`` is used and the classifier is wrapped
    in ``OneVsRestClassifier``. ``self``, ``categorical``, ``metafeatures``
    and ``helpers`` are not used by this body.

    Returns the summed per-fold accuracy divided by 10.
    """
    import sklearn.naive_bayes

    multi_column = not (len(y.shape) == 1 or y.shape[1] == 1)
    if multi_column:
        folds = cross_validation.KFold(y.shape[0], n_folds=10)
    else:
        folds = cross_validation.StratifiedKFold(y, n_folds=10)

    total = 0.
    for fit_idx, eval_idx in folds:
        model = sklearn.naive_bayes.GaussianNB()
        if multi_column:
            model = OneVsRestClassifier(model)
        model.fit(X[fit_idx], y[fit_idx])

        predicted = model.predict(X[eval_idx])
        total += sklearn.metrics.accuracy_score(predicted, y[eval_idx])
    return total / 10
项目:AutoML-Challenge    作者:postech-mlg-exbrain    | 项目源码 | 文件源码
def _calculate(self, X, y, categorical, metafeatures, helpers):
    """
    Estimate the 10-fold cross-validated accuracy of a decision tree.

    Folds are stratified when ``y`` is one-dimensional or has a single
    column; otherwise a plain ``KFold`` is used and the tree is wrapped in
    ``OneVsRestClassifier``. Every fold gets a tree whose random state comes
    from ``check_random_state(42)``. ``self``, ``categorical``,
    ``metafeatures`` and ``helpers`` are not used by this body.

    Returns the summed per-fold accuracy divided by 10.
    """
    import sklearn.tree

    multi_column = not (len(y.shape) == 1 or y.shape[1] == 1)
    if multi_column:
        folds = cross_validation.KFold(y.shape[0], n_folds=10)
    else:
        folds = cross_validation.StratifiedKFold(y, n_folds=10)

    total = 0.
    for fit_idx, eval_idx in folds:
        rng = check_random_state(42)
        model = sklearn.tree.DecisionTreeClassifier(random_state=rng)
        if multi_column:
            model = OneVsRestClassifier(model)
        model.fit(X[fit_idx], y[fit_idx])

        predicted = model.predict(X[eval_idx])
        total += sklearn.metrics.accuracy_score(predicted, y[eval_idx])
    return total / 10
项目:AutoML-Challenge    作者:postech-mlg-exbrain    | 项目源码 | 文件源码
def _calculate(self, X, y, categorical, metafeatures, helpers):
    """
    Estimate the 10-fold cross-validated accuracy of a depth-1 decision tree.

    The tree uses the entropy criterion, ``max_depth=1`` and
    ``max_features=None``. Folds are stratified when ``y`` is one-dimensional
    or has a single column; otherwise a plain ``KFold`` is used and the tree
    is wrapped in ``OneVsRestClassifier``. ``self``, ``categorical``,
    ``metafeatures`` and ``helpers`` are not used by this body.

    Returns the summed per-fold accuracy divided by 10.
    """
    import sklearn.tree

    multi_column = not (len(y.shape) == 1 or y.shape[1] == 1)
    if multi_column:
        folds = cross_validation.KFold(y.shape[0], n_folds=10)
    else:
        folds = cross_validation.StratifiedKFold(y, n_folds=10)

    total = 0.
    for fit_idx, eval_idx in folds:
        rng = check_random_state(42)
        stump = sklearn.tree.DecisionTreeClassifier(
            criterion="entropy", max_depth=1, random_state=rng,
            min_samples_split=1, min_samples_leaf=1, max_features=None)
        if multi_column:
            stump = OneVsRestClassifier(stump)
        stump.fit(X[fit_idx], y[fit_idx])

        predicted = stump.predict(X[eval_idx])
        total += sklearn.metrics.accuracy_score(predicted, y[eval_idx])
    return total / 10
项目:AutoML-Challenge    作者:postech-mlg-exbrain    | 项目源码 | 文件源码
def _calculate(self, X, y, categorical, metafeatures, helpers):
    """
    Estimate the 10-fold cross-validated accuracy of a depth-1 decision tree
    that considers a single feature per split (``max_features=1``).

    Folds are stratified when ``y`` is one-dimensional or has a single
    column; otherwise a plain ``KFold`` is used and the tree is wrapped in
    ``OneVsRestClassifier``. ``self``, ``categorical``, ``metafeatures`` and
    ``helpers`` are not used by this body.

    Returns the summed per-fold accuracy divided by 10.
    """
    import sklearn.tree

    multi_column = not (len(y.shape) == 1 or y.shape[1] == 1)
    if multi_column:
        folds = cross_validation.KFold(y.shape[0], n_folds=10)
    else:
        folds = cross_validation.StratifiedKFold(y, n_folds=10)

    total = 0.
    for fit_idx, eval_idx in folds:
        rng = check_random_state(42)
        stump = sklearn.tree.DecisionTreeClassifier(
            criterion="entropy", max_depth=1, random_state=rng,
            min_samples_split=1, min_samples_leaf=1, max_features=1)
        if multi_column:
            stump = OneVsRestClassifier(stump)
        stump.fit(X[fit_idx], y[fit_idx])

        predicted = stump.predict(X[eval_idx])
        total += sklearn.metrics.accuracy_score(predicted, y[eval_idx])
    return total / 10
项目:avito-contest    作者:fmilepe    | 项目源码 | 文件源码
def rede_neural(X, y):
    """
    Fit an MLP classifier on each of 3 K-fold training splits and return it.

    ``X`` is passed through ``normalize`` first. The same classifier object
    is re-fitted on every fold's training part; its score on the held-out
    part and the elapsed time in minutes are printed per fold.

    Returns the classifier as left by the last ``fit`` call.
    """
    print("Iniciando treinamento da Rede Neural")

    X_norm = normalize(X)

    mlp_options = dict(hidden_layer_sizes=(100,50), activation='tanh', algorithm='adam', alpha=1e-5,
                       learning_rate='constant', tol=1e-8, learning_rate_init=0.0002,
                       early_stopping=True, validation_fraction=0.2)
    clf = MLPClassifier(**mlp_options)

    for round_no, (fit_idx, eval_idx) in enumerate(KFold(len(y), n_folds=3), 1):
        started = time.time()
        print("Treinamento", round_no)

        # split the normalized data into this fold's training and test parts
        X_fit, y_fit = X_norm[fit_idx], y[fit_idx]
        X_eval, y_eval = X_norm[eval_idx], y[eval_idx]

        clf.fit(X_fit, y_fit)
        fold_score = clf.score(X_eval, y_eval)
        elapsed_minutes = (time.time() - started) / 60.0
        print("score:", fold_score, "(", elapsed_minutes, "minutos )")
    return clf
项目:KAGGLE_CERVICAL_CANCER_2017    作者:ZFTurbo    | 项目源码 | 文件源码
def run_cross_validation_create_models(cnn, nfolds, submission_version):
    """
    Train one model per shuffled K-fold split of the input images.

    The ``*/*.jpg`` files under ``INPUT_PATH`` are split into ``nfolds``
    folds (seeded through ``get_random_state(cnn)``); the files under
    ``INPUT_PATH_ADD`` are passed along to every training run unchanged.
    Each fold is handed to ``train_single_model`` and the returned scores are
    summed; the sum divided by ``nfolds`` is printed as the average loss.
    Nothing is returned.
    """
    from sklearn.cross_validation import KFold
    files = glob.glob(INPUT_PATH + "*/*.jpg")
    additional_files = glob.glob(INPUT_PATH_ADD + "*/*.jpg")
    splitter = KFold(len(files), n_folds=nfolds, shuffle=True,
                     random_state=get_random_state(cnn))
    print('Len of additional files: {}'.format(len(additional_files)))

    total_score = 0
    for fold_no, (train_index, test_index) in enumerate(splitter, start=1):
        print('Start KFold number {} from {}'.format(fold_no, nfolds))
        print('Split train: ', len(train_index))
        print('Split valid: ', len(test_index))

        total_score += train_single_model(cnn, fold_no, train_index, test_index,
                                          files, additional_files, submission_version)

    print('Avg loss: {}'.format(total_score/nfolds))
项目:assignments    作者:iit-cs579    | 项目源码 | 文件源码
def cross_validation_accuracy(clf, X, labels, k):
    """
    Compute the average testing accuracy over k folds of cross-validation. You
    can use sklearn's KFold class here (no random seed, and no shuffling
    needed).

    NOTE: this is an unimplemented assignment stub; as written the body does
    nothing and the function returns None.

    Params:
      clf......A LogisticRegression classifier.
      X........A csr_matrix of features.
      labels...The true labels for each instance in X
      k........The number of cross-validation folds.

    Returns:
      The average testing accuracy of the classifier
      over each fold of cross-validation.
    """
    # TODO: implement the k-fold loop described in the docstring.
    pass
项目:100knock2016    作者:tmu-nlp    | 项目源码 | 文件源码
def make_kfold(target, feature):
    """
    Run shuffled K-fold cross-validation and collect per-fold predictions.

    Relies on the module-level ``folds`` (number of folds), ``word_vec``
    (a vectorizer fed with count dictionaries) and ``logreg`` (the
    classifier).

    target: labels, indexed by sample position.
    feature: per-sample token sequences, indexed by sample position; each is
        turned into a count dictionary with ``Counter``.

    Returns ``(preds, test_numbers)``: one prediction array and one array of
    test indices per fold, in fold order.
    """
    preds = []
    test_numbers = []
    for trains, tests in KFold(len(target), n_folds=folds, shuffle=True):
        test_numbers.append(tests)

        train_features = word_vec.fit_transform(
            [dict(Counter(feature[i])) for i in trains])
        train_targets = [target[i] for i in trains]
        logreg.fit(train_features, train_targets)

        # Use transform (not fit_transform) on the held-out samples so the
        # vectorizer keeps the vocabulary learned from the training fold.
        # The original padded every test dictionary with all known feature
        # names and re-fitted to get the same columns.
        # NOTE(review): assumes word_vec is a DictVectorizer-like object whose
        # transform() ignores keys unseen during fitting - confirm.
        test_features = word_vec.transform(
            [dict(Counter(feature[i])) for i in tests])
        preds.append(logreg.predict(test_features))
    return preds, test_numbers
项目:100knock2016    作者:tmu-nlp    | 项目源码 | 文件源码
def make_kfold(target, feature):
    """
    Run shuffled K-fold cross-validation and collect per-fold predictions.

    Relies on the module-level ``folds`` (number of folds), ``word_vec``
    (a vectorizer fed with count dictionaries) and ``logreg`` (the
    classifier).

    target: labels, indexed by sample position.
    feature: per-sample token sequences, indexed by sample position; each is
        turned into a count dictionary with ``Counter``.

    Returns ``(preds, test_numbers)``: one prediction array and one array of
    test indices per fold, in fold order.
    """
    preds = []
    test_numbers = []
    for trains, tests in KFold(len(target), n_folds=folds, shuffle=True):
        test_numbers.append(tests)

        train_features = word_vec.fit_transform(
            [dict(Counter(feature[i])) for i in trains])
        train_targets = [target[i] for i in trains]
        logreg.fit(train_features, train_targets)

        # Use transform (not fit_transform) on the held-out samples so the
        # vectorizer keeps the vocabulary learned from the training fold.
        # The original padded every test dictionary with all known feature
        # names and re-fitted to get the same columns.
        # NOTE(review): assumes word_vec is a DictVectorizer-like object whose
        # transform() ignores keys unseen during fitting - confirm.
        test_features = word_vec.transform(
            [dict(Counter(feature[i])) for i in tests])
        preds.append(logreg.predict(test_features))
    return preds, test_numbers
项目:100knock2016    作者:tmu-nlp    | 项目源码 | 文件源码
def eval_cv5(model, x, y):
    """
    Evaluate ``model`` with 5-fold cross-validation.

    For every fold the model is fitted on the training part and its
    predictions for the test part are passed to ``get_eval``; the first four
    entries of each result are accumulated separately.

    Returns a 4-tuple with the means of those four accumulated series (the
    original local names were acc, pre, rec and f1, in that order).
    """
    # one growing array per metric slot returned by get_eval
    series = [np.array([]) for _ in range(4)]

    for fit_idx, eval_idx in KFold(len(y), n_folds=5):
        model.fit(x[fit_idx], y[fit_idx])
        predicted = model.predict(x[eval_idx])
        evaluation = get_eval(predicted, y[eval_idx])
        for slot in range(4):
            series[slot] = np.append(series[slot], np.array(evaluation[slot]))

    return tuple(values.mean() for values in series)
项目:ottertune    作者:cmu-db    | 项目源码 | 文件源码
def __init__(self, estimator_cls, parameter_grid, score_fns,
             nfolds=10, shuffle=False, seed=None, njobs=1,
             checkpoint_path=None):
    """
    Store the search configuration and build the K-fold splitter.

    estimator_cls, parameter_grid, score_fns, nfolds, seed, njobs and
    checkpoint_path are kept as attributes of the same name. ``score_fns``
    must satisfy ``_is_arraylike`` and ``njobs`` must be 1 (both enforced with
    asserts). ``shuffle`` and ``seed`` are forwarded to ``KFold``.
    ``grid_scores`` starts out as None.
    """
    self.estimator_cls = estimator_cls
    self.parameter_grid = parameter_grid
    self.nfolds = nfolds
    self.seed = seed

    # only single-job execution is accepted
    assert njobs == 1, "# jobs > 1 not supported."
    self.njobs = njobs

    assert _is_arraylike(score_fns)
    self.score_fns = score_fns

    self.checkpoint_path = checkpoint_path
    self.grid_scores = None

    splitter_options = dict(n_folds=self.nfolds, shuffle=shuffle, random_state=seed)
    self.kf = KFold(**splitter_options)
项目:FLASH    作者:yuyuz    | 项目源码 | 文件源码
def cached_run(steps, X, y):
    """
    Score a pipeline with K-fold cross-validation, step by step.

    The data is split with ``KFold`` using the module-level ``_n_fold`` and
    ``_random_state``. Every step except the last is applied to the folded
    data through ``run_step_on_demand``, keyed by a path-like identifier
    built from ``_step_identifier``. The last step is treated as the
    estimator: it is fitted and scored on each fold.

    Returns the mean of the per-fold scores.
    """
    # split data
    splitter = KFold(len(y), _n_fold, random_state=_random_state)
    folded_data = []
    for train_index, test_index in splitter:
        folded_data.append(
            (X[train_index], y[train_index], X[test_index], y[test_index]))

    # every step but the last transforms the folded data
    step_identifier = ''
    for step in steps[:-1]:
        step_identifier = step_identifier + "/%s" % _step_identifier(step)
        logger.info("Processing %s", step_identifier)
        folded_data = run_step_on_demand(step_identifier, step, folded_data)

    # the last step is the estimator and is handled separately
    estimator = steps[-1]
    step_identifier = step_identifier + "/%s" % _step_identifier(estimator)

    scores = []
    for fold in folded_data:
        X_train, y_train, X_test, y_test = fold
        estimator.fit(X_train, y_train)
        scores.append(estimator.score(X_test, y_test))

    score = np.mean(scores)
    logger.info("score of %s is %r", step_identifier, score)
    return score
项目:nba-games    作者:ixarchakos    | 项目源码 | 文件源码
def k_fold_sample_data_set(x, y, folds):
    """
    Split a data set into shuffled K-fold training and test samples.

    :param x: numpy array
        - The samples
    :param y: numpy array
        - The actual value of each sample
    :param folds: integer
        - The number of folds
    :return: four lists (x_train, y_train, x_test, y_test)
        - Entry i of each list holds the corresponding slice for fold i

    If an AttributeError is raised while splitting, a hint is printed and
    ``exit()`` is called.
    """
    x_train_list, y_train_list = [], []
    x_test_list, y_test_list = [], []
    try:
        splitter = KFold(x.shape[0], n_folds=folds, shuffle=True)
        for fit_rows, held_out_rows in splitter:
            x_train_list.append(x[fit_rows])
            y_train_list.append(y[fit_rows])
            x_test_list.append(x[held_out_rows])
            y_test_list.append(y[held_out_rows])
    except AttributeError as e:
        print(e.args, "- Please, use numpy arrays as inputs")
        exit()
    else:
        return x_train_list, y_train_list, x_test_list, y_test_list
项目:python-rustlearn    作者:maciejkula    | 项目源码 | 文件源码
def run_example():
    """
    Return the 5-fold cross-validated accuracy of an SGDClassifier.

    Data comes from ``_get_data()``. A fresh classifier is fitted on each
    shuffled fold's training part; the per-fold accuracies are summed and
    divided by the number of folds.
    """
    data, target = _get_data()

    n_folds = 5
    splitter = KFold(n=len(data), n_folds=n_folds, shuffle=True)

    accuracy = 0.0
    for train_idx, test_idx in splitter:
        model = SGDClassifier()
        model.fit(data[train_idx], target[train_idx])

        predictions = model.predict(data[test_idx])
        accuracy += accuracy_score(predictions, target[test_idx])

    return accuracy / n_folds
项目:python-rustlearn    作者:maciejkula    | 项目源码 | 文件源码
def run_example():
    """
    Return the 5-fold cross-validated accuracy of an SGDClassifier.

    Data comes from ``_get_data()``. A fresh classifier is fitted on each
    shuffled fold's training part; the per-fold accuracies are summed and
    divided by the number of folds.
    """
    data, target = _get_data()

    n_folds = 5
    splitter = KFold(n=len(data), n_folds=n_folds, shuffle=True)

    accuracy = 0.0
    for train_idx, test_idx in splitter:
        model = SGDClassifier()
        model.fit(data[train_idx], target[train_idx])

        predictions = model.predict(data[test_idx])
        accuracy += accuracy_score(predictions, target[test_idx])

    return accuracy / n_folds
项目:ilastik-feature-selection    作者:ilastik    | 项目源码 | 文件源码
def kfold_train_and_predict(X, Y, classifier, k = 5, indices = None, features = None):
    """
    Cross-validate ``classifier`` on a subset of rows and columns of ``X``.

    indices: row positions to use; defaults to every row of ``X``.
    features: column positions to use; defaults to every column of ``X``.
    k: number of folds; the split is made over ``indices``.

    Returns ``(mean, std)`` of the per-fold ``classifier.score`` values.
    """
    if indices is None:
        indices = np.array(list(range(X.shape[0])))
    if features is None:
        features = np.array(list(range(X.shape[1])))

    fold_scores = []
    for train, test in cross_validation.KFold(len(indices), n_folds=k):
        # fold positions refer to ``indices``; map them back to rows of X
        fit_rows = indices[train].astype("int")
        eval_rows = indices[test].astype("int")

        classifier.fit(X[fit_rows, :][:, features], Y[fit_rows])
        fold_scores.append(
            classifier.score(X[eval_rows, :][:, features], Y[eval_rows]))

    fold_scores = np.array(fold_scores)
    return np.mean(fold_scores), np.std(fold_scores)
项目:The_Ultimate_Student_Hunt    作者:analyticsvidhya    | 项目源码 | 文件源码
def run_model(model,dtrain,predictor_var,target,scoring_method='mean_squared_error'):
    """
    Cross-validate ``model`` on ``dtrain`` and refit it on the pre-2000 rows.

    A 5-fold ``cross_val_score`` is computed over the ``predictor_var``
    columns against the ``target`` column. The model is then fitted on the
    rows whose 'Year' is below 2000 and used to predict the rows whose 'Year'
    is above 1999. Neither the scores nor the predictions are returned or
    printed here; the visible effect is that ``model`` ends up fitted.
    """
    folds = KFold(len(dtrain), 5)
    cv_scores = cross_val_score(model, dtrain[predictor_var], dtrain[target],
                                cv=folds, scoring=scoring_method)

    # time-based hold-out: fit on years before 2000, predict the later ones
    fit_rows = dtrain[dtrain['Year'] < 2000]
    holdout_rows = dtrain[dtrain['Year'] > 1999]

    fit_inputs = fit_rows[predictor_var]
    holdout_inputs = holdout_rows[predictor_var]
    model.fit(fit_inputs, fit_rows[target])
    pred_for_val = model.predict(holdout_inputs)
项目:South-African-Heart-Disease-data-analysis-using-python    作者:khushi4tiwari    | 项目源码 | 文件源码
def getTestAndTrainingSet(X,y,K=10):
    """
    Return the training and test split of the last fold of a shuffled K-fold.

    X: 2-D array of samples (indexed as ``X[rows, :]``).
    y: 2-D array of targets (indexed as ``y[rows, :]``).
    K: number of folds.

    Returns ``((X_train, y_train), (X_test, y_test))`` for the K-th fold.
    """
    N = len(X)
    CV = cross_validation.KFold(N, K, shuffle=True)

    # Only the final fold is ever returned, so take its index arrays directly
    # instead of slicing X and y for every earlier fold and discarding them.
    train_index, test_index = list(CV)[-1]

    X_train = X[train_index,:]
    y_train = y[train_index,:]
    X_test = X[test_index,:]
    y_test = y[test_index,:]
    return (X_train,y_train),(X_test,y_test)
项目:100knock2017    作者:tmu-nlp    | 项目源码 | 文件源码
def cv(feature_dict, feature, polarity, folds):
    """
    Cross-validate a logistic-regression polarity classifier.

    feature_dict: feature vocabulary; only its length is used, as the
        expected width of every feature row.
    feature: per-sample sparse feature rows, indexed by sample position.
    polarity: per-sample labels, indexed by sample position.
    folds: number of K-fold splits.

    Returns ``(accuracy, precision, recall, f1)``, each being the SUM of the
    per-fold scores (they are not divided by the number of folds here).
    """
    kfold = KFold(len(polarity), n_folds = folds)
    f1, recall, precision, accuracy = 0, 0, 0, 0
    expected_width = len(feature_dict)
    for count, (train, test) in enumerate(kfold, 1):
        LR = LogisticRegression()
        x = [feature[i] for i in train]
        y = [polarity[i] for i in train]
        LR.fit(scipy.sparse.vstack(x), y)

        answer_label = [polarity[j] for j in test]
        test_label = []
        for j in test:
            query = feature[j]
            if query.shape[1] != expected_width:
                # The original assigned the sentinel -1 and then subscripted
                # it, which raised TypeError; record the sentinel directly.
                test_label.append(-1)
            else:
                test_label.append(int(predict(LR, query)[0]))

        accuracy += accuracy_score(answer_label, test_label)
        precision += precision_score(answer_label, test_label)
        recall += recall_score(answer_label, test_label)
        f1 += f1_score(answer_label, test_label)
        print('{}_fold finished.'.format(count))
    return accuracy, precision, recall, f1
项目:100knock2017    作者:tmu-nlp    | 项目源码 | 文件源码
def cv(feature_dict, feature, polarity, folds):
    """
    Compute a precision-recall curve from the first cross-validation fold.

    feature_dict: feature vocabulary; only its length is used, as the
        expected width of every feature row.
    feature: per-sample sparse feature rows, indexed by sample position.
    polarity: per-sample labels, indexed by sample position.
    folds: number of K-fold splits.

    Returns ``(pre, rec, thr)`` from ``precision_recall_curve`` as soon as
    the first fold has been scored. The trailing 4-tuple of zeros is only
    reached if the splitter yields no fold at all.
    """
    kfold = KFold(len(polarity), n_folds = folds)
    f1, recall, precision, accuracy = 0, 0, 0, 0
    expected_width = len(feature_dict)
    for train, test in kfold:
        LR = LogisticRegression()
        x = [feature[i] for i in train]
        y = [polarity[i] for i in train]
        LR.fit(scipy.sparse.vstack(x), y)

        answer_label = [polarity[j] for j in test]
        test_label = []
        for j in test:
            query = feature[j]
            if query.shape[1] != expected_width:
                # The original assigned the sentinel -1 and then subscripted
                # it, which raised TypeError; record the sentinel directly as
                # the score of a row that cannot be classified.
                test_label.append(-1)
            else:
                test_label.append(predict(LR, query)[1][1])
        pre, rec, thr = precision_recall_curve(answer_label, test_label)
        return pre, rec, thr
    return accuracy, precision, recall, f1
项目:hco-experiments    作者:zooniverse    | 项目源码 | 文件源码
def cross_validate_Softmax(dataFile, X, Y, pooledFile, imageDim, sgd, save=True, n_folds=5):
    """
    Pick the regularisation value for the Softmax classifier by K-fold CV.

    For every value in a fixed grid, ``train_Softmax`` is run on each fold
    and the returned figures of merit (FoM) are averaged. The grid value with
    the smallest mean FoM is printed and returned. ``save`` is not used by
    this body.
    """
    from sklearn.cross_validation import KFold

    n_examples = len(np.squeeze(Y))
    CGrid = [0.1, 0.03, 0.01, 0.003, 0.001, 3e-4, 1e-4, 3e-5, 1e-5]
    splitter = KFold(n_examples, n_folds=n_folds)

    mean_FoMs = []
    for C in CGrid:
        fold_FoMs = []
        for fold, (train, test) in enumerate(splitter, start=1):
            print("[+] training Softmax: LAMBDA : %e, fold : %d" % (C, fold))
            FoM, _threshold = train_Softmax(
                C, dataFile, X[train], Y[train], X[test], Y[test],
                pooledFile, imageDim, sgd, prefix="cv/cv_fold%d" % fold)
            fold_FoMs.append(FoM)
        mean_FoMs.append(np.mean(fold_FoMs))

    best_C = CGrid[np.argmin(mean_FoMs)]
    print("[+] Best performing classifier: C : %lf" % best_C)
    return best_C
项目:hco-experiments    作者:zooniverse    | 项目源码 | 文件源码
def cross_validate_SoftMaxOnline(dataFile, X, Y, pooledFile, imageDim, sgd, save=True, n_folds=5):
    """
    Pick the regularisation value for the online SoftMax classifier by K-fold CV.

    For every value in a fixed grid, ``train_SoftMaxOnline`` is run on each
    fold and the returned figures of merit (FoM) are averaged. The grid value
    with the smallest mean FoM is printed and returned. ``save`` is not used
    by this body.
    """
    from sklearn.cross_validation import KFold

    n_examples = len(np.squeeze(Y))
    CGrid = [10, 3, 1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001]
    splitter = KFold(n_examples, n_folds=n_folds, indices=False)

    mean_FoMs = []
    for C in CGrid:
        fold_FoMs = []
        for fold, (train, test) in enumerate(splitter, start=1):
            print("[+] training SoftMaxOnline: LAMBDA : %e, fold : %d" % (C, fold))
            FoM, _threshold = train_SoftMaxOnline(
                C, dataFile, X[train], Y[train], X[test], Y[test],
                pooledFile, imageDim, sgd, prefix="cv/cv_fold%d" % fold)
            fold_FoMs.append(FoM)
        mean_FoMs.append(np.mean(fold_FoMs))

    best_C = CGrid[np.argmin(mean_FoMs)]
    print("[+] Best performing classifier: C : %lf" % best_C)
    return best_C
项目:hco-experiments    作者:zooniverse    | 项目源码 | 文件源码
def cross_validate_linearSVM(dataFile, X, Y, pooledFile, imageDim, sgd, save=True, n_folds=5):
    """
    Pick the C value for the linear SVM by K-fold cross-validation.

    For every value in a fixed grid, ``train_linearSVM`` is run on each fold
    and the returned figures of merit (FoM) are averaged. The grid value with
    the smallest mean FoM is printed and returned. ``save`` is not used by
    this body.
    """
    from sklearn.cross_validation import KFold

    n_examples = len(np.squeeze(Y))
    CGrid = [10, 3, 1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001]
    splitter = KFold(n_examples, n_folds=n_folds, indices=False)

    mean_FoMs = []
    for C in CGrid:
        fold_FoMs = []
        for fold, (train, test) in enumerate(splitter, start=1):
            print("[+] training linear SVM: C : %e, fold : %d" % (C, fold))
            FoM, _threshold = train_linearSVM(
                C, dataFile, X[train], Y[train], X[test], Y[test],
                pooledFile, imageDim, sgd, prefix="cv/cv_fold%d" % fold)
            fold_FoMs.append(FoM)
        mean_FoMs.append(np.mean(fold_FoMs))

    best_C = CGrid[np.argmin(mean_FoMs)]
    print("[+] Best performing classifier: C : %lf" % best_C)
    return best_C
项目:hco-experiments    作者:zooniverse    | 项目源码 | 文件源码
def cross_validate_Softmax(dataFile, X, Y, pooledFile, imageDim, sgd, save=True, n_folds=5):
    """
    Pick the regularisation value for the Softmax classifier by K-fold CV.

    For every value in a fixed grid, ``train_Softmax`` is run on each fold
    and the returned figures of merit (FoM) are averaged. The grid value with
    the smallest mean FoM is printed and returned. ``save`` is not used by
    this body.
    """
    from sklearn.cross_validation import KFold

    m = len(np.squeeze(Y))
    CGrid = [0.1, 0.03, 0.01, 0.003, 0.001, 3e-4, 1e-4, 3e-5, 1e-5]
    kf = KFold(m, n_folds=n_folds, indices=False)
    mean_FoMs = []
    for C in CGrid:
        FoMs = []
        for fold, (train, test) in enumerate(kf, start=1):
            # print() with a single pre-formatted string behaves the same on
            # Python 2 and 3; the print statement form fails to parse on 3.
            print("[+] training Softmax: LAMBDA : %e, fold : %d" % (C, fold))
            prefix = "cv/cv_fold%d" % fold
            FoM, threshold = train_Softmax(C, dataFile, X[train], Y[train], X[test], Y[test],
                                           pooledFile, imageDim, sgd, prefix=prefix)
            FoMs.append(FoM)
        mean_FoMs.append(np.mean(FoMs))

    best_FoM_index = np.argmin(mean_FoMs)
    print("[+] Best performing classifier: C : %lf" % CGrid[best_FoM_index])
    return CGrid[best_FoM_index]
项目:hco-experiments    作者:zooniverse    | 项目源码 | 文件源码
def cross_validate_linearSVM(dataFile, X, Y, pooledFile, imageDim, sgd, save=True, n_folds=5):
    """
    Pick the C value for the linear SVM by K-fold cross-validation.

    For every value in a fixed grid, ``train_linearSVM`` is run on each fold
    and the returned figures of merit (FoM) are averaged. The grid value with
    the smallest mean FoM is printed and returned. ``save`` is not used by
    this body.
    """
    from sklearn.cross_validation import KFold

    m = len(np.squeeze(Y))
    CGrid = [10, 3, 1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001]
    kf = KFold(m, n_folds=n_folds, indices=False)
    mean_FoMs = []
    for C in CGrid:
        FoMs = []
        for fold, (train, test) in enumerate(kf, start=1):
            # print() with a single pre-formatted string behaves the same on
            # Python 2 and 3; the print statement form fails to parse on 3.
            print("[+] training linear SVM: C : %e, fold : %d" % (C, fold))
            prefix = "cv/cv_fold%d" % fold
            FoM, threshold = train_linearSVM(C, dataFile, X[train], Y[train], X[test], Y[test],
                                             pooledFile, imageDim, sgd, prefix=prefix)
            FoMs.append(FoM)
        mean_FoMs.append(np.mean(FoMs))

    best_FoM_index = np.argmin(mean_FoMs)
    print("[+] Best performing classifier: C : %lf" % CGrid[best_FoM_index])
    return CGrid[best_FoM_index]
项目:ML    作者:saurabhsuman47    | 项目源码 | 文件源码
def knn_cv(post_features, post_class, n_folds, n_neighbors, length_dataset = -1):
    """
    Cross-validate a k-nearest-neighbours classifier with shuffled K-fold.

    post_features, post_class: samples and labels, indexed by fold indices.
    n_folds: number of folds.
    n_neighbors: passed to ``KNeighborsClassifier``.
    length_dataset: number of samples to split; -1 means ``len(post_class)``.

    Returns ``(mean training accuracy, mean test accuracy)`` over the folds.
    """
    if length_dataset == -1:
        length_dataset = len(post_class)

    folds = KFold(n=length_dataset, n_folds=n_folds, shuffle=True)
    train_accuracy, test_accuracy = [], []

    for fit_idx, eval_idx in folds:
        knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
        X_fit, y_fit = post_features[fit_idx], post_class[fit_idx]
        knn.fit(X_fit, y_fit)
        train_accuracy.append(knn.score(X_fit, y_fit))
        test_accuracy.append(knn.score(post_features[eval_idx], post_class[eval_idx]))

    return np.mean(train_accuracy), np.mean(test_accuracy)
项目:jamespy_py3    作者:jskDr    | 项目源码 | 文件源码
def __init__( self, pdr, E_QC = "E_QC", Em = "Em", type_name = "Type", type_l = None,
                    disp = False, graph = False):
    """
    Split a data frame by type and keep per-type column matrices.

    pdr: data frame holding the columns named by ``E_QC``, ``Em`` and
        ``type_name``.
    E_QC, Em: names of the input and output columns.
    type_name: name of the column holding the type id of each row.
    type_l: type ids to extract; defaults to ``[1, 2, 3, 4]``.
    disp, graph: stored on the instance (the original comment says they are
        used by run()).

    For every type id, ``self.xMa[type_id]`` and ``self.yVa[type_id]`` hold
    the matching ``E_QC`` / ``Em`` values as column matrices.
    """
    # A fresh list per call: the previous list literal in the signature was
    # shared by every instance through self.type_l.
    if type_l is None:
        type_l = [1, 2, 3, 4]

    # This parameter will be used in the run() function.
    self.type_l = type_l
    self.disp = disp
    self.graph = graph

    self.xMa = {}
    self.yVa = {}
    for type_id in type_l:
        pdr_new = pdr[ pdr[ type_name] == type_id]
        # np.asmatrix is what np.mat aliased; np.mat is gone in NumPy 2.x.
        self.xMa[type_id] = np.asmatrix( pdr_new[ E_QC].values).T
        self.yVa[type_id] = np.asmatrix( pdr_new[ Em].values).T
项目:jamespy_py3    作者:jskDr    | 项目源码 | 文件源码
def _gs_SVC_r0( xM, yVc, params):
    """
    Grid-search an SVC over ``params`` with shuffled 5-fold cross-validation.

    yVc holds class labels (discrete values), as opposed to the
    floating-point targets used by the regression helpers.

    Returns the fitted ``GridSearchCV`` object.
    """
    print(xM.shape, yVc.shape)

    folds = cross_validation.KFold(xM.shape[0], n_folds=5, shuffle=True)
    search = grid_search.GridSearchCV(svm.SVC(), params, cv=folds, n_jobs=-1)
    search.fit(xM, yVc)

    return search
项目:jamespy_py3    作者:jskDr    | 项目源码 | 文件源码
def gs_SVC( xM, yVc, params, n_folds = 5):
    """
    Grid-search an SVC over ``params`` with shuffled K-fold cross-validation.

    yVc holds class labels (discrete values), as opposed to the
    floating-point targets used by the regression helpers.
    n_folds: number of folds.

    Returns the fitted ``GridSearchCV`` object.
    """
    print(xM.shape, yVc.shape)

    folds = cross_validation.KFold(xM.shape[0], n_folds=n_folds, shuffle=True)
    search = grid_search.GridSearchCV(svm.SVC(), params, cv=folds, n_jobs=-1)
    search.fit(xM, yVc)

    return search
项目:jamespy_py3    作者:jskDr    | 项目源码 | 文件源码
def gs_Ridge_BIKE( A_list, yV, XX = None, alphas_log = (1, -1, 9), n_folds = 5, n_jobs = -1):
    """
    Grid-search ``alpha`` for a BIKE ridge model with shuffled K-fold CV.

    A_list: list of A matrices (described in the original as similarity
        matrices); the row count of the first one is the number of molecules.
    XX: concatenated linear descriptors, or None when unused.
    alphas_log: arguments unpacked into ``np.logspace`` to build the grid.
    n_folds, n_jobs: fold count and parallelism of the search.

    Returns the fitted ``GridSearchCV`` object (scored with 'r2').
    """
    model = binary_model.BIKE_Ridge(A_list, XX)
    alpha_grid = {'alpha': np.logspace(*alphas_log)}
    n_molecules = A_list[0].shape[0]

    folds = cross_validation.KFold(n_molecules, n_folds=n_folds, shuffle=True)
    search = grid_search.GridSearchCV(
        model, alpha_grid, scoring='r2', cv=folds, n_jobs=n_jobs)

    # the search is fitted on a single column of row indices, not on features
    row_index_column = np.arange(n_molecules).reshape(-1, 1)
    search.fit(row_index_column, yV)

    return search
项目:jamespy_py3    作者:jskDr    | 项目源码 | 文件源码
def cv( method, xM, yV, alpha, n_folds = 5, n_jobs = -1, grid_std = None, graph = True):
    """
    Produce cross-validated predictions for every sample with a linear model.

    method: name of an estimator class looked up on ``linear_model``
        (the original notes mention 'Ridge' and 'Lasso').
    xM, yV: feature matrix and targets handed to ``cross_val_predict``.
    alpha: passed to the estimator constructor as ``alpha``.
    n_folds: number of shuffled K-fold splits.
    n_jobs: forwarded to ``cross_val_predict``.
    grid_std: forwarded to ``jutil.cv_show`` when ``graph`` is true.
    graph: when true, print a caption and call ``jutil.cv_show``.

    Returns the predictions produced by ``cross_val_predict``.
    """
    print(xM.shape, yV.shape)

    estimator = getattr(linear_model, method)(alpha=alpha)
    n_samples = xM.shape[0]
    folds = cross_validation.KFold(n_samples, n_folds=n_folds, shuffle=True)
    predictions = cross_validation.cross_val_predict(
        estimator, xM, yV, cv=folds, n_jobs=n_jobs)

    if graph:
        caption = 'The prediction output using cross-validation is given by:'
        print(caption)
        jutil.cv_show(yV, predictions, grid_std=grid_std)

    return predictions
项目:jamespy_py3    作者:jskDr    | 项目源码 | 文件源码
def cv_Ridge_BIKE( A_list, yV, XX = None, alpha = 0.5, n_folds = 5, n_jobs = -1, grid_std = None):
    """
    Cross-validated predictions of a BIKE ridge model, followed by a plot.

    A_list: list of A matrices; the row count of the first one is the number
        of molecules.
    XX: passed to ``BIKE_Ridge`` as its second argument.
    alpha: passed to ``BIKE_Ridge`` as ``alpha``.
    n_folds, n_jobs: fold count and parallelism of ``cross_val_predict``.
    grid_std: forwarded to ``jutil.cv_show``.

    Returns the predictions produced by ``cross_val_predict``.
    """
    model = binary_model.BIKE_Ridge(A_list, XX, alpha=alpha)
    n_molecules = A_list[0].shape[0]
    folds = cross_validation.KFold(n_molecules, n_folds=n_folds, shuffle=True)

    # the estimator is fed a single column of row indices, not features
    row_index_column = np.arange(n_molecules).reshape(-1, 1)
    predictions = cross_validation.cross_val_predict(
        model, row_index_column, yV, cv=folds, n_jobs=n_jobs)

    print('The prediction output using cross-validation is given by:')
    jutil.cv_show(yV, predictions, grid_std=grid_std)

    return predictions
项目:jamespy_py3    作者:jskDr    | 项目源码 | 文件源码
def _gs_SVC_r0( xM, yVc, params):
    """
    Grid-search an SVC over ``params`` with shuffled 5-fold cross-validation.

    yVc holds class labels (discrete values), as opposed to the
    floating-point targets used by the regression helpers.

    Returns the fitted ``GridSearchCV`` object.
    """
    print(xM.shape, yVc.shape)

    folds = cross_validation.KFold(xM.shape[0], n_folds=5, shuffle=True)
    search = grid_search.GridSearchCV(svm.SVC(), params, cv=folds, n_jobs=-1)
    search.fit(xM, yVc)

    return search
项目:jamespy_py3    作者:jskDr    | 项目源码 | 文件源码
def gs_Ridge_BIKE( A_list, yV, XX = None, alphas_log = (1, -1, 9), n_folds = 5, n_jobs = -1):
    """
    Grid-search ``alpha`` for a BIKE ridge model with shuffled K-fold CV.

    A_list: list of A matrices (described in the original as similarity
        matrices); the row count of the first one is the number of molecules.
    XX: concatenated linear descriptors, or None when unused.
    alphas_log: arguments unpacked into ``np.logspace`` to build the grid.
    n_folds, n_jobs: fold count and parallelism of the search.

    Returns the fitted ``GridSearchCV`` object (scored with 'r2').
    """
    model = binary_model.BIKE_Ridge(A_list, XX)
    alpha_grid = {'alpha': np.logspace(*alphas_log)}
    n_molecules = A_list[0].shape[0]

    folds = cross_validation.KFold(n_molecules, n_folds=n_folds, shuffle=True)
    search = grid_search.GridSearchCV(
        model, alpha_grid, scoring='r2', cv=folds, n_jobs=n_jobs)

    # the search is fitted on a single column of row indices, not on features
    row_index_column = np.arange(n_molecules).reshape(-1, 1)
    search.fit(row_index_column, yV)

    return search
项目:jamespy_py3    作者:jskDr    | 项目源码 | 文件源码
def gs_BIKE_Ridge( A_list, yV, alphas_log = (1, -1, 9), X_concat = None, n_folds = 5, n_jobs = -1):
    """
    Grid-search ``alpha`` for a BIKE ridge model with shuffled K-fold CV.

    A_list: list of A matrices (described in the original as similarity
        matrices); the row count of the first one is the number of molecules.
    alphas_log: arguments unpacked into ``np.logspace`` to build the grid.
    X_concat: concatenated linear descriptors, or None when unused.
    n_folds, n_jobs: fold count and parallelism of the search.

    Returns the fitted ``GridSearchCV`` object (scored with 'r2').
    """
    model = binary_model.BIKE_Ridge(A_list, X_concat)
    alpha_grid = {'alpha': np.logspace(*alphas_log)}
    n_molecules = A_list[0].shape[0]

    folds = cross_validation.KFold(n_molecules, n_folds=n_folds, shuffle=True)
    search = grid_search.GridSearchCV(
        model, alpha_grid, scoring='r2', cv=folds, n_jobs=n_jobs)

    # the search is fitted on a single column of row indices, not on features
    row_index_column = np.arange(n_molecules).reshape(-1, 1)
    search.fit(row_index_column, yV)

    return search
项目:jamespy_py3    作者:jskDr    | 项目源码 | 文件源码
def cv( method, xM, yV, alpha, n_folds = 5, n_jobs = -1, grid_std = None, graph = True, shuffle = True):
    """
    Produce cross-validated predictions for every sample with a linear model.

    method: name of an estimator class looked up on ``linear_model``
        (the original notes mention 'Ridge' and 'Lasso').
    xM, yV: feature matrix and targets handed to ``cross_val_predict``.
    alpha: passed to the estimator constructor as ``alpha``.
    n_folds: number of K-fold splits.
    n_jobs: forwarded to ``cross_val_predict``.
    grid_std: forwarded to ``jutil.cv_show`` when ``graph`` is true.
    graph: when true, print a caption and call ``jutil.cv_show``.
    shuffle: forwarded to ``KFold``.

    Returns the predictions produced by ``cross_val_predict``.
    """
    print(xM.shape, yV.shape)

    estimator = getattr(linear_model, method)(alpha=alpha)
    n_samples = xM.shape[0]
    folds = cross_validation.KFold(n_samples, n_folds=n_folds, shuffle=shuffle)
    predictions = cross_validation.cross_val_predict(
        estimator, xM, yV, cv=folds, n_jobs=n_jobs)

    if graph:
        caption = 'The prediction output using cross-validation is given by:'
        print(caption)
        jutil.cv_show(yV, predictions, grid_std=grid_std)

    return predictions
项目:jamespy_py3    作者:jskDr    | 项目源码 | 文件源码
def _cv_LOO_r0(method, xM, yV, alpha, n_jobs=-1, grid_std=None, graph=True):
    """
    Produce a leave-one-out style prediction for every input molecule.

    method names an estimator class looked up on linear_model
    (e.g. 'Ridge' or 'Lasso'); it is built with the given alpha.
    The number of folds is set to the number of rows of xM, so each
    fold holds out a single sample. When graph is true the predictions
    are also shown through jutil.cv_show.
    """
    n_samples = xM.shape[0]

    print(xM.shape, yV.shape)

    model_cls = getattr(linear_model, method)
    regressor = model_cls(alpha=alpha)

    # One fold per sample; no shuffling is requested here.
    loo_folds = cross_validation.KFold(xM.shape[0], n_folds=n_samples)
    predictions = cross_validation.cross_val_predict(
        regressor, xM, yV, cv=loo_folds, n_jobs=n_jobs)

    if not graph:
        return predictions

    print('The prediction output using cross-validation is given by:')
    jutil.cv_show(yV, predictions, grid_std=grid_std)
    return predictions
项目:stacking    作者:ikki407    | 项目源码 | 文件源码
def create_cv_id(target, n_folds_ = 5, cv_id_name=cv_id_name, seed=407):
    """Assign a fold id to every row of ``target`` and save it with np.save.

    A shuffled StratifiedKFold over ``target['target']`` is tried first and
    its ``test_folds`` attribute is used as the fold ids. If that raises,
    the function falls back to a shuffled plain KFold over ``len(target)``
    rows and labels each row with the index of the fold in which it is a
    test sample.

    Parameters
    ----------
    target : indexable with the key 'target' and supporting ``len()``
    n_folds_ : number of folds
    cv_id_name : name appended to INPUT_PATH to form the output path
    seed : random_state passed to both splitters

    Returns
    -------
    None. The fold ids are written to ``INPUT_PATH + cv_id_name``.
    """
    try:
        a = StratifiedKFold(target['target'], n_folds=n_folds_, shuffle=True, random_state=seed)
        cv_index = a.test_folds
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('Done StratifiedKFold')
    except Exception:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt and
        # SystemExit are no longer swallowed as a bare ``except:`` would do.
        cv_index = np.empty(len(target), dtype=int)
        a = KFold(len(target), n_folds=n_folds_, shuffle=True, random_state=seed)
        for fold_id, (_, test_idx) in enumerate(a):
            cv_index[test_idx] = fold_id
        print('Done Kfold')

    np.save(INPUT_PATH + cv_id_name, cv_index)
    return

######### Utils #########

#feature list????????????util??
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_kfold_no_shuffle():
    # Manually check that KFold preserves the data ordering on toy datasets
    def check_two_folds(n_samples, expected):
        # expected lists (train, test) index pairs in the order KFold
        # must yield them.
        folds = iter(cval.KFold(n_samples, 2))
        for want_train, want_test in expected:
            train, test = next(folds)
            assert_array_equal(test, want_test)
            assert_array_equal(train, want_train)

    # Even split: 4 samples into 2 folds.
    check_two_folds(4, [
        ([2, 3], [0, 1]),
        ([0, 1], [2, 3]),
    ])

    # Uneven split: with 5 samples the first test fold gets the extra one.
    check_two_folds(5, [
        ([3, 4], [0, 1, 2]),
        ([0, 1, 2], [3, 4]),
    ])
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_predefinedsplit_with_kfold_split():
    # Check that PredefinedSplit can reproduce a split generated by Kfold.
    kf_splits = list(cval.KFold(10, 5, shuffle=True))
    kf_train = [train_ind for train_ind, _ in kf_splits]
    kf_test = [test_ind for _, test_ind in kf_splits]

    # Label every sample with the index of the fold that tests it.
    folds = -np.ones(10)
    for fold_id, test_ind in enumerate(kf_test):
        folds[test_ind] = fold_id

    ps_splits = list(cval.PredefinedSplit(folds))
    ps_train = [train_ind for train_ind, _ in ps_splits]
    ps_test = [test_ind for _, test_ind in ps_splits]

    assert_array_equal(ps_train, kf_train)
    assert_array_equal(ps_test, kf_test)
项目:facerecognition    作者:guoxiaolu    | 项目源码 | 文件源码
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10):
    """Compute fold-averaged TPR/FPR per threshold and per-fold accuracy.

    Pairs are compared by the sum of squared differences between the rows
    of embeddings1 and embeddings2. For every fold, the threshold with the
    highest accuracy on the training part is picked and its accuracy on
    the test part is recorded; TPR and FPR on the test part are recorded
    for every threshold.

    Returns (tpr, fpr, accuracy): tpr and fpr are means over folds (one
    value per threshold), accuracy holds one value per fold.
    """
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    n_pairs = min(len(actual_issame), embeddings1.shape[0])
    n_thresholds = len(thresholds)
    splitter = KFold(n_splits=nrof_folds, shuffle=False)

    tprs = np.zeros((nrof_folds, n_thresholds))
    fprs = np.zeros((nrof_folds, n_thresholds))
    accuracy = np.zeros((nrof_folds))

    squared_diff = np.square(np.subtract(embeddings1, embeddings2))
    dist = np.sum(squared_diff, 1)
    pair_ids = np.arange(n_pairs)

    for fold, (train_ids, test_ids) in enumerate(splitter.split(pair_ids)):
        # Training accuracy of every candidate threshold on this fold.
        train_results = (
            calculate_accuracy(thr, dist[train_ids], actual_issame[train_ids])
            for thr in thresholds
        )
        train_acc = np.array([acc for _, _, acc in train_results], dtype=float)
        best = np.argmax(train_acc)

        # Test-side TPR/FPR for every threshold.
        for col, thr in enumerate(thresholds):
            fold_tpr, fold_fpr, _ = calculate_accuracy(
                thr, dist[test_ids], actual_issame[test_ids])
            tprs[fold, col] = fold_tpr
            fprs[fold, col] = fold_fpr

        # Test-side accuracy at the threshold that won on the training part.
        _, _, fold_acc = calculate_accuracy(
            thresholds[best], dist[test_ids], actual_issame[test_ids])
        accuracy[fold] = fold_acc

    return np.mean(tprs, 0), np.mean(fprs, 0), accuracy
项目:facerecognition    作者:guoxiaolu    | 项目源码 | 文件源码
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10):
    """Compute mean/std of VAL and mean FAR across folds at a target FAR.

    Pairs are compared by the sum of squared differences between the rows
    of embeddings1 and embeddings2. For every fold, the FAR of each
    threshold is measured on the training part; if the largest of them
    reaches far_target, the threshold for far_target is obtained by
    'slinear' interpolation, otherwise 0.0 is used. That threshold is then
    evaluated on the test part.

    Returns (val_mean, val_std, far_mean).
    """
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    n_pairs = min(len(actual_issame), embeddings1.shape[0])
    splitter = KFold(n_splits=nrof_folds, shuffle=False)

    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)

    squared_diff = np.square(np.subtract(embeddings1, embeddings2))
    dist = np.sum(squared_diff, 1)
    pair_ids = np.arange(n_pairs)

    for fold, (train_ids, test_ids) in enumerate(splitter.split(pair_ids)):
        # FAR of every candidate threshold on the training part.
        train_results = (
            calculate_val_far(thr, dist[train_ids], actual_issame[train_ids])
            for thr in thresholds
        )
        far_train = np.array([fold_far for _, fold_far in train_results], dtype=float)

        # Find the threshold that gives FAR = far_target
        chosen = 0.0
        if np.max(far_train) >= far_target:
            to_threshold = interpolate.interp1d(far_train, thresholds, kind='slinear')
            chosen = to_threshold(far_target)

        fold_val, fold_far = calculate_val_far(
            chosen, dist[test_ids], actual_issame[test_ids])
        val[fold] = fold_val
        far[fold] = fold_far

    return np.mean(val), np.std(val), np.mean(far)
项目:Supply-demand-forecasting    作者:LevinJ    | 项目源码 | 文件源码
def get_kfold_bydate(self, df, n_folds = 10):
        """Sort df in place by date/time/district and return an unshuffled KFold.

        df is sorted by 'time_date', 'time_id' and 'start_district_id' and
        its index is reset, both in place. A KFold over the row count is
        then built without shuffling, every train/test split is printed,
        and the KFold object is returned.
        """
        sort_keys = ['time_date','time_id','start_district_id']
        df.sort_values(by = sort_keys, axis = 0, inplace = True)
        df.reset_index(drop=True, inplace = True)

        n_rows = df.shape[0]
        folds = KFold(n_rows, n_folds= n_folds, shuffle=False)
        for split in folds:
            train_index, test_index = split
            print("TRAIN:", train_index, "TEST:", test_index)
        return folds
项目:skutil    作者:tgsmith61591    | 项目源码 | 文件源码
def test_large_grid():
        """Run a small randomized search over a scaler/PCA/random-forest pipe.

        Per the original author's note, the forest is meant to overfit the
        data. The train and test accuracies are computed but not compared;
        the checks made here are that scoring fails before fitting, that the
        fitted search reports itself as a classifier, and that
        report_grid_score_detail rejects bad arguments and accepts good ones.
        """

        # The KFold constructor signature depends on the sklearn version.
        cv_kwargs = dict(shuffle=True, random_state=42)
        if SK18:
            custom_cv = KFold(n_splits=3, **cv_kwargs)
        else:
            custom_cv = KFold(n=y_train.shape[0], n_folds=3, **cv_kwargs)

        # define the pipe
        steps = [
            ('scaler', SelectiveScaler()),
            ('pca', SelectivePCA(weight=True)),
            ('rf', RandomForestClassifier(random_state=42)),
        ]
        pipe = Pipeline(steps)

        # define hyper parameters
        hp = {
            'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
            'pca__whiten': [True, False],
            'pca__weight': [True, False],
            'pca__n_components': uniform(0.75, 0.15),
            'rf__n_estimators': randint(5, 10),
            'rf__max_depth': randint(5, 15),
        }

        # define the grid
        grid = RandomizedSearchCV(
            pipe, hp,
            n_iter=2,
            scoring='accuracy',
            n_jobs=1,
            cv=custom_cv,
            random_state=42,
        )

        # this will fail because we haven't fit yet
        assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

        # fit the grid
        grid.fit(X_train, y_train)

        # score for coverage -- this might warn...
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            grid.score(X_train, y_train)

        # coverage:
        assert grid._estimator_type == 'classifier'

        # get predictions
        tr_pred = grid.predict(X_train)
        te_pred = grid.predict(X_test)

        # evaluate score (SHOULD be better than random...); results are discarded
        accuracy_score(y_train, tr_pred)
        accuracy_score(y_test, te_pred)

        # grid score reports: bad percentiles and a bad y_axis must be rejected
        bad_arguments = (
            {'percentile': 0.0},
            {'percentile': 1.0},
            {'y_axis': 'bad_axis'},
        )
        for bad in bad_arguments:
            assert_fails(report_grid_score_detail, ValueError, random_search=grid, **bad)

        # assert passes otherwise
        report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
项目:musm-adt17    作者:stefanoteso    | 项目源码 | 文件源码
def crossvalidate(problem, dataset, set_size, uid, w, var, cov,
                  transform, old_alpha, lmbda=0.5):
    """Finds the best hyperparameters using cross-validation.

    For every candidate in ``_ALPHAS`` the function runs a K-fold loop
    (``_NUM_FOLDS`` folds): a weight vector is obtained from
    ``problem.select_query`` on the training part, and the accuracy of the
    fold is the fraction of test-part utilities (dot product of the weights
    with the test rows) that are strictly positive. The candidate with the
    highest average accuracy wins.

    Parameters
    ----------
    problem : object
        Must provide ``select_query(data, set_size, alpha, transform=...)``
        returning a pair whose first element is used as the weight vector.
    dataset : array-like
        Indexed with the fold index arrays and transposed with ``.T``;
        presumably a 2-D numpy array with one example per row (TODO confirm).
    set_size
        Forwarded unchanged to ``problem.select_query``.
    uid, w, var, cov, transform
        Forwarded unchanged to ``compute_transform``.
    old_alpha
        Returned as is when ``len(dataset)`` is not a multiple of
        ``_NUM_FOLDS``.
    lmbda : float, optional
        Forwarded to ``compute_transform`` (default 0.5).

    Returns
    -------
    alpha : tuple
        The best hyperparameter, or ``old_alpha`` when cross-validation is
        skipped.
    """

    # Cross-validation only runs when the data splits into equal folds;
    # otherwise the previous choice is kept.
    if len(dataset) % _NUM_FOLDS != 0:
        return old_alpha

    kfold = KFold(len(dataset), n_folds=_NUM_FOLDS)
    f = compute_transform(uid, w, var, cov, transform, lmbda=lmbda)

    avg_accuracy = np.zeros(len(_ALPHAS))
    for i, alpha in enumerate(_ALPHAS):
        accuracies = []
        # The same kfold object is iterated again for every alpha.
        for tr_indices, ts_indices in kfold:
            # NOTE: this rebinds the parameter ``w``; ``f`` above was already
            # computed from the caller's value.
            w, _ = problem.select_query(dataset[tr_indices], set_size, alpha,
                                        transform=f)
            utilities = np.dot(w, dataset[ts_indices].T)
            # Fraction of held-out utilities that are strictly positive.
            accuracies.append((utilities > 0).mean())
        avg_accuracy[i] = sum(accuracies) / len(accuracies)

    # Presumably _I_TO_ALPHA maps a position in _ALPHAS to the value to
    # return -- verify against its definition.
    alpha = _I_TO_ALPHA[np.argmax(avg_accuracy)]

    # NOTE(review): every local name is handed to the logger via **locals();
    # renaming or adding locals changes what the logger receives.
    _LOG.debug('''\
            alpha accuracies = {avg_accuracy}
            best alpha = {alpha}
        ''', **locals())

    return alpha
项目:text-classification-with-convnets    作者:osmanbaskaya    | 项目源码 | 文件源码
def cross_validate(model, X, y, n_folds, batch_size, num_epoch, func_for_evaluation=None):

    # let's shuffle first.
    seed = 5
    np.random.seed(seed)
    np.random.shuffle(X)
    np.random.seed(seed)
    np.random.shuffle(y)

    X = np.array(X)
    y = np.array(y)

    scores = np.zeros(n_folds)
    kf = KFold(len(y), n_folds=n_folds)
    for i, (train_index, test_index) in enumerate(kf):
        X_train, y_train = X[train_index, :], y[train_index]
        X_test, y_test = X[test_index, :], y[test_index]
        model.fit(X_train, y_train,
                  batch_size=batch_size,
                  nb_epoch=num_epoch)

        predictions = model.predict(X_test)
        score = func_for_evaluation(predictions[:, 0].tolist(), y_test)
        try:
            scores[i] = score[0]
        except IndexError:
            scores[i] = score


    print "{}-Fold cross validation score: {}".format(n_folds, scores.mean())