Python sklearn.model_selection module: KFold() example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.model_selection.KFold().

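Before the project-specific examples, here is a minimal, self-contained sketch of the typical KFold workflow (this snippet is not taken from any of the projects below): construct the splitter, then iterate over the (train_index, test_index) pairs yielded by split().

from sklearn.model_selection import KFold
import numpy as np

X = np.arange(20).reshape(10, 2)  # toy feature matrix with 10 samples
y = np.arange(10)                 # toy targets

kf = KFold(n_splits=5, shuffle=True, random_state=0)
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    # each fold yields disjoint train/test index arrays over the samples
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("Fold {}: train size={}, test size={}".format(fold, train_index.size, test_index.size))
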
Project: VAE-MF-TensorFlow    Author: arongdari    | project source | file source
def cross_validation():
    M = read_dataset()
    n_fold = 10

    rating_idx = np.array(M.nonzero()).T
    kf = KFold(n_splits=n_fold, shuffle=True, random_state=0)

    with tf.Session() as sess:
        model = VAEMF(sess, num_user, num_item,
                      hidden_encoder_dim=hidden_encoder_dim, hidden_decoder_dim=hidden_decoder_dim,
                      latent_dim=latent_dim, output_dim=output_dim, learning_rate=learning_rate, batch_size=batch_size, reg_param=reg_param)

        for i, (train_idx, test_idx) in enumerate(kf.split(rating_idx)):
            print("{0}/{1} Fold start| Train size={2}, Test size={3}".format(i,
                                                                             n_fold, train_idx.size, test_idx.size))
            model.train(M, train_idx=train_idx,
                        test_idx=test_idx, n_steps=n_steps)
Project: VAE-MF-TensorFlow    Author: arongdari    | project source | file source
def cross_validation():
    M = read_dataset()
    n_fold = 10

    rating_idx = np.array(M.nonzero()).T
    kf = KFold(n_splits=n_fold, shuffle=True, random_state=0)

    with tf.Session() as sess:
        model = VAEMF(sess, num_user, num_item,
                      hidden_encoder_dim=hidden_encoder_dim, hidden_decoder_dim=hidden_decoder_dim,
                      latent_dim=latent_dim, output_dim=output_dim, learning_rate=learning_rate, batch_size=batch_size, reg_param=reg_param, one_hot=one_hot)

        for i, (train_idx, test_idx) in enumerate(kf.split(rating_idx)):
            print("{0}/{1} Fold start| Train size={2}, Test size={3}".format(i,
                                                                             n_fold, train_idx.size, test_idx.size))
            model.train(M, train_idx=train_idx,
                        test_idx=test_idx, n_steps=n_steps)
Project: geocoder-ie    Author: devgateway    | project source | file source
def kfold_train(self, n_splits=3):
        logger.info('train classifier using kFold')
        kf = KFold(n_splits=n_splits, shuffle=True)
        scores = []
        precisions = []
        recalls = []
        for train_index, test_index in kf.split(self.data):
            train_text = self.data.iloc[train_index]['text'].values
            train_y = self.data.iloc[train_index]['class'].values

            test_text = self.data.iloc[test_index]['text'].values
            test_y = self.data.iloc[test_index]['class'].values

            self.cls.train(train_text, train_y)
            predictions = self.cls.predict(test_text)
            self.confusion += confusion_matrix(test_y, predictions)
            scores.append(f1_score(test_y, predictions, pos_label='geography'))
            recalls.append(recall_score(test_y, predictions, pos_label='geography'))
            precisions.append(precision_score(test_y, predictions, pos_label='geography'))

        self.score = sum(scores) / len(scores)
        self.precision = sum(precisions) / len(precisions)
        self.recall = sum(recalls) / len(recalls)

        return self.cls
Project: stacker    Author: bamine    | project source | file source
def __init__(self, name, X, y, task, test_size=None, cv=None, random_state=42):
        self.name = name
        self.X = X
        self.y = y
        self.task = task
        self.random_state = random_state
        if test_size is not None:
            self.test_size = test_size
            self.validation_method = "train_test_split"
            self.X_train, self.X_test, self.y_train, self.y_test = \
                model_selection.train_test_split(self.X, self.y, test_size=test_size, random_state=random_state)
        elif cv is not None:
            self.validation_method = "cv"
            if task == "regression":
                self.kfold = model_selection.KFold(n_splits=cv, shuffle=True, random_state=random_state)
            elif task == "classification":
                self.kfold = model_selection.StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
Project: Graph-CNN    Author: fps7806    | project source | file source
def set_kfold(self, no_folds = 10, fold_id = 0):
        inst = KFold(n_splits = no_folds, shuffle=True, random_state=125)
        self.fold_id = fold_id

        self.KFolds = list(inst.split(np.arange(self.no_samples)))
        self.train_idx, self.test_idx = self.KFolds[fold_id]
        self.no_samples_train = self.train_idx.shape[0]
        self.no_samples_test = self.test_idx.shape[0]
        self.print_ext('Data ready. no_samples_train:', self.no_samples_train, 'no_samples_test:', self.no_samples_test)

        if self.train_batch_size == 0:
            self.train_batch_size = self.no_samples_train
        if self.test_batch_size == 0:
            self.test_batch_size = self.no_samples_test
        self.train_batch_size = min(self.train_batch_size, self.no_samples_train)
        self.test_batch_size = min(self.test_batch_size, self.no_samples_test)

    # The following function (not shown in this snippet) crops each sample before batching
    # and slices each sample to improve performance
Project: KAGGLE_CERVICAL_CANCER_2017    Author: ZFTurbo    | project source | file source
def run_cross_validation_create_models_unet2(nfolds=5):
    from sklearn.model_selection import KFold
    files_full = glob.glob(INPUT_PATH + "*/*.png")
    files = []
    for f in files_full:
        if '_mask' not in f:
            continue
        files.append(f)

    kf = KFold(n_splits=nfolds, shuffle=True, random_state=66)
    num_fold = 0
    sum_score = 0
    for train_index, test_index in kf.split(range(len(files))):
        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(train_index))
        print('Split valid: ', len(test_index))
        if num_fold != 2:
            continue
        score = train_single_model(num_fold, train_index, test_index, files)
        sum_score += score

    print('Avg loss: {}'.format(sum_score/nfolds))
Project: KAGGLE_CERVICAL_CANCER_2017    Author: ZFTurbo    | project source | file source
def run_cross_validation_create_models_unet1(nfolds=5):
    from sklearn.model_selection import KFold
    files_full = glob.glob(INPUT_PATH + "*/*.png")
    files = []
    for f in files_full:
        if '_mask' in f:
            continue
        files.append(f)

    kf = KFold(n_splits=nfolds, shuffle=True, random_state=66)
    num_fold = 0
    sum_score = 0
    for train_index, test_index in kf.split(range(len(files))):
        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(train_index))
        print('Split valid: ', len(test_index))
        score = train_single_model(num_fold, train_index, test_index, files)
        sum_score += score

    print('Avg loss: {}'.format(sum_score/nfolds))
Project: strategy    Author: kanghua309    | project source | file source
def model_cross_valid(X,Y):
    seed = 7
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    def build_model(model_name):
        model = model_name()
        return model
    scoring = 'neg_mean_squared_error'
    # + random forest, boost, lstm, gbdt

    for model_name in [LinearRegression,ElasticNet]:
    #for model_name in [LinearRegression,Ridge,Lasso,ElasticNet,KNeighborsRegressor,DecisionTreeRegressor,SVR,RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor]:
        model = build_model(model_name)
        results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        print(model_name,results.mean())
Project: CCIT    Author: rajatsen91    | project source | file source
def cross_validate(classifier, n_folds = 5):
    '''Custom cross-validation module I always use '''
    train_X = classifier['train_X']
    train_y = classifier['train_y']
    model = classifier['model']
    score = 0.0

    skf = KFold(n_splits = n_folds)
    for train_index, test_index in skf.split(train_X):
        X_train, X_test = train_X[train_index], train_X[test_index]
        y_train, y_test = train_y[train_index], train_y[test_index]
        clf = model.fit(X_train,y_train)
        pred = clf.predict_proba(X_test)[:,1]
        #print 'cross', roc_auc_score(y_test,pred)
        score = score + roc_auc_score(y_test,pred)

    return score/n_folds
Project: tensorflow_kaggle_house_price    Author: Cuongvn08    | project source | file source
def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=15)

        # train cloned base models then create out-of-fold predictions that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred

        # now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    # do the predictions of all base models on the test data and use the averaged predictions as
    # meta-features for the final prediction, which is done by the meta-model (a sketch follows below)
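The predict step described in the comment above is not part of this snippet. A hedged sketch of how such a method could look, assuming the self.base_models_ and self.meta_model_ attributes created in fit above (and np imported as in the original file), is:

    def predict(self, X):
        # sketch only, not from the original project: average each base model's
        # per-fold predictions to form meta-features, then let the meta-model decide
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_])
        return self.meta_model_.predict(meta_features)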
Project: simple-linear-regression    Author: williamd4112    | project source | file source
def train_cross_validation(args, sess, model, phi_xs_train, ys_train):
    kf = KFold(n_splits=args.K)

    w_best = None
    validation_loss = 0

    for train_index, validation_index in kf.split(phi_xs_train):
        sess.run(tf.global_variables_initializer())

        model.fit(sess, phi_xs_train[train_index], ys_train[train_index], epoch=args.epoch, batch_size=args.batch_size)
        loss = model.eval(sess, phi_xs_train[validation_index], ys_train[validation_index])

        logging.info('Validation loss = %f' % (loss))
        validation_loss += loss

        model.reset(sess)

    return validation_loss / float(args.K)
Project: GAKeras    Author: PetraVidnerova    | project source | file source
def evaluate(self, individual):
        #print(" *** evaluate *** ")

        #model = individual.createNetwork()
        #return random.random(), 

        random.seed(42) 
        # perform KFold crossvalidation 
        kf = KFold(n_splits=3)
        scores = []
        for train, test in kf.split(self.X):   # train, test are indicies 
            X_train, X_test = self.X[train], self.X[test]
            y_train, y_test = self.y[train], self.y[test]

            model = individual.createNetwork()
            model.fit(X_train, y_train,
                      batch_size=Config.batch_size, nb_epoch=Config.epochs, verbose=0)

            yy_test = model.predict(X_test)
            scores.append(error(y_test, yy_test))

        fitness = np.mean(scores)

        return fitness,
Project: heamy    Author: rushter    | project source | file source
def kfold(self, k=5, stratify=False, shuffle=True, seed=33):
        """K-Folds cross validation iterator.

        Parameters
        ----------
        k : int, default 5
        stratify : bool, default False
        shuffle : bool, default True
        seed : int, default 33

        Yields
        -------
        X_train, y_train, X_test, y_test, train_index, test_index
        """
        if stratify:
            kf = StratifiedKFold(n_splits=k, random_state=seed, shuffle=shuffle)
        else:
            kf = KFold(n_splits=k, random_state=seed, shuffle=shuffle)

        for train_index, test_index in kf.split(self.X_train, self.y_train):
            X_train, y_train = idx(self.X_train, train_index), self.y_train[train_index]
            X_test, y_test = idx(self.X_train, test_index), self.y_train[test_index]
            yield X_train, y_train, X_test, y_test, train_index, test_index
Project: convneuralnetwork    Author: clutariomark    | project source | file source
def do_kfold(proc_images, proc_labels, split=10):
    trainimages = []
    trainlabels = []
    testimages = []
    testlabels = []
    rand_idx = random.sample(range(0, len(proc_images)), len(proc_images))
    proc_images = proc_images[rand_idx]
    proc_labels = proc_labels[rand_idx]
    kf = KFold(n_splits=split)
    for train_index, test_index in kf.split(proc_images):
        x_train, x_test = proc_images[train_index], proc_images[test_index]
        y_train, y_test = proc_labels[train_index], proc_labels[test_index]
        trainimages.append(x_train)
        testimages.append(x_test)
        trainlabels.append(y_train)
        testlabels.append(y_test)

    np.save("trainimages.npy", trainimages)
    np.save("testimages.npy", testimages)
    np.save("trainlabels.npy", trainlabels)
    np.save("testlabels.npy", testlabels)
    return(trainimages, testimages, trainlabels, testlabels)
Project: convneuralnetwork    Author: clutariomark    | project source | file source
def do_kfold(proc_images, proc_labels, split=10):
    trainimages = []
    trainlabels = []
    testimages = []
    testlabels = []
    rand_idx = random.sample(range(0, len(proc_images)), len(proc_images))
    proc_images = proc_images[rand_idx]
    proc_labels = proc_labels[rand_idx]
    kf = KFold(n_splits=split)
    for train_index, test_index in kf.split(proc_images):
        x_train, x_test = proc_images[train_index], proc_images[test_index]
        y_train, y_test = proc_labels[train_index], proc_labels[test_index]
        trainimages.append(x_train)
        testimages.append(x_test)
        trainlabels.append(y_train)
        testlabels.append(y_test)

    np.save("trainimages.npy", trainimages)
    np.save("testimages.npy", testimages)
    np.save("trainlabels.npy", trainlabels)
    np.save("testlabels.npy", testlabels)
    return(trainimages, testimages, trainlabels, testlabels)
Project: convneuralnetwork    Author: clutariomark    | project source | file source
def do_kfold(proc_images, proc_labels, split=10):
    trainimages = []
    trainlabels = []
    testimages = []
    testlabels = []
    rand_idx = random.sample(range(0, len(proc_images)), len(proc_images))
    proc_images = proc_images[rand_idx]
    proc_labels = proc_labels[rand_idx]
    kf = KFold(n_splits=split)
    for train_index, test_index in kf.split(proc_images):
        x_train, x_test = proc_images[train_index], proc_images[test_index]
        y_train, y_test = proc_labels[train_index], proc_labels[test_index]
        trainimages.append(x_train)
        testimages.append(x_test)
        trainlabels.append(y_train)
        testlabels.append(y_test)

    np.save("trainimages.npy", trainimages)
    np.save("testimages.npy", testimages)
    np.save("trainlabels.npy", trainlabels)
    np.save("testlabels.npy", testlabels)
    return(trainimages, testimages, trainlabels, testlabels)
Project: convneuralnetwork    Author: clutariomark    | project source | file source
def do_kfold(proc_images, proc_labels, split=10):
    trainimages = []
    trainlabels = []
    testimages = []
    testlabels = []
    rand_idx = random.sample(range(0, len(proc_images)), len(proc_images))
    proc_images = proc_images[rand_idx]
    proc_labels = proc_labels[rand_idx]
    kf = KFold(n_splits=split)
    for train_index, test_index in kf.split(proc_images):
        x_train, x_test = proc_images[train_index], proc_images[test_index]
        y_train, y_test = proc_labels[train_index], proc_labels[test_index]
        trainimages.append(x_train)
        testimages.append(x_test)
        trainlabels.append(y_train)
        testlabels.append(y_test)

    np.save("trainimages.npy", trainimages)
    np.save("testimages.npy", testimages)
    np.save("trainlabels.npy", trainlabels)
    np.save("testlabels.npy", testlabels)
    return(trainimages, testimages, trainlabels, testlabels)
Project: twitter-text-classification    Author: FurkanArslan    | project source | file source
def testClassificationQuality(self):
        score = 0
        kfold = KFold(n_splits=10, shuffle=True, random_state=0)

        tweetClassification = TweetClassification()

        for ind_train, ind_test in kfold.split(self.tweets):
            dataTest = self.tweets[ind_test]
            dataTrain = self.tweets[ind_train]
            targetTest = self.target[ind_test]
            targetTrain = self.target[ind_train]

            tweetClassification.fit(dataTrain, targetTrain)

            score += tweetClassification.score(dataTest, targetTest)

        return score / 10
Project: twitter-text-classification    Author: FurkanArslan    | project source | file source
def evaluate_cross_validation(self, clf, data, target, cluster):
        score = 0
        kfold = KFold(n_splits=cluster, shuffle=True, random_state=0)

        for ind_train, ind_test in kfold.split(data):
            dataTest  = data[ind_test]
            dataTrain = data[ind_train]
            targetTest = target[ind_test]
            targetTrain = target[ind_train]

            clf.fit(dataTrain, targetTrain)

            score += clf.score(dataTest, targetTest)

        print ('-'*30)
        print ("Mean score: %0.3f" % (score/cluster))
        print ('-'*30)

        return score/cluster
Project: muffnn    Author: civisanalytics    | project source | file source
def test_cross_val_predict():
    # Make sure it works in cross_val_predict for multiclass.

    X, y = load_iris(return_X_y=True)
    y = LabelBinarizer().fit_transform(y)
    X = StandardScaler().fit_transform(X)

    mlp = MLPClassifier(n_epochs=10,
                        solver_kwargs={'learning_rate': 0.05},
                        random_state=4567).fit(X, y)

    cv = KFold(n_splits=4, random_state=457, shuffle=True)
    y_oos = cross_val_predict(mlp, X, y, cv=cv, method='predict_proba')
    auc = roc_auc_score(y, y_oos, average=None)

    assert np.all(auc >= 0.96)
Project: jamespy_py3    Author: jskDr    | project source | file source
def gs_numpy( method, X, Y, alphas_log = (-1, 1, 9), n_splits=5, n_jobs = -1, disp = True):
    """
    Grid search with numpy arrays X and Y.
    Previously, np.mat was used for compatibility with MATLAB notation.
    """
    if disp:
        print( X.shape, Y.shape)

    clf = getattr( linear_model, method)()
    parmas = {'alpha': np.logspace( *alphas_log)}
    kf5_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
    #kf5 = kf5_c.split( X)
    gs = model_selection.GridSearchCV( clf, parmas, scoring = 'r2', cv = kf5_c, n_jobs = n_jobs)

    gs.fit( X, Y)

    return gs
Project: jamespy_py3    Author: jskDr    | project source | file source
def cv_SVR( xM, yV, svr_params, n_splits = 5, n_jobs = -1, grid_std = None, graph = True, shuffle = True):
    """
    Cross-validation for SVR: prediction output is generated for all input molecules.
    """
    print(xM.shape, yV.shape)

    clf = svm.SVR( **svr_params)
    kf_n_c = model_selection.KFold( n_splits=n_splits, shuffle=shuffle)
    kf_n = kf_n_c.split( xM)
    yV_pred = model_selection.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show( yV, yV_pred, grid_std = grid_std)

    return yV_pred
Project: jamespy_py3    Author: jskDr    | project source | file source
def gs_classifier( classifier, xM, yVc, params, n_splits=5, n_jobs=-1):
    """
    gs = gs_classifier( classifier, xM, yVc, params, n_splits=5, n_jobs=-1)

    Inputs
    ======
    classifier = svm.SVC(), for example

    param = {"C": np.logspace(-2,2,5)}
    """
    #print(xM.shape, yVc.shape)
    kf5_c = model_selection.KFold( n_splits=n_splits, shuffle=True)
    gs = model_selection.GridSearchCV( classifier, params, cv=kf5_c, n_jobs=n_jobs)
    gs.fit( xM, yVc)

    return gs
Project: jamespy_py3    Author: jskDr    | project source | file source
def gs_Ridge_BIKE( A_list, yV, XX = None, alphas_log = (1, -1, 9), n_splits = 5, n_jobs = -1):
    """
    As is a list of A matrices where A is similarity matrix. 
    X is a concatened linear descriptors. 
    If no X is used, X can be empty
    """

    clf = binary_model.BIKE_Ridge( A_list, XX)
    parmas = {'alpha': np.logspace( *alphas_log)}
    ln = A_list[0].shape[0] # ln is the number of molecules.

    kf_n_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
    #kf_n = kf5_ext_c.split( A_list[0])
    gs = model_selection.GridSearchCV( clf, parmas, scoring = 'r2', cv = kf_n_c, n_jobs = n_jobs)

    AX_idx = np.array([list(range( ln))]).T
    gs.fit( AX_idx, yV)

    return gs
Project: jamespy_py3    Author: jskDr    | project source | file source
def gs_BIKE_Ridge( A_list, yV, alphas_log = (1, -1, 9), X_concat = None, n_splits = 5, n_jobs = -1):
    """
    As is a list of A matrices where A is similarity matrix. 
    X is a concatened linear descriptors. 
    If no X is used, X can be empty
    """

    clf = binary_model.BIKE_Ridge( A_list, X_concat)
    parmas = {'alpha': np.logspace( *alphas_log)}
    ln = A_list[0].shape[0] # ln is the number of molecules.

    kf_n_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
    #kf_n = kf5_ext_c.split( A_list[0])
    gs = model_selection.GridSearchCV( clf, parmas, scoring = 'r2', cv = kf_n_c, n_jobs = n_jobs)

    AX_idx = np.array([list(range( ln))]).T
    gs.fit( AX_idx, yV)

    return gs
Project: jamespy_py3    Author: jskDr    | project source | file source
def _cv_r0( method, xM, yV, alpha, n_splits = 5, n_jobs = -1, grid_std = None, graph = True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """ 
    print(xM.shape, yV.shape)

    clf = getattr( linear_model, method)( alpha = alpha)
    kf_n_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
    kf_n = kf_n_c.split( xM)
    yV_pred = model_selection.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show( yV, yV_pred, grid_std = grid_std)

    return yV_pred
Project: jamespy_py3    Author: jskDr    | project source | file source
def cvLOO( method, xM, yV, alpha, n_jobs = -1, grid_std = None, graph = True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """ 
    n_splits = xM.shape[0]

    # print(xM.shape, yV.shape)

    clf = getattr( linear_model, method)( alpha = alpha)
    kf_n = model_selection.KFold( n_splits=n_splits)  # n_splits equals the sample count, i.e. leave-one-out
    yV_pred = model_selection.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show( yV, yV_pred, grid_std = grid_std)

    return yV_pred
Project: jamespy_py3    Author: jskDr    | project source | file source
def gs_numpy( method, X, Y, alphas_log = (-1, 1, 9), n_splits=5, n_jobs = -1, disp = True):
    """
    Grid search with numpy arrays X and Y.
    Previously, np.mat was used for compatibility with MATLAB notation.
    """
    if disp:
        print( X.shape, Y.shape)

    clf = getattr( linear_model, method)()
    parmas = {'alpha': np.logspace( *alphas_log)}
    kf5_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
    #kf5 = kf5_c.split( X)
    gs = model_selection.GridSearchCV( clf, parmas, scoring = 'r2', cv = kf5_c, n_jobs = n_jobs)

    gs.fit( X, Y)

    return gs
Project: jamespy_py3    Author: jskDr    | project source | file source
def cv_SVR( xM, yV, svr_params, n_splits = 5, n_jobs = -1, grid_std = None, graph = True, shuffle = True):
    """
    Cross-validation for SVR: prediction output is generated for all input molecules.
    """
    print(xM.shape, yV.shape)

    clf = svm.SVR( **svr_params)
    kf_n_c = model_selection.KFold( n_splits=n_splits, shuffle=shuffle)
    kf_n = kf_n_c.split( xM)
    yV_pred = model_selection.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show( yV, yV_pred, grid_std = grid_std)

    return yV_pred
Project: jamespy_py3    Author: jskDr    | project source | file source
def gs_param( model, X, y, param_grid, n_splits=5, shuffle=True, n_jobs=-1, graph=False):
    """
    gs = gs_param( model, X, y, param_grid, n_splits=5, shuffle=True, n_jobs=-1)

    Inputs
    ======
    model = svm.SVC(), or linear_model.LinearRegression(), for example
    param = {"C": np.logspace(-2,2,5)}
    """
    #print(xM.shape, yVc.shape)
    kf5_c = model_selection.KFold( n_splits=n_splits, shuffle=shuffle)
    gs = model_selection.GridSearchCV( model, param_grid, cv=kf5_c, n_jobs=n_jobs)
    gs.fit( X, y)

    if graph:
        plt.plot( gs.cv_results_["mean_train_score"], label='E[Train]')
        plt.plot( gs.cv_results_["mean_test_score"], label='E[Test]')
        plt.legend(loc=0)
        plt.grid()

    return gs
Project: jamespy_py3    Author: jskDr    | project source | file source
def gs_Ridge_BIKE( A_list, yV, XX = None, alphas_log = (1, -1, 9), n_splits = 5, n_jobs = -1):
    """
    As is a list of A matrices where A is similarity matrix. 
    X is a concatened linear descriptors. 
    If no X is used, X can be empty
    """

    clf = binary_model.BIKE_Ridge( A_list, XX)
    parmas = {'alpha': np.logspace( *alphas_log)}
    ln = A_list[0].shape[0] # ln is the number of molecules.

    kf_n_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
    #kf_n = kf5_ext_c.split( A_list[0])
    gs = model_selection.GridSearchCV( clf, parmas, scoring = 'r2', cv = kf_n_c, n_jobs = n_jobs)

    AX_idx = np.array([list(range( ln))]).T
    gs.fit( AX_idx, yV)

    return gs
Project: jamespy_py3    Author: jskDr    | project source | file source
def gs_BIKE_Ridge( A_list, yV, alphas_log = (1, -1, 9), X_concat = None, n_splits = 5, n_jobs = -1):
    """
    As is a list of A matrices where A is similarity matrix. 
    X is a concatened linear descriptors. 
    If no X is used, X can be empty
    """

    clf = binary_model.BIKE_Ridge( A_list, X_concat)
    parmas = {'alpha': np.logspace( *alphas_log)}
    ln = A_list[0].shape[0] # ln is the number of molecules.

    kf_n_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
    #kf_n = kf5_ext_c.split( A_list[0])
    gs = model_selection.GridSearchCV( clf, parmas, scoring = 'r2', cv = kf_n_c, n_jobs = n_jobs)

    AX_idx = np.array([list(range( ln))]).T
    gs.fit( AX_idx, yV)

    return gs
Project: jamespy_py3    Author: jskDr    | project source | file source
def _cv_r0( method, xM, yV, alpha, n_splits = 5, n_jobs = -1, grid_std = None, graph = True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """ 
    print(xM.shape, yV.shape)

    clf = getattr( linear_model, method)( alpha = alpha)
    kf_n_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
    kf_n = kf_n_c.split( xM)
    yV_pred = model_selection.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show( yV, yV_pred, grid_std = grid_std)

    return yV_pred
Project: jamespy_py3    Author: jskDr    | project source | file source
def cvLOO( method, xM, yV, alpha, n_jobs = -1, grid_std = None, graph = True):
    """
    method can be 'Ridge', 'Lasso'
    cross validation is performed so as to generate prediction output for all input molecules
    """ 
    n_splits = xM.shape[0]

    # print(xM.shape, yV.shape)

    clf = getattr( linear_model, method)( alpha = alpha)
    kf_n = model_selection.KFold( n_splits=n_splits)  # n_splits equals the sample count, i.e. leave-one-out
    yV_pred = model_selection.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show( yV, yV_pred, grid_std = grid_std)

    return yV_pred
Project: jamespy_py3    Author: jskDr    | project source | file source
def gs_Lasso(xM, yV, alphas_log=(-1, 1, 9), n_folds=5, n_jobs=-1):

    print(xM.shape, yV.shape)

    clf = linear_model.Lasso()
    #parmas = {'alpha': np.logspace(1, -1, 9)}
    parmas = {'alpha': np.logspace(*alphas_log)}
    kf5_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf5 = kf5_c.split(xM)

    gs = model_selection.GridSearchCV(
        clf, parmas, scoring='r2', cv=kf5, n_jobs=n_jobs)

    gs.fit(xM, yV)

    return gs
Project: bnn-analysis    Author: myshkov    | project source | file source
def create_training_test_sets(self):
        """ Split data set into training and test folds. """
        # load input data
        input_data = np.asarray(np.loadtxt('input/data.txt'), dtype=np.float32)
        self.input_dim = input_data.shape[1] - 1
        self.output_dim = 1

        # align to batch size
        batches = input_data.shape[0] // (self.batch_size * self.n_splits)
        input_data = input_data[:batches * (self.batch_size * self.n_splits)]

        self.data_size = input_data.shape[0]
        print(f'Loaded input data, shape = {input_data.shape}')

        # create splits
        kfold = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
        print(f'Splits: {self.n_splits}')

        # assume y is in the last column by default
        for idx_train, idx_test in kfold.split(input_data):
            self.train_x.append(input_data[idx_train, :-1])
            self.train_y.append(input_data[idx_train, -1:])
            self.test_x.append(input_data[idx_test, :-1])
            self.test_y.append(input_data[idx_test, -1:])

        # layers described as [number of neurons, dropout probability]
        if self.layers_description is None:
            self.layers_description = [[self.input_dim, 0.0], [100, 0.0], [100, 0.0], [self.output_dim, 0.0]]
Project: hidi    Author: VEVO    | project source | file source
def transform(self, M,  **kwargs):
        """
        Takes a dataframe that has an :code:`item_id` index and other
        'features' columns for prediction, and applies a Keras sequential
        model to it.

        :param M:
            a dataframe that has an :code:`item_id` index, and
            "features" columns.

        :type M: pandas.DataFrame
        :rtype: a tuple with trained Keras model and its keyword
            arguments
        """
        rows, columns = M.shape
        factors = M.merge(self.validation_matrix, left_index=True,
                          right_index=True)
        factors = factors.values

        if self.classification:
            kfold = StratifiedKFold(n_splits=self.kfold_n_splits,
                                    random_state=self.kfold_seed,
                                    shuffle=self.kfold_shuffle)
        else:
            kfold = KFold(n_splits=self.kfold_n_splits,
                          random_state=self.kfold_seed,
                          shuffle=self.kfold_shuffle)

        X = factors[:, :columns]
        Y = factors[:, columns:]
        for train_index, test_index in kfold.split(X, Y):
            self.keras_model.fit(
                X[train_index], Y[train_index],
                validation_data=[X[test_index], Y[test_index]],
                **self.keras_kwargs)

        return self.keras_model, kwargs
Project: deeppavlov    Author: deepmipt    | project source | file source
def setup_data(self, path):
        """Read and iteratively yield data to agent"""
        print('loading: ' + path)

        questions = []
        y = []

        # open data file with labels
        # (path will be provided to setup_data from opt['datafile'] defined above)
        with open(path) as labels_file:
            context = csv.reader(labels_file)
            next(context)

            for item in context:
                label, text = item
                questions.append(text)
                y.append([self.answer_candidates[int(label)]])

        episode_done = True

        indexes = range(len(questions))
        if self.datatype_strict != 'test':
            random_state = random.getstate()
            random.setstate(self.random_state)
            kf_seed = random.randrange(500000)
            kf = KFold(self.opt.get('bagging_folds_number'), shuffle=True,
                       random_state=kf_seed)
            i = 0
            for train_index, test_index in kf.split(questions):
                indexes = train_index if self.datatype_strict == 'train' else test_index
                if i >= self.opt.get('bagging_fold_index', 0):
                    break
                i += 1
            self.random_state = random.getstate()
            random.setstate(random_state)

        # define iterator over all queries
        for i in indexes:
            # get current label, both as a digit and as a text
            # yield tuple with information and episode_done? flag
            yield (questions[i], y[i]), episode_done
Project: palladio    Author: slipguru    | project source | file source
def kf_worker(X_tr, Y_tr, mu_range, tr_idx, vld_idx, i, results):
    """Worker for parallel KFold implementation."""
    betas = RLS_path(X_tr, Y_tr, mu_range)
    results[i] = {'betas': betas, 'tr_idx': tr_idx, 'vld_idx': vld_idx}
Project: skutil    Author: tgsmith61591    | project source | file source
def test_large_grid():
        """In this test, we purposely overfit a RandomForest to completely random data
        in order to assert that the test error will far supercede the train error.
        """

        if not SK18:
            custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
        else:
            custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

        # define the pipe
        pipe = Pipeline([
            ('scaler', SelectiveScaler()),
            ('pca', SelectivePCA(weight=True)),
            ('rf', RandomForestClassifier(random_state=42))
        ])

        # define hyper parameters
        hp = {
            'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
            'pca__whiten': [True, False],
            'pca__weight': [True, False],
            'pca__n_components': uniform(0.75, 0.15),
            'rf__n_estimators': randint(5, 10),
            'rf__max_depth': randint(5, 15)
        }

        # define the grid
        grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1, cv=custom_cv, random_state=42)

        # this will fail because we haven't fit yet
        assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

        # fit the grid
        grid.fit(X_train, y_train)

        # score for coverage -- this might warn...
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            grid.score(X_train, y_train)

        # coverage:
        assert grid._estimator_type == 'classifier'

        # get predictions
        tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

        # evaluate score (SHOULD be better than random...)
        accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

        # grid score reports:
        # assert fails for bad percentile
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 0.0})
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 1.0})

        # assert fails for bad y_axis
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'y_axis': 'bad_axis'})

        # assert passes otherwise
        report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
Project: dmon-adp    Author: igabriel85    | project source | file source
def adaBoost(self, settings, data=None, dropna=True):
        df = self.__loadData(data, dropna)
        features = df.columns[:-1]
        X = df[features]
        y = df.iloc[:, -1].values
        seed = 7
        num_trees = 500
        kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
        print(kfold)
        model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
        results = model_selection.cross_val_score(model, X, y, cv=kfold)
        model.fit(X, y)
        print(results.mean())
        print(model.score(X, y))
        return True
Project: marseille    Author: vene    | project source | file source
def saga_decision_function(dataset, k, link_alpha, prop_alpha, l1_ratio):

    fn = cache_fname("linear_val_df", (dataset, k, link_alpha, prop_alpha,
                                       l1_ratio))

    if os.path.exists(fn):
        logging.info("Loading {}".format(fn))
        with open(fn, "rb") as f:
            return dill.load(f)

    ds = 'erule' if dataset == 'cdcp' else 'ukp-essays'  # sorry
    path = os.path.join("data", "process", ds, "folds", "{}", "{}")

    # sorry again: get val docs
    n_folds = 5 if dataset == 'ukp' else 3
    load, ids = get_dataset_loader(dataset, "train")
    for k_, (_, val) in enumerate(KFold(n_folds).split(ids)):
        if k_ == k:
            break
    val_docs = list(load(ids[val]))

    X_tr_link, y_tr_link = load_csr(path.format(k, 'train.npz'),
                                    return_y=True)
    X_te_link, y_te_link = load_csr(path.format(k, 'val.npz'),
                                    return_y=True)

    X_tr_prop, y_tr_prop = load_csr(path.format(k, 'prop-train.npz'),
                                    return_y=True)
    X_te_prop, y_te_prop = load_csr(path.format(k, 'prop-val.npz'),
                                    return_y=True)

    baseline = BaselineStruct(link_alpha, prop_alpha, l1_ratio)
    baseline.fit(X_tr_link, y_tr_link, X_tr_prop, y_tr_prop)

    Y_marg = baseline.decision_function(X_te_link, X_te_prop, val_docs)

    with open(fn, "wb") as f:
        logging.info("Saving {}".format(fn))
        dill.dump((Y_marg, baseline), f)

    return Y_marg, baseline
Project: marseille    Author: vene    | project source | file source
def linear_cv_score(dataset, alpha, l1_ratio, constraints):

    fn = cache_fname("linear_cv_score", (dataset, alpha, l1_ratio,
                                         constraints))
    if os.path.exists(fn):
        logging.info("Loading {}".format(fn))
        with open(fn, "rb") as f:
            return dill.load(f)

    load, ids = get_dataset_loader(dataset, split="train")
    n_folds = 5 if dataset == 'ukp' else 3

    scores = []
    for k, (tr, val) in enumerate(KFold(n_folds).split(ids)):
        Y_marg, bl = saga_decision_function(dataset, k, alpha, alpha, l1_ratio)

        val_docs = list(load(ids[val]))
        Y_true = [doc.label for doc in val_docs]
        Y_pred = bl.fast_decode(Y_marg, val_docs, constraints)

        scores.append(bl._score(Y_true, Y_pred))

    with open(fn, "wb") as f:
        logging.info("Saving {}".format(fn))
        dill.dump(scores, f)
    return scores
Project: marseille    Author: vene    | project source | file source
def svmstruct_cv_score(dataset, C, class_weight, constraints,
                       compat_features, second_order_features):

    fn = cache_fname("svmstruct_cv_score", (dataset, C, class_weight,
                                            constraints, compat_features,
                                            second_order_features))

    if os.path.exists(fn):
        logging.info("Cached file already exists.")
        with open(fn, "rb") as f:
            return dill.load(f)

    load, ids = get_dataset_loader(dataset, split="train")

    n_folds = 5 if dataset == 'ukp' else 3

    # below are boolean logical ops
    grandparents = second_order_features and dataset == 'ukp'
    coparents = second_order_features
    siblings = second_order_features and dataset == 'cdcp'

    scores = []
    all_Y_pred = []

    for k, (tr, val) in enumerate(KFold(n_folds).split(ids)):
        train_docs = list(load(ids[tr]))
        val_docs = list(load(ids[val]))

        clf, Y_val, Y_pred = fit_predict(train_docs, val_docs, dataset, C,
                                         class_weight,
                                         constraints, compat_features,
                                         second_order_features, grandparents,
                                         coparents, siblings)
        all_Y_pred.extend(Y_pred)
        scores.append(clf.model._score(Y_val, Y_pred))

    with open(fn, "wb") as f:
        dill.dump((scores, all_Y_pred), f)

    return scores, all_Y_pred
Project: MLClass    Author: bm2-lab    | project source | file source
def split_kfold_r(y):
    skf = KFold(5)
    ilst = []
    for tri, tei in skf.split(y):
        ilst.append((tri, tei))
    return ilst
Project: Y8M    Author: mpekalski    | project source | file source
def split_fold(in_pattern, rettrain=True, fold=0, cvs=5, include_validation=True, split_seed=0):
    """
    Splits the elements of the in_pattern into training and test sets
    :param in_pattern: string of tfrecord patterns
    :param rettrain: return training set (True) or leave out set (False)
    :param fold: which fold to process
    :param cvs: how many folds you want
    :param include_validation: include validation set
    :return: subset of tfrecords
    """
    assert fold < cvs

    files = gfile.Glob(in_pattern)
    if split_seed > 0:
        kf = KFold(n_splits=cvs, shuffle=True, random_state=split_seed)
    else:
        kf = KFold(n_splits=cvs)

    for i, (train, test) in enumerate(kf.split(files)):
        if i == fold:
            break

    if rettrain:
        retfiles = list(np.array(files)[train])
    else:
        retfiles = list(np.array(files)[test])

    if include_validation:
        addition = [fname.replace('train', 'validate') for fname in retfiles]
        retfiles += addition

    return retfiles
Project: faceNet_RealTime    Author: jack55436001    | project source | file source
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    tprs = np.zeros((nrof_folds,nrof_thresholds))
    fprs = np.zeros((nrof_folds,nrof_thresholds))
    accuracy = np.zeros((nrof_folds))

    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff),1)
    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):

        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _ = calculate_accuracy(threshold, dist[test_set], actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])

    tpr = np.mean(tprs,0)
    fpr = np.mean(fprs,0)
    return tpr, fpr, accuracy
Project: faceNet_RealTime    Author: jack55436001    | project source | file source
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)

    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff),1)
    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):

        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train)>=far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0

        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])

    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean
Project: Guess-Genre-By-Lyrics    Author: ormatt    | project source | file source
def test_using_kfold(X, y, clf, splits=5):
    kf = KFold(n_splits=splits, shuffle=True)

    scores = []
    for k, (train, test) in enumerate(kf.split(X, y)):
        logger.info("Fitting and transforming the model on one fold")
        clf.fit(X[train], y[train])
        score = clf.score(X[test], y[test])
        logger.info("[Fold {0}] score: {1:.5f}".format(k+1, score))
        scores.append(score)

    utils.persistence.dump(CLF_KFOLD_DUMP_NAME, clf)
    scores_mean = np.mean(scores)
    logger.info("Score: {}".format(scores_mean))
    return clf
Project: tf_face    Author: ZhijianChan    | project source | file source
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, num_folds=10):
    """Calculate TPR and FPR under different threshold, accuracy under the best threshold"""
    assert (embeddings1.shape[0] == embeddings2.shape[0])
    assert (embeddings1.shape[1] == embeddings2.shape[1])
    num_pairs = min(len(actual_issame), embeddings1.shape[0])
    num_threshold = len(thresholds)
    k_fold = KFold(n_splits=num_folds, shuffle=False)

    tprs = np.zeros((num_folds, num_threshold))
    fprs = np.zeros((num_folds, num_threshold))
    acc = np.zeros((num_folds))

    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff), 1)
    indices = np.arange(num_pairs)

    for fold_id, (train_set, test_set) in enumerate(k_fold.split(indices)):
        # Find the best threshold
        acc_train = np.zeros((num_threshold))
        for thres_id, thres in enumerate(thresholds):
            _, _, acc_train[thres_id] = calculate_acc(thres, dist[train_set], actual_issame[train_set])
        best_id = np.argmax(acc_train)
        # Calculate tprs and fprs on test set
        for thres_id, thres in enumerate(thresholds):
            tprs[fold_id, thres_id], fprs[fold_id, thres_id], _ = calculate_acc(thres, dist[test_set],
                                                                                actual_issame[test_set])
        # Use the best threshold to calculate accuracy
        _, _, acc[fold_id] = calculate_acc(thresholds[best_id], dist[test_set], actual_issame[test_set])

    tpr = np.mean(tprs, 0)  # true  positive rate under different threshold
    fpr = np.mean(fprs, 0)  # false positive rate under different threshold
    return tpr, fpr, acc