Python sklearn.model_selection 模块,StratifiedKFold() 实例源码

我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用sklearn.model_selection.StratifiedKFold()

项目:introspective    作者:numeristical    | 项目源码 | 文件源码
def train_and_calibrate_cv(model, X_tr, y_tr, cv=5):
    y_pred_xval = np.zeros(len(y_tr))
    skf = cross_validation.StratifiedKFold(y_tr, n_folds=cv,shuffle=True)
    i = 0;
    for train, test in skf:
        i = i+1
        print("training fold {} of {}".format(i, cv))
        X_train_xval = np.array(X_tr)[train,:]
        X_test_xval = np.array(X_tr)[test,:]
        y_train_xval = np.array(y_tr)[train]
        # We could also copy the model first and then fit it
        model_copy = clone(model)
        model_copy.fit(X_train_xval,y_train_xval)
        y_pred_xval[test]=model.predict_proba(X_test_xval)[:,1]
    print("training full model")
    model_copy = clone(model)
    model_copy.fit(X_tr,y_tr)
    print("calibrating function")
    calib_func = prob_calibration_function(y_tr, y_pred_xval)
    return model_copy, calib_func
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def rf1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 300
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    for n, (itrain, ival) in enumerate(skf.split(train2, y)):
        print('step %d of %d'%(n+1, skf.n_splits), now())
        clf = ensemble.RandomForestRegressor(n_estimators=1000,
                                             max_depth=3,
                                             random_state=13)
        clf.fit(train2[itrain], y[itrain])

        p = clf.predict(train2[ival])
        v.loc[ival, cname] += p
        score = metrics.log_loss(y[ival], p)
        z[cname]  += np.log1p(clf.predict(test2))
        print(cname, 'step %d: score'%(n+1), score, now())
        scores.append(score)

    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits
项目:kaggle-review    作者:daxiongshu    | 项目源码 | 文件源码
def get_split(self):
        if self.split is not None:
            return
        name = "{}/split.p".format(self.flags.data_path)
        split = load_pickle(None,name,[])

        if len(split) == 0:
            #data = self.data["training_variants"].append(self.data["test_variants_filter"])
            data = self.data["training_variants"]
            y = data['Class']-1
            X = np.arange(y.shape[0])
            from sklearn.model_selection import StratifiedKFold
            skf = StratifiedKFold(n_splits=self.flags.folds,shuffle=True,random_state=99)
            split = [(train_index, test_index) for train_index, test_index in skf.split(X, y)]
            save_pickle(split,name)
            print("new shuffle")
        self.split = split
        #print("split va",split[0][1][:10])
项目:deep-mil-for-whole-mammogram-classification    作者:wentaozhu    | 项目源码 | 文件源码
def cvsplit(fold, totalfold, mydict):
  '''get the split of train and test
  fold is the returned fold th data, from 0 to totalfold-1
  total fold is for the cross validation
  mydict is the return dict from readlabel'''
  skf = StratifiedKFold(n_splits=totalfold)  # default shuffle is false, okay!
  #readdicom(mydict)
  y = mydict.values()
  x = mydict.keys()
  count = 0
  for train, test in skf.split(x,y):
    print(len(train), len(test))
    if count == fold:
      #print test
      return train, test
    count += 1
项目:stacker    作者:bamine    | 项目源码 | 文件源码
def __init__(self, name, X, y, task, test_size=None, cv=None, random_state=42):
        self.name = name
        self.X = X
        self.y = y
        self.task = task
        self.random_state = random_state
        if test_size is not None:
            self.test_size = test_size
            self.validation_method = "train_test_split"
            self.X_train, self.X_test, self.y_train, self.y_test = \
                model_selection.train_test_split(self.X, self.y, test_size=test_size, random_state=random_state)
        elif cv is not None:
            self.validation_method = "cv"
            if task == "regression":
                self.kfold = model_selection.KFold(n_splits=cv, random_state=random_state)
            elif task == "classification":
                self.kfold = model_selection.StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
项目:brainiak    作者:brainiak    | 项目源码 | 文件源码
def _sfn(l, mask, myrad, bcast_var):
    """Score classifier on searchlight data using cross-validation.

    The classifier is in `bcast_var[2]`. The labels are in `bast_var[0]`. The
    number of cross-validation folds is in `bast_var[1].
    """
    clf = bcast_var[2]
    data = l[0][mask, :].T
    # print(l[0].shape, mask.shape, data.shape)
    skf = model_selection.StratifiedKFold(n_splits=bcast_var[1],
                                          shuffle=False)
    accuracy = np.mean(model_selection.cross_val_score(clf, data,
                                                       y=bcast_var[0],
                                                       cv=skf,
                                                       n_jobs=1))
    return accuracy
项目:brainiak    作者:brainiak    | 项目源码 | 文件源码
def example_of_cross_validation_using_model_selection(raw_data, labels, num_subjects, num_epochs_per_subj):
    # NOTE: this method does not work for sklearn.svm.SVC with precomputed kernel
    # when the kernel matrix is computed in portions; also, this method only works
    # for self-correlation, i.e. correlation between the same data matrix.

    # no shrinking, set C=1
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    #logit_clf = LogisticRegression()
    clf = Classifier(svm_clf, epochs_per_subj=num_epochs_per_subj)
    # doing leave-one-subject-out cross validation
    # no shuffling in cv
    skf = model_selection.StratifiedKFold(n_splits=num_subjects,
                                          shuffle=False)
    scores = model_selection.cross_val_score(clf, list(zip(raw_data, raw_data)),
                                             y=labels,
                                             cv=skf)
    print(scores)
    logger.info(
        'the overall cross validation accuracy is %.2f' %
        np.mean(scores)
    )
项目:SecuML    作者:ANSSI-FR    | 项目源码 | 文件源码
def setBestParameters(self):
        cv = StratifiedKFold(n_splits = self.conf.num_folds)
        param_grid = self.conf.getParamGrid()
        if param_grid is None:
            # No parameter value to select
            return
        if self.conf.families_supervision:
            scoring = 'f1_macro'
        else:
            scoring = 'roc_auc'
        grid_search = GridSearchCV(self.pipeline, param_grid = param_grid,
                scoring = scoring,
                cv = cv,
                n_jobs = -1,
                fit_params = {'model__sample_weight': self.datasets.sample_weight})
        grid_search.fit(self.datasets.train_instances.getFeatures(),
                self.getSupervision(self.datasets.train_instances))
        self.conf.setBestValues(grid_search)
        self.pipeline.set_params(**self.conf.getBestValues())
        return cv
项目:scikit-mdr    作者:EpistasisLab    | 项目源码 | 文件源码
def test_mdr_sklearn_pipeline():
    """Ensure that MDR can be used as a transformer in a scikit-learn pipeline"""
    features = np.array([[2,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    0],
                         [1,    1],
                         [1,    1]])

    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes, cv=StratifiedKFold(n_splits=5, shuffle=True))
    assert np.mean(cv_scores) > 0.
项目:scikit-mdr    作者:EpistasisLab    | 项目源码 | 文件源码
def test_mdr_sklearn_pipeline_parallel():
    """Ensure that MDR can be used as a transformer in a parallelized scikit-learn pipeline"""
    features = np.array([[2,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    0],
                         [1,    1],
                         [1,    1]])

    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes, cv=StratifiedKFold(n_splits=5, shuffle=True), n_jobs=-1)
    assert np.mean(cv_scores) > 0.
项目:heamy    作者:rushter    | 项目源码 | 文件源码
def kfold(self, k=5, stratify=False, shuffle=True, seed=33):
        """K-Folds cross validation iterator.

        Parameters
        ----------
        k : int, default 5
        stratify : bool, default False
        shuffle : bool, default True
        seed : int, default 33

        Yields
        -------
        X_train, y_train, X_test, y_test, train_index, test_index
        """
        if stratify:
            kf = StratifiedKFold(n_splits=k, random_state=seed, shuffle=shuffle)
        else:
            kf = KFold(n_splits=k, random_state=seed, shuffle=shuffle)

        for train_index, test_index in kf.split(self.X_train, self.y_train):
            X_train, y_train = idx(self.X_train, train_index), self.y_train[train_index]
            X_test, y_test = idx(self.X_train, test_index), self.y_train[test_index]
            yield X_train, y_train, X_test, y_test, train_index, test_index
项目:flexmatcher    作者:biggorilla-gh    | 项目源码 | 文件源码
def predict_training(self, folds=5):
        """Do cross-validation and return probabilities for each data-point.

        Args:
            folds (int): Number of folds used for prediction on training data.
        """
        partial_clf = linear_model.LogisticRegression(class_weight='balanced')
        prediction = np.zeros((len(self.features), self.num_classes))
        skf = StratifiedKFold(n_splits=folds)
        for train_index, test_index in skf.split(self.features, self.labels):
            # prepare the training and test data
            training_features = self.features[train_index]
            test_features = self.features[test_index]
            training_labels = self.labels[train_index]
            # fitting the model and predicting
            partial_clf.fit(training_features, training_labels)
            curr_pred = partial_clf.predict_proba(test_features)
            prediction[test_index] = \
                self.predict_proba_ordered(curr_pred, partial_clf.classes_)
        return prediction
项目:flexmatcher    作者:biggorilla-gh    | 项目源码 | 文件源码
def predict_training(self, folds=5):
        """Do cross-validation and return probabilities for each data-point.

        Args:
            folds (int): Number of folds used for prediction on training data.
        """
        partial_clf = linear_model.LogisticRegression(class_weight='balanced')
        prediction = np.zeros((len(self.features), self.num_classes))
        skf = StratifiedKFold(n_splits=folds)
        for train_index, test_index in skf.split(self.features, self.labels):
            # prepare the training and test data
            training_features = self.features[train_index]
            test_features = self.features[test_index]
            training_labels = self.labels[train_index]
            # fitting the model and predicting
            partial_clf.fit(training_features, training_labels)
            curr_pred = partial_clf.predict_proba(test_features)
            prediction[test_index] = \
                self.predict_proba_ordered(curr_pred, partial_clf.classes_)
        return prediction
项目:flexmatcher    作者:biggorilla-gh    | 项目源码 | 文件源码
def predict_training(self, folds=5):
        """Do cross-validation and return probabilities for each data-point.

        Args:
            folds (int): Number of folds used for prediction on training data.
        """
        prediction = np.zeros((len(self.strings), self.num_classes))
        skf = StratifiedKFold(n_splits=folds)
        for train_index, test_index in skf.split(self.strings, self.labels):
            # prepare the training and test data
            training_strings = self.strings[train_index]
            test_strings = self.strings[test_index]
            training_labels = self.labels[train_index]
            # predicting the results
            part_prediction = self.find_knn(training_strings, training_labels,
                                            test_strings)
            prediction[test_index] = part_prediction
        return prediction
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True, random_state=s + base_seed)
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_stratified_kfold_ratios():
    # Check that stratified kfold preserves class ratios in individual splits
    # Repeat with shuffling turned off and on
    n_samples = 1000
    X = np.ones(n_samples)
    y = np.array([4] * int(0.10 * n_samples) +
                 [0] * int(0.89 * n_samples) +
                 [1] * int(0.01 * n_samples))

    for shuffle in (False, True):
        for train, test in StratifiedKFold(5, shuffle=shuffle).split(X, y):
            assert_almost_equal(np.sum(y[train] == 4) / len(train), 0.10, 2)
            assert_almost_equal(np.sum(y[train] == 0) / len(train), 0.89, 2)
            assert_almost_equal(np.sum(y[train] == 1) / len(train), 0.01, 2)
            assert_almost_equal(np.sum(y[test] == 4) / len(test), 0.10, 2)
            assert_almost_equal(np.sum(y[test] == 0) / len(test), 0.89, 2)
            assert_almost_equal(np.sum(y[test] == 1) / len(test), 0.01, 2)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_stratifiedkfold_balance():
    # Check that KFold returns folds with balanced sizes (only when
    # stratification is possible)
    # Repeat with shuffling turned off and on
    X = np.ones(17)
    y = [0] * 3 + [1] * 14

    for shuffle in (True, False):
        cv = StratifiedKFold(3, shuffle=shuffle)
        for i in range(11, 17):
            skf = cv.split(X[:i], y[:i])
            sizes = []
            for _, test in skf:
                sizes.append(len(test))

            assert_true((np.max(sizes) - np.min(sizes)) <= 1)
            assert_equal(np.sum(sizes), i)
项目:hidi    作者:VEVO    | 项目源码 | 文件源码
def transform(self, M,  **kwargs):
        """
        Takes a Takes a dataframe that has :code:`item_id` index, other
        'features' columns for prediction, and applies a Keras sequential
        model to it.

        :param M:
            a dataframe that has an :code:`item_id` index, and
            "features" columns.

        :type M: pandas.DataFrame
        :rtype: a tuple with trained Keras model and its keyword
            arguments
        """
        rows, columns = M.shape
        factors = M.merge(self.validation_matrix, left_index=True,
                          right_index=True)
        factors = factors.values

        if self.classification:
            kfold = StratifiedKFold(n_splits=self.kfold_n_splits,
                                    random_state=self.kfold_seed,
                                    shuffle=self.kfold_shuffle)
        else:
            kfold = KFold(n_splits=self.kfold_n_splits,
                          random_state=self.kfold_seed,
                          shuffle=self.kfold_shuffle)

        X = factors[:, :columns]
        Y = factors[:, columns:]
        for train_index, test_index in kfold.split(X, Y):
            self.keras_model.fit(
                X[train_index], Y[train_index],
                validation_data=[X[test_index], Y[train_index]],
                **self.keras_kwargs)

        return self.keras_model, kwargs
项目:deep-mil-for-whole-mammogram-classification    作者:wentaozhu    | 项目源码 | 文件源码
def cvsplitenhance(fold, totalfold, mydict, valfold=-1):
  '''get the split of train and test
  fold is the returned fold th data, from 0 to totalfold-1
  total fold is for the cross validation
  mydict is the return dict from readlabel
  sperate the data into train, validation, test'''
  skf = StratifiedKFold(n_splits=totalfold)  # default shuffle is false, okay!
  #readdicom(mydict)
  y = mydict.values()
  x = mydict.keys()
  count = 0
  if valfold == -1: 
    valfold = (fold+1) % totalfold
  print('valfold'+str(valfold))
  trainls, valls, testls = [], [], []
  for train, test in skf.split(x,y):
    print(len(train), len(test))
    if count == fold:
      #print test[:]
      testls = test[:]
    elif count == valfold:
      valls = test[:]
    else:
      for i in test:
        trainls.append(i)
    count += 1
  return trainls, valls, testls
项目:website-fingerprinting    作者:AxelGoetz    | 项目源码 | 文件源码
def k_fold_validation(model, monitored_data, unmonitored_data, k, random_state=123):
    """
    Performs k fold validation on a model. During each fold, records all of the scoring in the `scoring_methods` module.

    @param model is a machine learning model that has the functions `fit(X, y)` and `predict(X)`
    @param monitored_data an array-like matrix that has the following structure `[(features, value)]`
    @param unmonitored_data is also an array-like object: [features]
    @param k is the amount of folds

    @return is a 2D array of scores, with the following structure `[{scoring_method: score}]` where the shape is `len(k)`
    """
    X, y = get_X_y(monitored_data, unmonitored_data)
    skf = StratifiedKFold(n_splits=k, random_state=random_state, shuffle=True)

    evaluations = []
    i = 1
    for train, test in skf.split(X, y):
        print("Starting split {}".format(i))
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]

        print("Fitting data")
        model.fit(X_train, y_train)

        print("Predicting")
        prediction = model.predict(X_test)

        evaluations.append(scoring_methods.evaluate_model(prediction, y_test))

        print(evaluations[-1])

        i += 1

    return evaluations
项目:brainiak    作者:brainiak    | 项目源码 | 文件源码
def _cross_validation_for_one_voxel(clf, vid, num_folds, subject_data, labels):
    """Score classifier on data using cross validation."""
    # no shuffling in cv
    skf = model_selection.StratifiedKFold(n_splits=num_folds,
                                          shuffle=False)
    scores = model_selection.cross_val_score(clf, subject_data,
                                             y=labels,
                                             cv=skf, n_jobs=1)
    logger.debug(
        'cross validation for voxel %d is done' %
        vid
    )
    return (vid, scores.mean())
项目:MLClass    作者:bm2-lab    | 项目源码 | 文件源码
def split_kfold_c(y):
    skf = StratifiedKFold(5)
    ilst = []
    for tri, tei in skf.split(np.zeros(len(y)), y):
        ilst.append((tri, tei))
    return ilst
项目:pydl    作者:rafaeltg    | 项目源码 | 文件源码
def get_cv_method(method, **kwargs):

    if method == 'kfold':
        return KFold(**kwargs)
    elif method == 'skfold':
        return StratifiedKFold(**kwargs)
    elif method == 'loo':
        return LeaveOneOut()
    elif method == 'shuffle_split':
        return ShuffleSplit(**kwargs)
    elif method == 'split':
        return TrainTestSplit(**kwargs)
    elif method == 's_shuffle_split':
        return StratifiedShuffleSplit(**kwargs)
    elif method == 'time_series':
        return TimeSeriesSplit(**kwargs)
    else:
        raise AttributeError('Invalid CV method - %s!' % method)
项目:DistributedClassifier    作者:rsboos    | 项目源码 | 文件源码
def computeAccuracyForSingleModel(self,algorithm="SVM",isLocalSmall=0,execType="normal"):
        totalFeatures = self.instancesFeatures.shape[1]
        n = min(5, totalFeatures/2) # as explained in the article, the number of local agents will be 5
        numberOfFeaturesInEachModel = int( math.ceil (totalFeatures / n) )
        if (isLocalSmall):
            instFeatures = dataPreparation.selectNRandomColumns(self.instancesFeatures,numberOfFeaturesInEachModel)
            #select random numberOfFeatures columns
        else:
            instFeatures = np.array(self.instancesFeatures)

        skf = StratifiedKFold(n_splits=self.kFolds)
        avgScore = 0
        avgF1Macro = 0
        avgF1Micro = 0
        avgF1Weighted = 0
        for train_index, test_index in skf.split(instFeatures, self.instancesClasses):
            resultClasses = classifier.MakeClassification(self.algorithmsIndex[algorithm],instFeatures[train_index],self.instancesClasses[train_index],instFeatures[test_index],"value")
            valF1Macro = f1_score(self.instancesClasses[test_index], resultClasses, average='macro')
            valF1Micro = f1_score(self.instancesClasses[test_index], resultClasses, average='micro')
            valF1Weighted = f1_score(self.instancesClasses[test_index], resultClasses, average='weighted')
            valScore = accuracy_score(self.instancesClasses[test_index],resultClasses)
            avgF1Macro += valF1Macro
            avgF1Micro += valF1Micro
            avgF1Weighted += valF1Weighted
            avgScore += valScore
            with open(self.fileToWrite, "a") as myfile:
                myfile.write(str(valF1Weighted)+"\t"+str(valF1Micro)+"\t"+str(valF1Macro)+"\t"+str(valScore)+"\n")
        avgScore = avgScore / self.kFolds
        avgF1Macro /= self.kFolds
        avgF1Weighted /= self.kFolds
        avgF1Micro /= self.kFolds
        return avgScore, avgF1Macro, avgF1Micro, avgF1Weighted

    # this function will call all the underlying methods in order to perform data prepation, classification in each simulated agent, and aggregation
项目:eezzy    作者:3Blades    | 项目源码 | 文件源码
def plot_significance_score(model, X, Y):
    from sklearn.model_selection import permutation_test_score, StratifiedKFold
    cv = StratifiedKFold(10)
    # Must be numpy arrays to use permutation as opposed to pandas data frame
    score, permutation_scores, pvalue = permutation_test_score(model,
                X.values, Y.values, scoring="roc_auc", cv=cv,
                n_permutations=100)
    print("Classification Score %s (p-value: %s)" % (score, pvalue))

# -------------------- Matplotlib Graphs -------------------- #
#-------------------------------------------------------------#
#-------------------------------------------------------------#
项目:xcessiv    作者:reiinakano    | 项目源码 | 文件源码
def setUp(self):
        bl1 = RandomForestClassifier(random_state=8)
        bl2 = LogisticRegression()
        bl3 = RandomForestClassifier(max_depth=10, random_state=10)

        meta_est = LogisticRegression()

        skf = StratifiedKFold(random_state=8).split

        self.stacked_ensemble = stacker.XcessivStackedEnsemble(
            [bl1, bl2, bl3],
            ['predict', 'predict_proba', 'predict_proba'],
            meta_est,
            skf
        )
项目:xcessiv    作者:reiinakano    | 项目源码 | 文件源码
def get_sample_dataset(dataset_properties):
    """Returns sample dataset

    Args:
        dataset_properties (dict): Dictionary corresponding to the properties of the dataset
            used to verify the estimator and metric generators.

    Returns:
        X (array-like): Features array

        y (array-like): Labels array

        splits (iterator): This is an iterator that returns train test splits for
            cross-validation purposes on ``X`` and ``y``.
    """
    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')
    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
        except Exception as e:
            raise exceptions.UserError(repr(e))
    elif data_type == 'iris':
        X, y = datasets.load_iris(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'mnist':
        X, y = datasets.load_digits(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'breast_cancer':
        X, y = datasets.load_breast_cancer(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'boston':
        X, y = datasets.load_boston(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    elif data_type == 'diabetes':
        X, y = datasets.load_diabetes(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    else:
        raise exceptions.UserError('Unknown dataset type {}'.format(dataset_properties['type']))
    return X, y, splits
项目:Sentence-Classification-with-RNN-in-TensorFlow    作者:zakizhou    | 项目源码 | 文件源码
def build_corpus():
    positive_sentences = codecs.open("rt-polaritydata/rt-polarity.pos").readlines()
    negative_sentences = codecs.open("rt-polaritydata/rt-polarity.neg").readlines()
    num_positive = len(positive_sentences)
    num_negative = len(negative_sentences)
    labels = [1] * num_positive + [0] * num_negative
    sentences = positive_sentences + negative_sentences
    clean = [word_tokenize(clean_sentence(sentence)) for sentence in sentences]
    total = reduce(lambda sent1, sent2: sent1 + sent2, clean)
    counter = collections.Counter(total)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    word2id = dict(zip(words, range(3, len(words)+3)))
    word2id["<pad>"] = 0
    word2id["<sos>"] = 1
    word2id["<eos>"] = 2
    inputs = []
    for sent in clean:
        stantard_sent = [1] + [word2id[word] for word in sent] + [2]
        inputs.append(stantard_sent)
    skf = StratifiedKFold(n_splits=5)
    inputs_array = np.array(inputs)
    labels_array = np.array(labels)
    train_index, validation_index = skf.split(inputs_array, labels_array).next()
    np.random.shuffle(train_index)
    np.random.shuffle(validation_index)
    train_X, train_y = inputs_array[train_index], labels_array[train_index]
    valid_X, valid_y = inputs_array[validation_index], labels_array[validation_index]
    return word2id, train_X, train_y, valid_X, valid_y
项目:bayesianpy    作者:morganics    | 项目源码 | 文件源码
def _get_cv_splits(self, df):
        if self._cv_method is None:
            self._cv_method = StratifiedKFold(n_splits=self._kfolds, shuffle=self._shuffle)

        for train, test in self._cv_method.split(df, df[self._class_col]):
            yield df.ix[train], df.ix[test]
项目:HASY    作者:MartinThoma    | 项目源码 | 文件源码
def _create_stratified_split(csv_filepath, n_splits):
    """
    Create a stratified split for the classification task.

    Parameters
    ----------
    csv_filepath : str
        Path to a CSV file which points to images
    n_splits : int
        Number of splits to make
    """
    from sklearn.model_selection import StratifiedKFold
    data = _load_csv(csv_filepath)
    labels = [el['symbol_id'] for el in data]
    skf = StratifiedKFold(labels, n_folds=n_splits)
    i = 1
    kdirectory = 'classification-task'
    if not os.path.exists(kdirectory):
            os.makedirs(kdirectory)
    for train_index, test_index in skf:
        print("Create fold %i" % i)
        directory = "%s/fold-%i" % (kdirectory, i)
        if not os.path.exists(directory):
            os.makedirs(directory)
        else:
            print("Directory '%s' already exists. Please remove it." %
                  directory)
        i += 1
        train = [data[el] for el in train_index]
        test_ = [data[el] for el in test_index]
        for dataset, name in [(train, 'train'), (test_, 'test')]:
            with open("%s/%s.csv" % (directory, name), 'wb') as csv_file:
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow(('path', 'symbol_id', 'latex', 'user_id'))
                for el in dataset:
                    csv_writer.writerow(("../../%s" % el['path'],
                                         el['symbol_id'],
                                         el['latex'],
                                         el['user_id']))
项目:HASY    作者:MartinThoma    | 项目源码 | 文件源码
def _create_stratified_split(csv_filepath, n_splits):
    """
    Create a stratified split for the classification task.

    Parameters
    ----------
    csv_filepath : str
        Path to a CSV file which points to images
    n_splits : int
        Number of splits to make
    """
    from sklearn.model_selection import StratifiedKFold
    data = _load_csv(csv_filepath)
    labels = [el['symbol_id'] for el in data]
    skf = StratifiedKFold(labels, n_folds=n_splits)
    i = 1
    kdirectory = 'classification-task'
    if not os.path.exists(kdirectory):
            os.makedirs(kdirectory)
    for train_index, test_index in skf:
        print("Create fold %i" % i)
        directory = "%s/fold-%i" % (kdirectory, i)
        if not os.path.exists(directory):
            os.makedirs(directory)
        else:
            print("Directory '%s' already exists. Please remove it." %
                  directory)
        i += 1
        train = [data[el] for el in train_index]
        test_ = [data[el] for el in test_index]
        for dataset, name in [(train, 'train'), (test_, 'test')]:
            with open("%s/%s.csv" % (directory, name), 'wb') as csv_file:
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow(('path', 'symbol_id', 'latex', 'user_id'))
                for el in dataset:
                    csv_writer.writerow(("../../%s" % el['path'],
                                         el['symbol_id'],
                                         el['latex'],
                                         el['user_id']))
项目:kaggle-quora-solution-8th    作者:qqgeogor    | 项目源码 | 文件源码
def make_mf_classification(X ,y, clf, X_test, n_folds=5,seed=1024,nb_epoch=50,max_features=0.75,name='xgb',path=''):
    n = X.shape[0]
    '''
    Fit metafeature by @clf and get prediction for test. Assumed that @clf -- classifier
    '''
    print clf
    np.random.seed(seed)
    feature_index = np.arange(X.shape[1])
    for epoch in range(nb_epoch):
        print "Start epoch:",epoch
        mf_tr = np.zeros((X.shape[0],len(np.unique(y))))
        mf_te = np.zeros((X_test.shape[0],len(np.unique(y))))
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X,y)

        np.random.shuffle(feature_index)
        new_index = feature_index[:int(max_features*len(feature_index))]

        for ind_tr, ind_te in skf:
            if ssp.issparse(X):
                X_tr = X[ind_tr].tocsc()[:,new_index]
                X_te = X[ind_te].tocsc()[:,new_index]
            else:
                X_tr = X[ind_tr][:,new_index]
                X_te = X[ind_te][:,new_index]

            y_tr = y[ind_tr]
            y_te = y[ind_te]

            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict_proba(X_te)
            mf_te += clf.predict_proba(X_test[:,new_index])
            score = log_loss(y_te, mf_tr[ind_te])
            print '\tpred[{}] score:{}'.format(epoch, score)
        mf_te/=n_folds
        pd.to_pickle(mf_tr,path+'X_mf_%s_%s_random_r.pkl'%(name,epoch))
        pd.to_pickle(mf_te,path+'X_t_mf_%s_%s_random_r.pkl'%(name,epoch))
项目:kaggle-quora-solution-8th    作者:qqgeogor    | 项目源码 | 文件源码
def make_mf_lsvc_classification(X ,y, clf, X_test, n_folds=5,seed=1024,nb_epoch=50,max_features=0.75,name='xgb',path=''):
    n = X.shape[0]
    '''
    Fit metafeature by @clf and get prediction for test. Assumed that @clf -- classifier
    '''
    print clf
    for epoch in range(nb_epoch):
        print "Start epoch:",epoch
        mf_tr = np.zeros(X.shape[0])
        mf_te = np.zeros(X_test.shape[0])
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X,y)


        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]

            y_tr = y[ind_tr]
            y_te = y[ind_te]
            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict_proba(X_te).ravel()
            score = accuracy_score(y_te, clf.predict(X_te).ravel())
            del X_tr
            del X_te

            mf_te += clf.predict_proba(X_test).ravel()

            print '\tpred[{}] score:{}'.format(epoch, score)
        mf_te/=n_folds
        pd.to_pickle(mf_tr.reshape(-1,1),path+'X_mf_%s_%s_random.pkl'%(name,epoch))
        pd.to_pickle(mf_te.reshape(-1,1),path+'X_t_mf_%s_%s_random.pkl'%(name,epoch))
项目:kaggle-quora-solution-8th    作者:qqgeogor    | 项目源码 | 文件源码
def make_mf_regression(X ,y, clf, X_test, n_folds=5,seed=1024,nb_epoch=50,max_features=0.75,name='xgb',path=''):
    n = X.shape[0]
    '''
    Fit metafeature by @clf and get prediction for test. Assumed that @clf -- classifier
    '''
    print clf
    for epoch in range(nb_epoch):
        print "Start epoch:",epoch
        mf_tr = np.zeros(X.shape[0])
        mf_te = np.zeros(X_test.shape[0])
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X,y)


        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]

            y_tr = y[ind_tr]
            y_te = y[ind_te]
            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict(X_te)
            del X_tr
            del X_te

            l = 600000
            y_pred = []
            for batch in range(4):
                X_tmp = X_test[l*batch:l*(batch+1)]
                y_pred.append(clf.predict(X_tmp))
            y_pred = np.concatenate(y_pred)
            mf_te += y_pred
            score = log_loss(y_te, mf_tr[ind_te])
            print '\tpred[{}] score:{}'.format(epoch, score)
        mf_te/=n_folds
        pd.to_pickle(mf_tr,path+'X_mf_%s_%s_random.pkl'%(name,epoch))
        pd.to_pickle(mf_te,path+'X_t_mf_%s_%s_random.pkl'%(name,epoch))
项目:xgboost-tuner    作者:cwerner87    | 项目源码 | 文件源码
def tune_num_estimators(metric: str,
                        label: np.ndarray,
                        params: dict,
                        strat_folds: StratifiedKFold,
                        train) -> Tuple[int, float]:
    """
    Uses xgboost's cross-validation method to tune the number of estimators and returns that along with the best CV score
    achieved.

    :param metric:
        Evaluation metric that is monitored during cross-validation - e.g. 'logloss' or 'rmse'.
    :param label:
        An array-like containing the labels of the classification or regression problem.
    :param params:
        A dictionary of XGB parameters.
    :param strat_folds:
        A StratifiedKFold object to cross validate the parameters.
    :param train:
        An array-like containing the training input samples.
    :return:
        A tuple containing the tuned number of estimators along with the best CV score achieved.
    """
    eval_hist = xgb.cv(
        dtrain=xgb.DMatrix(train, label=label),
        early_stopping_rounds=50,
        folds=strat_folds,
        metrics=metric,
        num_boost_round=10000,
        params=params,
        verbose_eval=True
    )
    num_trees = eval_hist.shape[0]
    best_score = eval_hist.values[num_trees - 1, 0]
    return num_trees, best_score
项目:anomalous-vertices-detection    作者:Kagandi    | 项目源码 | 文件源码
def split_kfold(self, features, labels=None, n_folds=10):
        skf = StratifiedKFold(n_folds)
        for train_index, test_index in skf.split(features, labels):
            yield train_index, test_index
            # return cross_validation.StratifiedKFold(labels, n_folds)
项目:xam    作者:MaxHalford    | 项目源码 | 文件源码
def __init__(self, models, meta_model, cv=model_selection.StratifiedKFold(n_splits=3),
                 use_base_features=True, use_proba=True):
        super().__init__(
            models=models,
            meta_model=meta_model,
            cv=cv,
            use_base_features=use_base_features,
            use_proba=use_proba,
        )
项目:b4msa    作者:INGEOTEC    | 项目源码 | 文件源码
def predict_kfold(cls, X, y, n_folds=10, seed=0, textModel_params={},
                      kfolds=None, pool=None, use_tqdm=True):
        try:
            from tqdm import tqdm
        except ImportError:
            def tqdm(x, **kwargs):
                return x

        le = preprocessing.LabelEncoder().fit(y)
        y = np.array(le.transform(y))
        hy = np.zeros(len(y), dtype=np.int)
        if kfolds is None:
            kfolds = StratifiedKFold(n_splits=n_folds, shuffle=True,
                                     random_state=seed).split(X, y)
        args = [(X, y, tr, ts, textModel_params) for tr, ts in kfolds]
        if pool is not None:
            if use_tqdm:
                res = [x for x in tqdm(pool.imap_unordered(cls.train_predict_pool, args),
                                       desc='Params', total=len(args))]
            else:
                res = [x for x in pool.imap_unordered(cls.train_predict_pool, args)]
        else:
            if use_tqdm:
                args = tqdm(args)
            res = [cls.train_predict_pool(x) for x in args]
        for ts, _hy in res:
            hy[ts] = _hy
        return le.inverse_transform(hy)
项目:b4msa    作者:INGEOTEC    | 项目源码 | 文件源码
def __init__(self, X, y, score, n_folds, cls, seed=0, pool=None):
        self.n_folds = n_folds
        self.score = score
        self.X = X
        self.le = le = preprocessing.LabelEncoder().fit(y)
        self.y = np.array(le.transform(y))
        self.cls = cls
        self.pool = pool
        np.random.seed(seed)
        self.kfolds = [x for x in StratifiedKFold(n_splits=n_folds, shuffle=True,
                                                  random_state=seed).split(np.zeros(self.y.shape[0]),
                                                                           self.y)]
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def xgb1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 7
    N_seeds = 2
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
            max_depth = 5,
            learning_rate = 0.03,
            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            print('step %d of %d'%(n+1, skf.n_splits), now())
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            dtest = xgb.DMatrix(test2)
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=1000)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d: '%(xgb_params['seed'], n+1), score, now())
            scores.append(score)

    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def xgb2(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 7
    N_seeds = 2
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
            max_depth = 4,
            learning_rate = 0.03,
            subsample = 0.7,
            #colsample_bytree = 0.8,
            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            print('step %d of %d'%(n+1, skf.n_splits), now())
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=1000)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d: '%(xgb_params['seed'], n+1), score, now())
            scores.append(score)

    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def xgb3(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 7
    N_seeds = 2
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
            max_depth = 4,
            learning_rate = 0.03,
            subsample = 0.8,
            #colsample_bytree = 0.8,
            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            print('step %d of %d'%(n+1, skf.n_splits), now())
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=1000)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d: '%(xgb_params['seed'], n+1), score, now())
            scores.append(score)

    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def xgb1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 7
    N_seeds = 3
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
            max_depth = 5,
            learning_rate = 0.02,
            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            dtest = xgb.DMatrix(test2)
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def xgb3(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 7
    N_seeds = 3
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
            max_depth = 4,
            learning_rate = 0.02,
            subsample = 0.8,
            colsample_bytree = 0.8,
            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds