我们从Python开源项目中,提取了以下25个代码示例,用于说明如何使用sklearn.cross_validation.StratifiedShuffleSplit()。
def CAL_v(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    """Repeatedly tune an SVC on the labelled pool and grow the pool.

    The pool starts from ``OnlineBase(...).collect_pts(100, -1)`` driven by
    the real ``oracle``.  Every round:

    1. ``svm.SVC`` is grid-searched over ``C`` and ``gamma`` using a
       5-iteration stratified shuffle split of the current pool.
    2. A second ``OnlineBase`` is built around the tuned model's ``predict``
       and asked for more points via ``collect_pts(10, 200)``.
    3. Any returned points are labelled with the real ``oracle`` and
       appended to the pool.
    4. Pool size, query counter and accuracy on ``(test_x, test_y)`` are
       printed.

    The loop ends once the query counter reaches 3500.  Nothing is returned.

    NOTE(review): the nesting was reconstructed from a whitespace-collapsed
    source; confirm that the evaluation/print belongs inside the loop.
    """
    seed_collector = OnlineBase(name, label_p, label_n, oracle, n_features,
                                ftype, error=.5)
    x, y = seed_collector.collect_pts(100, -1)
    i = 0
    q = seed_collector.get_n_query()

    param_grid = {
        'C': np.logspace(-2, 5, 10, base=10),
        'gamma': np.logspace(-5, 1, 10, base=10),
    }

    while q < 3500:
        i += 1

        # Re-tune on everything labelled so far.
        splitter = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2,
                                          random_state=42)
        search = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=splitter,
                              verbose=0, n_jobs=-1)
        search.fit(x, y)
        h_ = search.best_estimator_

        # Use the tuned model as the oracle of a fresh collector.
        probe = OnlineBase('', label_p, label_n, h_.predict, n_features,
                           ftype, error=.1)
        x_, _ = probe.collect_pts(10, 200)
        if x_ is not None and len(x_) > 0:
            x.extend(x_)
            y.extend(oracle(x_))
        q += probe.get_n_query()

        accuracy = sm.accuracy_score(test_y, h_.predict(test_x))
        print('%s %s %s' % (len(x), q, accuracy))
def grid_retrain_in_f(self, n_dim=500):
    """Fit a random-Fourier-feature linear SVM on the extracted set and benchmark it.

    Args:
        n_dim: number of random Fourier components generated by the
            ``RBFSampler`` stage of the pipeline.

    Returns:
        The value of ``self.benchmark()`` after the fitted pipeline has been
        installed with ``self.set_clf2``.
    """
    # Pass n_dim by keyword: RBFSampler's first positional parameter is
    # ``gamma``, so ``RBFSampler(n_dim, ...)`` would set gamma=n_dim and leave
    # the number of components at its default.
    rbf_map = RBFSampler(n_components=n_dim, random_state=1)
    fourier_approx_svm = pipeline.Pipeline([("mapper", rbf_map),
                                            ("svm", LinearSVC())])

    # NOTE: a grid search over mapper__gamma / svm__C used to live here and
    # is currently disabled; the pipeline is fitted with default
    # hyper-parameters instead.
    fourier_approx_svm.fit(self.X_ex, self.y_ex)

    self.set_clf2(fourier_approx_svm)
    return self.benchmark()
def grid_search(self):
    """Grid-search ``C`` for a polynomial-kernel SVC and report its accuracies.

    ``C`` is searched over ``2**-5 .. 2**15`` (21 log-spaced values) with a
    5-iteration stratified shuffle split of ``self.y_ex``.  The best
    estimator is then scored on the extraction set, the validation set and
    the test set.

    Returns:
        A ``Result`` labelled ``'Poly'`` holding the number of extraction
        samples and the train / validation / test accuracies.
    """
    param_grid = dict(C=np.logspace(-5, 15, 21, base=2))
    cv = StratifiedShuffleSplit(self.y_ex, n_iter=5, test_size=0.2,
                                random_state=42)
    grid = GridSearchCV(SVC(kernel='poly', max_iter=10000),
                        param_grid=param_grid, cv=cv, n_jobs=1, verbose=0)

    # The log label matches the kernel actually searched (it used to say
    # "Linear", which made the logs indistinguishable from the linear search).
    logger.info('start grid search for Poly')
    grid.fit(self.X_ex, self.y_ex)
    logger.info('end grid search for Poly')

    # final train
    clf = grid.best_estimator_
    pred_train = clf.predict(self.X_ex)
    pred_val = clf.predict(self.val_x)
    pred_test = clf.predict(self.test_x)

    return Result(self.name + ' (X)', 'Poly', len(self.X_ex),
                  sm.accuracy_score(self.y_ex, pred_train),
                  sm.accuracy_score(self.val_y, pred_val),
                  sm.accuracy_score(self.test_y, pred_test))
def test_stratified_shuffle_split_init():
    """StratifiedShuffleSplit must reject inconsistent size arguments."""
    make_split = cval.StratifiedShuffleSplit

    y = np.asarray([0, 1, 1, 1, 2, 2, 2])
    unbalanced_cases = [
        (3, 0.2),   # a class with only one sample
        (3, 2),     # test set size smaller than n_classes
        (3, 3, 2),  # train set size smaller than n_classes
    ]
    for extra_args in unbalanced_cases:
        assert_raises(ValueError, make_split, y, *extra_args)

    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])
    # Not enough samples to satisfy the requested train and test sizes.
    oversized_cases = [
        (3, 0.5, 0.6),
        (3, 8, 0.6),
        (3, 0.6, 8),
    ]
    for extra_args in oversized_cases:
        assert_raises(ValueError, make_split, y, *extra_args)

    # Train size or test size too small.
    for size_kwarg in ({'train_size': 2}, {'test_size': 2}):
        assert_raises(ValueError, make_split, y, **size_kwarg)
def test_stratified_shuffle_split_iter():
    """Every split must cover all classes, keep proportions and be disjoint."""
    label_sets = [
        np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
        np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
        np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
        np.array([-1] * 800 + [1] * 50),
    ]

    for y in label_sets:
        splitter = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
                                               random_state=0)
        for train, test in splitter:
            y_train = y[train]
            y_test = y[test]

            train_classes, train_inverse = np.unique(y_train,
                                                     return_inverse=True)
            test_classes, test_inverse = np.unique(y_test,
                                                   return_inverse=True)

            # Both sides see exactly the same set of classes.
            assert_array_equal(train_classes, test_classes)

            # Class proportions agree between both sides (1 decimal).
            p_train = np.bincount(train_inverse) / float(len(y_train))
            p_test = np.bincount(test_inverse) / float(len(y_test))
            assert_array_almost_equal(p_train, p_test, 1)

            # Train and test partition the samples without overlap.
            assert_equal(y_train.size + y_test.size, y.size)
            assert_array_equal(np.intersect1d(train, test), [])
def split_indices_old(files, labels, test_size=0.1, random_state=RANDOM_STATE):
    """Return train/test indices from one stratified split of per-patient labels.

    The ``labels`` argument is ignored: labels are recomputed from ``files``
    via ``get_names`` and ``get_labels(..., per_patient=True)``, and the
    first label column drives the stratification.  Every split index ``k``
    is expanded to the pair ``2k`` and ``2k + 1``.

    NOTE(review): the expansion presumably maps each patient to two
    consecutive files -- confirm against the data layout.

    Returns:
        ``(train_indices, test_indices)`` as 1-D arrays.
    """
    patient_labels = get_labels(get_names(files), per_patient=True)
    splitter = cross_validation.StratifiedShuffleSplit(
        patient_labels[:, 0],
        test_size=test_size,
        random_state=random_state,
        n_iter=1,
    )
    train_part, test_part = next(iter(splitter))

    def _expand(indices):
        # Even positions first, then the matching odd positions.
        return np.hstack([indices * 2, indices * 2 + 1])

    return _expand(train_part), _expand(test_part)
def split_indices(files, labels, label_file, test_size=0.1, random_state=RANDOM_STATE):
    """Return train/test indices from one stratified split of per-file labels.

    The ``labels`` argument is ignored: labels are recomputed from ``files``
    via ``get_names`` and ``get_labels(..., label_file=label_file,
    per_patient=False)``.  The per-file (not per-patient) variant is needed
    when training on the melanoma database.

    Returns:
        ``(train_indices, test_indices)`` of the single split.
    """
    file_labels = get_labels(get_names(files), label_file=label_file,
                             per_patient=False)
    splitter = cross_validation.StratifiedShuffleSplit(
        file_labels,
        test_size=test_size,
        random_state=random_state,
        n_iter=1,
    )
    train_indices, test_indices = next(iter(splitter))
    return train_indices, test_indices
def hot(X, y):
    """Plot a heat map of 10-fold mean ROC-AUC over an RBF-SVC (C, gamma) grid.

    ``C`` and ``gamma`` each take 31 values, ``2**-15 .. 2**15``.  For every
    pair an ``svm.SVC(kernel='rbf', probability=True)`` is fitted on each
    ``KFold`` training part and scored with the ``"roc_auc"`` scorer on the
    held-out part.  The mean scores are printed as a matrix (rows = C,
    columns = gamma) and shown with matplotlib.  Nothing is returned.
    """
    C_range = np.logspace(-15, 15, 31, base=2.0)
    gamma_range = np.logspace(-15, 15, 31, base=2.0)
    roc_auc_scorer = get_scorer("roc_auc")

    def _mean_fold_auc(C, gamma):
        # Average held-out AUC of one (C, gamma) setting across the folds.
        fold_aucs = []
        for train, test in KFold(n=len(X), n_folds=10, random_state=42):
            model = svm.SVC(C=C, kernel='rbf', gamma=gamma, probability=True)
            fitted = model.fit(X[train], y[train])
            fold_aucs.append(roc_auc_scorer(fitted, X[test], y[test]))
        return np.mean(fold_aucs)

    # C varies slowest, so the flat list reshapes into (C, gamma).
    flat_scores = [_mean_fold_auc(C, gamma)
                   for C in C_range
                   for gamma in gamma_range]
    scores = np.array(flat_scores).reshape(len(C_range), len(gamma_range))
    print(scores)

    plt.figure(figsize=(15, 12))
    plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
    plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
               norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
    plt.xlabel('gamma')
    plt.ylabel('C')
    plt.colorbar()
    plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=90)
    plt.yticks(np.arange(len(C_range)), C_range)
    plt.title('AUC')
    plt.show()
def _train_val_split_indices(labels):
    """Draw one stratified train/validation split and persist its description.

    The validation share is ``VAL_SIZE``.  The split's classes and indices
    are handed to ``_save_organized_data_info`` twice, first with
    ``multi_crop=False`` and then with ``multi_crop=True``.

    Returns:
        ``(train_indices, validation_indices, classes)``.
    """
    splitter = StratifiedShuffleSplit(labels, n_iter=1, test_size=VAL_SIZE,
                                      random_state=42)
    indices_tr, indices_val = next(iter(splitter))

    for use_multi_crop in (False, True):
        _save_organized_data_info(splitter.classes, indices_tr, indices_val,
                                  multi_crop=use_multi_crop)

    return indices_tr, indices_val, splitter.classes
def cross_predict(feat, f_name, X=X, y=y):
    """Cross-validate a feature extractor + LinearSVC pipeline and print a report.

    Args:
        feat: feature-extraction step placed in front of the classifier.
        f_name: label printed in the report header.
        X, y: samples and targets; default to the module-level ``X`` / ``y``
            captured when the function is defined.

    Prints the elapsed time, confusion matrix, accuracy and classification
    report.  Nothing is returned.
    """
    # One worker on Windows (os.name == 'nt'), all cores elsewhere.
    n_jobs = 1 if os.name == 'nt' else -1

    classifier = LinearSVC(C=0.02)

    # cross_val_predict needs a true partition of the data, so a
    # StratifiedShuffleSplit (whose random folds may overlap) would raise
    # "ValueError: cross_val_predict only works for partitions".
    # StratifiedKFold still preserves the per-class percentages in each fold.
    cv = cross_validation.StratifiedKFold(y, n_folds=5, random_state=42)

    model = Pipeline([('feat', feat), ('clf', classifier)])

    started = time()
    y_pred = cross_validation.cross_val_predict(model, X=X, y=y,
                                                n_jobs=n_jobs, cv=cv)
    t = time() - started

    print("=" * 20, f_name, "=" * 20)
    print("time cost: {}".format(t))
    print()
    print('confusion matrix:\n', confusion_matrix(y, y_pred))
    print()
    print('\t\taccuracy: {}'.format(accuracy_score(y, y_pred)))
    print()
    print("\t\tclassification report")
    print("-" * 52)
    print(classification_report(y, y_pred))
def do(self, n_pts):
    """Tune the RBF-sampler gamma for the solver pipeline and score it.

    Collects ``n_pts`` points, grid-searches ``mapper__gamma`` over
    ``2**-15 .. 2**6`` (22 values) for an ``RBFSampler`` -> ``HyperSolver``
    pipeline with a 5-iteration stratified shuffle split, saves a plot of
    validation score against gamma, and evaluates the best estimator on
    ``self.Xt`` / ``self.Yt``.

    Returns:
        ``(best cross-validation score, accuracy on the held-out set)``.
    """
    X, y = self.collect_pts(n_pts)
    print('done collecting points')

    rbf_solver = pipeline.Pipeline([
        ("mapper", RBFSampler(n_components=n_pts, random_state=1)),
        ("solver", HyperSolver(p=self.POS, n=self.NEG)),
    ])

    gamma_range = np.logspace(-15, 6, 22, base=2)
    splitter = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2,
                                      random_state=1)
    grid = GridSearchCV(rbf_solver,
                        param_grid=dict(mapper__gamma=gamma_range),
                        cv=splitter, n_jobs=8)
    grid.fit(X, y)

    # One mean validation score per gamma, in grid order.
    scores = np.array([entry[1] for entry in grid.grid_scores_])
    scores = scores.reshape(len(gamma_range))

    plt.figure(figsize=(8, 6))
    plt.plot(gamma_range, scores)
    plt.xlabel('gamma')
    plt.ylabel('score')
    plt.title('Validation accuracy (RTiX, %s)' % os.path.basename(self.name))
    plt.savefig(self.name + '-SLViF-grid-npts=%d.pdf' % n_pts)

    # final train
    best_gamma = grid.best_params_['mapper__gamma']
    print('best parameters are g=%f' % best_gamma)

    test_accuracy = sm.accuracy_score(self.Yt,
                                      grid.best_estimator_.predict(self.Xt))
    print('SCORE: %f' % test_accuracy)
    return grid.best_score_, test_accuracy
def grid_search(self):
    """Grid-search ``C`` for a primal LinearSVC and report its accuracies.

    ``C`` is searched over ``2**-5 .. 2**15`` (21 log-spaced values) with a
    5-iteration stratified shuffle split of ``self.y_ex``.  The best
    estimator is scored on the extraction, validation and test sets.

    Returns:
        A ``Result`` labelled ``'Linear'`` holding the number of extraction
        samples and the train / validation / test accuracies.
    """
    splitter = StratifiedShuffleSplit(self.y_ex, n_iter=5, test_size=0.2,
                                      random_state=42)
    search = GridSearchCV(LinearSVC(dual=False, max_iter=10000),
                          param_grid={'C': np.logspace(-5, 15, 21, base=2)},
                          cv=splitter, n_jobs=1, verbose=0)

    logger.info('start grid search for Linear')
    search.fit(self.X_ex, self.y_ex)
    logger.info('end grid search for Linear')

    scores = [entry[1] for entry in search.grid_scores_]

    # final train
    best_clf = search.best_estimator_
    train_acc = sm.accuracy_score(self.y_ex, best_clf.predict(self.X_ex))
    val_acc = sm.accuracy_score(self.val_y, best_clf.predict(self.val_x))
    test_acc = sm.accuracy_score(self.test_y, best_clf.predict(self.test_x))

    return Result(self.name + ' (X)', 'Linear', len(self.X_ex),
                  train_acc, val_acc, test_acc)
def balancedSplit(X, y, seed, test_sz=1000):
    """Split ``X`` / ``y`` once with class stratification.

    Args:
        X, y: samples and labels, indexable by an integer index array.
        seed: random state forwarded to ``StratifiedShuffleSplit``.
        test_sz: ``test_size`` forwarded to ``StratifiedShuffleSplit``.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` -- note the order.
    """
    splitter = StratifiedShuffleSplit(y, 1, test_size=test_sz,
                                      random_state=seed)
    # Exactly one iteration was requested, so take the only split.
    train_idx, test_idx = next(iter(splitter))
    return X[train_idx], y[train_idx], X[test_idx], y[test_idx]
def getBalancedSample(y, seed, test_sz=1000):
    """Return indices of a class-stratified sample of ``y``.

    When ``y`` already has exactly ``test_sz`` entries, all indices
    ``0 .. test_sz - 1`` are returned.  Otherwise the test side of a single
    ``StratifiedShuffleSplit`` (``test_size=test_sz``, ``random_state=seed``)
    is returned.
    """
    # Nothing to sample: the whole array is the requested size.
    if y.shape[0] == test_sz:
        return np.arange(test_sz)

    splitter = StratifiedShuffleSplit(y, 1, test_size=test_sz,
                                      random_state=seed)
    _, sample_idx = next(iter(splitter))
    return sample_idx
def get_data():
    """Re-split the CIFAR-10 training LMDB into stratified train/val LMDBs.

    Every datum of the source LMDB is decoded into an image array and a
    label.  A stratified shuffle split (``test_size=0.2``) is drawn on the
    labels; the 20% side is written to ``cifar10_evenval_lmdb2`` and the 80%
    side to ``cifar10_eventrain_lmdb2``.  Keys are the zero-padded 10-digit
    position of the record within its output database.

    Returns nothing; the result is the two LMDB directories on disk.
    """
    def _write_subset(path, map_size, images, labels, indices):
        # Write images[idx], labels[idx] for every idx, keyed by position.
        env = lmdb.open(path, map_size=map_size)
        try:
            with env.begin(write=True) as txn:
                for i, idx in enumerate(indices):
                    # Index one record at a time: images[indices][i] would
                    # copy the entire subset on every iteration.
                    im_dat = caffe.io.array_to_datum(images[idx], labels[idx])
                    txn.put('{:0>10d}'.format(i), im_dat.SerializeToString())
        finally:
            env.close()

    x = []
    y = []
    source_env = lmdb.open('/home/lisha/school/caffe/examples/cifar10/cifar10_train_lmdb//')
    try:
        with source_env.begin() as source_txn:
            datum = caffe.proto.caffe_pb2.Datum()
            for _key, value in source_txn.cursor():
                datum.ParseFromString(value)
                x.append(caffe.io.datum_to_array(datum))
                y.append(datum.label)
    finally:
        source_env.close()
    x = np.array(x)
    y = np.array(y)

    map_size = int(1e12)

    # Three splits are generated but only the last one is kept.  This is
    # preserved on purpose: requesting a single split would select a
    # different partition for the same random_state.
    sss = StratifiedShuffleSplit(y, 3, test_size=0.2, random_state=0)
    for train_index, test_index in sss:
        ind_train = train_index
        ind_test = test_index

    # The record counts follow the split sizes instead of the hard-coded
    # 10000 / 40000, which were only right for a 50000-sample source.
    _write_subset('/home/lisha/school/caffe/examples/cifar10/cifar10_evenval_lmdb2/',
                  map_size, x, y, ind_test)
    _write_subset('/home/lisha/school/caffe/examples/cifar10/cifar10_eventrain_lmdb2/',
                  map_size, x, y, ind_train)
def make_train_val():
    """Split the MRBI train+valid file into stratified train and val LMDBs.

    Loads ``X, Y`` with ``get_data`` from the ``.amat`` file, draws a
    stratified shuffle split that holds out 2000 samples, prints both split
    sizes, then writes the training side to ``mrbi_train`` and the held-out
    side to ``mrbi_val``.  Keys are the zero-padded 10-digit position of the
    record within its output database.

    Returns nothing; the result is the two LMDB directories on disk.
    """
    def _write_subset(path, map_size, indices):
        # Write X[idx], Y[idx] for every idx, keyed by position.
        env = lmdb.open(path, map_size=map_size)
        try:
            with env.begin(write=True) as txn:
                for i, idx in enumerate(indices):
                    im_dat = caffe.io.array_to_datum(X[idx], Y[idx])
                    txn.put('{:0>10d}'.format(i), im_dat.SerializeToString())
        finally:
            # Release the environment instead of leaking it on rebind.
            env.close()

    print('Loading Matlab data.')
    f = '/home/lisha/school/Projects/hyperband_nnet/hyperband2/mrbi/mnist_rotation_back_image_new/mnist_all_background_images_rotation_normalized_train_valid.amat'
    X, Y = get_data(f)
    map_size = X.nbytes * 2

    # Three splits are generated but only the last one is kept; preserved so
    # the selected partition stays the same for this random_state.
    sss = StratifiedShuffleSplit(Y, 3, test_size=2000, random_state=0)
    for train_index, test_index in sss:
        ind_train1 = train_index
        ind_val1 = test_index
    print('%s %s' % (len(ind_train1), len(ind_val1)))

    # Floor division keeps map_size an integer regardless of how "/" is
    # defined by the running interpreter.
    _write_subset('/home/lisha/school/Projects/hyperband_nnet/hyperband2/mrbi/mrbi_train',
                  map_size * 5 // 6, ind_train1)
    _write_subset('/home/lisha/school/Projects/hyperband_nnet/hyperband2/mrbi/mrbi_val',
                  map_size // 6, ind_val1)
def train_test_split_shuffle(target, features, test_size=0.1):
    """Split ``features`` / ``target`` once with class stratification.

    Args:
        target: labels; the selected parts must expose ``.values``.
            NOTE(review): presumably a pandas Series -- confirm with callers.
        features: samples, indexable by an integer index array.
        test_size: ``test_size`` forwarded to ``StratifiedShuffleSplit``.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` -- note the order.
    """
    splitter = StratifiedShuffleSplit(target, 1, test_size=test_size,
                                      random_state=0)
    # Exactly one iteration was requested, so take the only split.
    train_index, test_index = next(iter(splitter))

    X_train = features[train_index]
    X_test = features[test_index]
    y_train = target[train_index].values
    y_test = target[test_index].values
    return X_train, y_train, X_test, y_test
def test_stratified_shuffle_split_overlap_train_test_bug():
    """Train and test indices of a split must never overlap.

    Regression test for
    https://github.com/scikit-learn/scikit-learn/issues/6121
    """
    labels = [0, 1, 2, 3] * 3 + [4, 5] * 5
    splitter = cval.StratifiedShuffleSplit(labels, n_iter=1, test_size=0.5,
                                           random_state=0)

    train, test = next(iter(splitter))
    shared = np.intersect1d(train, test)

    assert_array_equal(shared, [])
def test_classifier(clf, dataset, feature_list, folds = 1000):
    """Evaluate a binary classifier over many stratified shuffle splits.

    The data is built with ``featureFormat`` / ``targetFeatureSplit``.  For
    each of ``folds`` splits the classifier is fitted on the training part
    and its predictions on the test part are tallied into a confusion
    matrix.  Accuracy, precision, recall, F1 and F2 over all folds are then
    printed.

    Args:
        clf: estimator exposing ``fit`` and ``predict``.
        dataset: input passed to ``featureFormat``.
        feature_list: feature names passed to ``featureFormat``.
        folds: number of shuffle-split iterations.

    Prints a hint instead of the metrics when a metric's denominator is
    zero.  Nothing is returned.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0

    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Stop tallying this fold; later folds are still processed.
                break

    try:
        total_predictions = (true_negatives + false_negatives +
                             false_positives + true_positives)
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print(clf)
        print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2,
                                        display_precision = 5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                           false_positives, false_negatives,
                                           true_negatives))
        print("")
    except ZeroDivisionError:
        # Only the undefined-metric case is reported here; any other error
        # propagates instead of being mislabelled as a divide by zero.
        print("Got a divide by zero when trying out: %s" % (clf,))
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
def CAL(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    """Query loop that asks the oracle only when both labels fit the data.

    An initial pool is collected with the real ``oracle`` and an ``svm.SVC``
    is grid-searched on it.  Each round one pair of points is requested from
    an ``OnlineBase`` driven by the current model's ``predict``.  Every
    candidate point is appended to the pool twice in turn -- once labelled
    ``1`` and once labelled ``-1`` -- and a grid-searched SVC is fitted for
    each labelling.  Depending on which labellings reach a training accuracy
    of at least 0.99 the point is labelled by the oracle, labelled with the
    only consistent label, or dropped.  The query counter and the accuracy
    on ``(test_x, test_y)`` are printed every round; the loop ends once the
    counter reaches 3500.  Nothing is returned.

    NOTE(review): the nesting below was reconstructed from a whitespace-
    collapsed source.  In particular confirm that ``continue`` belongs to
    the inner ``for`` loop and that the query accounting and evaluation sit
    at ``while`` level.
    """
    online = OnlineBase(name, label_p, label_n, oracle, n_features, ftype, error=.5)
    q = online.get_n_query()
    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)
    x, y = online.collect_pts(100, -1)
    i = 0
    # Initial model, tuned on the seed pool.
    cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
    grid.fit(x, y)
    h_ = grid.best_estimator_
    while q < 3500:
        i += 1
        # h_ = ex.fit(x, y)
        # The current model acts as the oracle of a fresh collector.
        online_ = OnlineBase('', label_p, label_n, h_.predict, n_features, ftype, error=.1)
        x_ = online_.collect_one_pair()
        if x_ is not None and len(x_) > 0:
            for _x in x_:
                # Hypothesis 1: the new point is positive.
                x.append(_x)
                y.append(1)
                cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
                grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
                grid.fit(x, y)
                h1 = grid.best_estimator_
                s1 = sm.accuracy_score(y, h1.predict(x))
                # Hypothesis 2: the same point is negative.
                y[-1] = -1
                cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
                grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
                grid.fit(x, y)
                h2 = grid.best_estimator_
                s2 = sm.accuracy_score(y, h2.predict(x))
                if s1 >= .99 and s2 >= .99:
                    # Both labellings fit: ask the real oracle.
                    # NOTE(review): this always takes the label of the first
                    # element of x_, even when _x is the second -- confirm.
                    print 'branch 1'
                    y[-1] = oracle(x_)[0]
                elif s1 >= .99 and s2 < .99:
                    # Only the positive labelling fits.
                    print 'branch 2'
                    y[-1] = 1
                elif s1 < .99 and s2 >= .99:
                    # Only the negative labelling fits.
                    print 'branch 3'
                    y[-1] = -1
                else:
                    # Neither labelling fits: drop the point from the pool.
                    print 'branch 4: ', s1, s2
                    del x[-1]
                    del y[-1]
                    continue
                # Keep the model that was trained with the label finally chosen.
                if y[-1] == 1:
                    h_ = h1
                else:
                    h_ = h2
        q += online_.get_n_query()
        pred_y = h_.predict(test_x)
        print q, sm.accuracy_score(test_y, pred_y)
def do(self):
    """Iteratively grow the training set near the boundary and refit an SVC.

    Initial points come from ``self.ex.collect_up_to_budget``.  If they
    contain fewer than two distinct labels, ``(1, 1)`` is returned
    immediately.  Otherwise an ``svm.SVC(C=1e5)`` is fitted, and for each
    remaining round more points are collected using the current model as
    oracle, topped up with uniform random points when fewer than
    ``self.budget_per_round`` were found, labelled with ``self.oracle`` and
    added to the training set before refitting.

    Returns:
        ``(1, 1)`` for a single-class start, otherwise the value of
        ``self.benchmark()`` after installing the final model with
        ``self.set_clf2``.
    """
    def _fit_svc():
        # NOTE: a grid search over gamma used to run here and is disabled;
        # a fixed-parameter SVC is fitted on the current training set.
        clf = svm.SVC(C=1e5)
        clf.fit(x, y)
        return clf

    # get some initial points
    self.ex.collect_up_to_budget(self.budget_per_round * 2)
    x, y = self.ex.pts_near_b, self.ex.pts_near_b_labels

    if len(np.unique(y)) < 2:
        return 1, 1

    h_best = _fit_svc()

    for _ in range(1, self.n_rounds - 1):
        online_ = OnlineBase('', +1, self.NEG, h_best.predict,
                             self.n_features, 'uniform', error=.1)
        # budget doesn't matter
        x_, _unused = online_.collect_pts(self.budget_per_round, 50000)

        # Treat "nothing collected" (None) as zero points so the shortfall
        # can be computed without calling len() on None.
        n_found = 0 if x_ is None else len(x_)

        xx_ = None
        if n_found < self.budget_per_round:
            print('Run out of budget when getting x_')
            xx_ = np.random.uniform(
                -1, 1, (self.budget_per_round - n_found, self.n_features))

        if n_found > 0:
            x.extend(x_)
            y.extend(self.oracle(x_))
        if xx_ is not None:
            x.extend(xx_)
            y.extend(self.oracle(xx_))

        h_best = _fit_svc()

    self.set_clf2(h_best)
    return self.benchmark()
def make_train_val():
    """Build SVHN train and validation LMDBs from the train and extra sets.

    Loads ``train_32x32.mat`` and ``extra_32x32.mat``, draws one stratified
    shuffle split on each label set, prints the combined split sizes, maps
    label 10 to 0, and writes the held-out sides to ``svhn_val`` and the
    remaining sides to ``svhn_train``.  Within each output database the
    records from the train file come first, followed by those from the
    extra file; keys are zero-padded 10-digit positions.

    Returns nothing; the result is the two LMDB directories on disk.
    """
    def _put_records(txn, images, labels, indices, offset):
        # Store images[idx] (axis 2 rolled to the front) under key offset+i.
        for i, idx in enumerate(indices):
            im_dat = caffe.io.array_to_datum(np.rollaxis(images[idx], 2, 0),
                                             labels[idx])
            txn.put('{:0>10d}'.format(offset + i), im_dat.SerializeToString())

    def _write_db(path, map_size, first_indices, second_indices):
        # Write the X1 records followed by the X2 records into one LMDB.
        env = lmdb.open(path, map_size=map_size)
        try:
            with env.begin(write=True) as txn:
                _put_records(txn, X1, Y1, first_indices, 0)
                _put_records(txn, X2, Y2, second_indices, len(first_indices))
        finally:
            # Release the environment instead of leaking it on rebind.
            env.close()

    print('Loading Matlab data.')
    f1 = scipy.io.loadmat('/home/lisha/school/Projects/hyperband_nnet/hyperband2/svhn/svhn_data/train_32x32.mat')
    f2 = scipy.io.loadmat('/home/lisha/school/Projects/hyperband_nnet/hyperband2/svhn/svhn_data/extra_32x32.mat')
    # name of your matlab variables:
    data_train = f1.get('X')
    labels_train = f1.get('y')
    data_extra = f2.get('X')
    labels_extra = f2.get('y')

    # Three splits are generated per set but only the last one is kept;
    # preserved so the selected partitions stay the same for these seeds.
    sss = StratifiedShuffleSplit(labels_train, 3, test_size=0.05460229056, random_state=0)
    for train_index, test_index in sss:
        ind_train1 = train_index
        ind_val1 = test_index
    sss = StratifiedShuffleSplit(labels_extra, 3, test_size=0.00376554936, random_state=1)
    for train_index, test_index in sss:
        ind_train2 = train_index
        ind_val2 = test_index
    print('val: '+str(len(ind_val1)+len(ind_val2))+' train: '+str(len(ind_train1)+len(ind_train2)))

    # Labels: flatten to 1-D ints and relabel 10 as 0.
    Y1 = np.array(labels_train, dtype=int)
    Y1[Y1 == 10] = 0
    Y1 = Y1.flatten()
    Y2 = np.array(labels_extra, dtype=int)
    Y2[Y2 == 10] = 0
    Y2 = Y2.flatten()

    # Images: move the last axis (the sample axis) to the front.
    X1 = np.rollaxis(np.array(data_train), 3, 0)
    X2 = np.rollaxis(np.array(data_extra), 3, 0)

    map_size_train = X2.nbytes*4
    map_size_val = X1.nbytes*2

    _write_db('/home/lisha/school/Projects/hyperband_nnet/hyperband2/svhn/svhn_val',
              map_size_val, ind_val1, ind_val2)
    _write_db('/home/lisha/school/Projects/hyperband_nnet/hyperband2/svhn/svhn_train',
              map_size_train, ind_train1, ind_train2)
def test_stratified_shuffle_split_even():
    """Indices must be drawn into train/test with an equal chance.

    Over many iterations, how often each index lands in the train (or test)
    side should be plausible under a binomial distribution whose success
    probability is the side's share of the samples.
    """
    n_folds = 5
    n_iter = 1000

    def assert_counts_are_ok(idx_counts, p):
        # Each per-index count must be plausible under Binomial(n_splits, p).
        # n_splits is read from the enclosing scope at call time.
        threshold = 0.05 / n_splits
        expected = stats.binom(n_splits, p)
        for count in idx_counts:
            likelihood = expected.pmf(count)
            assert_true(likelihood > threshold,
                        "An index is not drawn with chance corresponding "
                        "to even draws")

    for n_samples in (6, 22):
        labels = np.array((n_samples // 2) * [0, 1])
        splits = cval.StratifiedShuffleSplit(labels, n_iter=n_iter,
                                             test_size=1. / n_folds,
                                             random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits = 0
        for train, test in splits:
            n_splits += 1
            for idx in train:
                train_counts[idx] += 1
            for idx in test:
                test_counts[idx] += 1

        assert_equal(n_splits, n_iter)

        # Shape checks on the last generated split.
        assert_equal(len(train), splits.n_train)
        assert_equal(len(test), splits.n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        label_counts = np.unique(labels)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(splits.n_train + splits.n_test, len(labels))
        assert_equal(len(label_counts), 2)

        ex_test_p = float(splits.n_test) / n_samples
        ex_train_p = float(splits.n_train) / n_samples

        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p)