我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用sklearn.model_selection.train_test_split()。
def trained_models():
    """Fit five classifiers on a fixed split of the breast-cancer dataset.

    The data is split 70/30 with ``random_state=12345``; only the training
    part is used, the test part is discarded.

    Returns:
        dict: the fitted estimators keyed by ``'RF'``, ``'LR'``,
        ``'SVC_w_linear_kernel'``, ``'Dummy'`` and ``'SVC_wo_linear_kernel'``.
    """
    cancer = datasets.load_breast_cancer()
    features_train, _, target_train, _ = train_test_split(
        cancer.data, cancer.target, test_size=0.3, random_state=12345
    )

    forest = RandomForestClassifier()
    forest.fit(features_train, target_train)

    logistic = LogisticRegression()
    logistic.fit(features_train, target_train)

    linear_svc = SVC(kernel='linear')
    linear_svc.fit(features_train, target_train)

    default_svc = SVC()
    default_svc.fit(features_train, target_train)

    baseline = DummyClassifier()
    baseline.fit(features_train, target_train)

    return {
        'RF': forest,
        'LR': logistic,
        'SVC_w_linear_kernel': linear_svc,
        'Dummy': baseline,
        'SVC_wo_linear_kernel': default_svc,
    }
def train_model_with_cv(model, params, X, y):
    """Grid-search ``model`` on a training split and score the winner on the rest.

    Args:
        model: estimator handed to ``GridSearchCV``.
        params: parameter grid handed to ``GridSearchCV``.
        X: feature data.
        y: target data.

    Returns:
        tuple: ``(best_estimator, statistics)`` where ``statistics`` is a dict
        with the keys ``'f1_score'`` (micro-averaged), ``'confusion_matrix'``,
        ``'precision'`` and ``'recall'``, all computed on the 20% hold-out.
    """
    X_fit, X_eval, y_fit, y_eval = train_test_split(X, y, test_size=0.20)

    # Parameter selection only ever sees the training part (5-fold CV).
    search = GridSearchCV(model, params, n_jobs=1, cv=5)
    search = search.fit(X_fit, y_fit)
    best_model = search.best_estimator_

    # Final evaluation of the selected model on the untouched hold-out.
    predictions = best_model.predict(X_eval)
    statistics = {
        'f1_score': f1_score(y_eval, predictions, average='micro'),
        'confusion_matrix': confusion_matrix(y_eval, predictions),
        'precision': precision_score(y_eval, predictions),
        'recall': recall_score(y_eval, predictions),
    }
    return best_model, statistics
def outlier_identification(self, model, x_train, y_train):
    """Drop hold-out samples whose absolute error reaches the hold-out RMSE.

    The data is split once more with ``train_test_split``; ``model`` is fitted
    on the first part and used to predict the second. Samples of the second
    part with ``|prediction - target| >= self.rmse(prediction, target)`` are
    treated as outliers. Samples of the first part are always kept.

    Args:
        model: estimator exposing ``fit`` and ``predict``.
        x_train: feature data.
        y_train: target data.

    Returns:
        tuple: ``(x, y)`` with the fit part followed by the hold-out part,
        outliers removed. Row order follows the random split, not the input.
    """
    x_fit, x_holdout, y_fit, y_holdout = train_test_split(x_train, y_train)
    print('\nOutlier shapes')
    print(np.shape(x_fit), np.shape(x_holdout), np.shape(y_fit), np.shape(y_holdout))

    model.fit(x_fit, y_fit)
    predictions = model.predict(x_holdout)
    abs_errors = np.absolute(predictions - y_holdout)
    threshold = self.rmse(predictions, y_holdout)

    # The fit part is never flagged; only hold-out rows can be outliers.
    fit_flags = np.zeros((np.shape(y_fit)[0],), dtype=bool)
    flagged = np.concatenate([fit_flags, abs_errors >= threshold])
    keep = flagged == 0

    # Stitch the two parts back together in the same order as the mask.
    x_out = np.insert(x_fit, np.shape(x_fit)[0], x_holdout, axis=0)
    y_out = np.insert(y_fit, np.shape(y_fit)[0], y_holdout, axis=0)
    return x_out[keep, ], y_out[keep, ]
def predicted_vs_actual_y_xgb(self, xgb, best_nrounds, xgb_params, x_train_split, x_test_split, y_train_split,
                              y_test_split, title_name):
    """Train a booster on a pre-made split and scatter predicted vs. actual targets.

    Args:
        xgb: object providing ``DMatrix`` and ``train`` (note: this parameter
            shadows any module-level ``xgb`` inside the function).
        best_nrounds: number of boosting rounds passed to ``xgb.train``.
        xgb_params: parameters passed to ``xgb.train``.
        x_train_split, y_train_split: training features and targets.
        x_test_split, y_test_split: evaluation features and targets.
        title_name: prefix of the plot title.

    The figure is created and laid out but neither shown nor returned.
    """
    train_matrix = xgb.DMatrix(x_train_split, label=y_train_split)
    test_matrix = xgb.DMatrix(x_test_split)
    print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))

    booster = xgb.train(xgb_params, train_matrix, best_nrounds)
    y_predicted = booster.predict(test_matrix)

    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, y_predicted, s=20)
    rmse_value = self.rmse(y_predicted, y_test_split)
    title_parts = [title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_value)]
    plt.title(''.join(title_parts))
    plt.xlabel('Actual y')
    plt.ylabel('Predicted y')

    # Identity line spanning the range of the actual values.
    lowest, highest = min(y_test_split), max(y_test_split)
    plt.plot([lowest, highest], [lowest, highest])
    plt.tight_layout()
def test_calibrate_final_model_classification():
    """Train a calibrated classifier on the titanic data and check its score range."""
    np.random.seed(0)

    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    # Hold back 33% of the test frame to serve as the calibration set.
    test_df, calibration_df = train_test_split(test_df, test_size=0.33, random_state=42)

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical',
    }

    ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)
    ml_predictor.train(
        train_df,
        calibrate_final_model=True,
        X_test=calibration_df,
        y_test=calibration_df.survived,
    )

    test_score = ml_predictor.score(test_df, test_df.survived)
    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
def get_titanic_binary_classification_dataset(basic=True):
    """Load the titanic data (local cache first, download otherwise) and split it.

    Args:
        basic: when equal to ``True`` the ``name``, ``ticket``, ``cabin`` and
            ``home.dest`` columns are dropped in addition to ``boat``/``body``.

    Returns:
        tuple: ``(train_df, test_df)`` from a 67/33 split with ``random_state=42``.
    """
    local_copy = os.path.join('tests', 'titanic.csv')
    try:
        titanic = pd.read_csv(local_copy)
    except Exception as e:
        # Any failure reading the cache falls back to downloading the data.
        print('Error')
        print(e)
        dataset_url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
        titanic = pd.read_csv(dataset_url)
        # index=False: do not persist the index pandas creates automatically.
        titanic.to_csv(os.path.join('tests', 'titanic.csv'), index=False)

    titanic = titanic.drop(['boat', 'body'], axis=1)
    if basic == True:
        titanic = titanic.drop(['name', 'ticket', 'cabin', 'home.dest'], axis=1)

    train_part, test_part = train_test_split(titanic, test_size=0.33, random_state=42)
    return train_part, test_part
def get_twitter_sentiment_multilabel_classification_dataset():
    """Load a 10% sample of the twitter airline sentiment data and split it.

    The CSV is read from ``tests/twitter_sentiment.csv`` when possible;
    otherwise it is downloaded and written to that path for next time.

    Returns:
        tuple: ``(train_df, test_df)`` from a 67/33 split with
        ``random_state=42``. The 10% sample itself is not seeded.
    """
    file_name = os.path.join('tests', 'twitter_sentiment.csv')

    try:
        # Pass the path rather than an open handle: pandas then applies the
        # requested encoding and closes the file itself. The previous
        # open(file_name, 'rU') leaked the handle, and the 'U' mode is rejected
        # by Python 3.11+, which made this branch always fall through.
        df_twitter = pd.read_csv(file_name, encoding='utf-8', engine='python')
    except Exception as e:
        # Best effort: any failure reading the cache triggers a fresh download.
        print('Error')
        print(e)
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        df_twitter = pd.read_csv(dataset_url)
        # index=False: do not persist the index pandas creates automatically.
        df_twitter.to_csv(file_name, index=False)

    # Grab only 10% of the dataset - runs much faster this way.
    df_twitter = df_twitter.sample(frac=0.1)
    df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)

    df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
    return df_twitter_train, df_twitter_test
def __init__(self, name, X, y, task, test_size=None, cv=None, random_state=42):
    """Store the data and set up either a hold-out split or a k-fold splitter.

    Args:
        name: identifier stored on the instance.
        X: feature data.
        y: target data.
        task: ``"regression"`` or ``"classification"``; selects the splitter
            type when ``cv`` is used.
        test_size: when not ``None``, a hold-out split is made immediately and
            stored as ``X_train``/``X_test``/``y_train``/``y_test``.
        cv: number of folds; only consulted when ``test_size`` is ``None``.
        random_state: seed for the split or the fold shuffling.

    When both ``test_size`` and ``cv`` are ``None`` no validation attributes
    are set. With ``cv`` and any other ``task`` value, ``kfold`` is not set.
    """
    self.name = name
    self.X = X
    self.y = y
    self.task = task
    self.random_state = random_state

    if test_size is not None:
        self.test_size = test_size
        self.validation_method = "train_test_split"
        self.X_train, self.X_test, self.y_train, self.y_test = \
            model_selection.train_test_split(self.X, self.y, test_size=test_size, random_state=random_state)
    elif cv is not None:
        self.validation_method = "cv"
        if task == "regression":
            # random_state only has meaning with shuffle=True; recent
            # scikit-learn raises when a seed is passed without shuffling.
            # Shuffling also matches the classification branch below.
            self.kfold = model_selection.KFold(n_splits=cv, shuffle=True, random_state=random_state)
        elif task == "classification":
            self.kfold = model_selection.StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
def crate_pre_train_model(x_, y_):
    """Train an xgboost regressor with early stopping on a 10% hold-out.

    Args:
        x_: feature data.
        y_: target data, aligned with ``x_``.

    Returns:
        The booster returned by ``xgb.train``.
    """
    # Split features and targets in ONE call so they are guaranteed to stay
    # aligned and their lengths are validated. The same seed yields the same
    # partition as the two separate calls used before.
    x_train, x_test, y_train, y_test = train_test_split(x_, y_, test_size=0.1, random_state=1)

    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)
    # The last entry of the watch list is the one early stopping monitors.
    evallist = [(dtrain, 'train'), (dtest, 'eval')]

    # NOTE(review): 'reg:linear' is the legacy name of 'reg:squarederror' in
    # newer xgboost releases - confirm against the installed version.
    param = {'objective': 'reg:linear', 'max_depth': 3}
    param['nthread'] = 64
    # param['min_child_weight'] = 15
    # param['subsample'] = 1
    # param['num_class'] = 7

    num_round = 5000
    # Pass the dict itself: a dict_items view is neither a dict nor a list.
    bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=100)
    return bst
# %% main
def readData():
    """Build feature vectors and integer rating labels from the global ``gson``.

    Each feature row is ``[1, mediaAct, mediaDir, mediaWri]`` as returned by
    ``calcolaMedie``; the label is the ``imdbRating`` truncated to an int.
    Entries that raise any exception while being processed are skipped.

    Returns:
        tuple: ``(train_data, train_labels, test_data, test_labels)`` from a
        split with ``train_size=0.4``.
    """
    vector = []
    labels = []
    for elem in gson:
        try:
            record = gson.get(elem)
            actors = record.get("actors")
            directors = record.get("director")
            writers = record.get("writer")
            # Truncate the rating so it can serve as a discrete class label.
            rating = int(float(record.get("imdbRating")))
            mediaAct, mediaDir, mediaWri = calcolaMedie(actors, directors, writers)
            vector.append([1, mediaAct, mediaDir, mediaWri])
            labels.append(int(rating))
        except Exception:
            # Skip entries with missing or malformed fields.
            continue

    data = np.array(vector)
    labels = np.array(labels)
    train_data, test_data, train_labels, test_labels = train_test_split(data, labels, train_size=0.4)
    return train_data, train_labels, test_data, test_labels
def readData(self):
    """Build feature vectors and integer rating labels from the global ``gson``.

    Each feature row is ``[1, mediaAct, mediaDir, mediaWri]`` as returned by
    ``self.calcolaMedie``; the label is the ``imdbRating`` truncated to an int.
    Nothing is caught here: a malformed entry raises.

    Returns:
        tuple: ``(train_data, train_labels, test_data, test_labels)`` from a
        split with ``train_size=0.1``.
    """
    vector = []
    labels = []
    for elem in gson:
        record = gson.get(elem)
        actors = record.get("actors")
        directors = record.get("director")
        writers = record.get("writer")
        # Truncate the rating so it can serve as a discrete class label.
        rating = int(float(record.get("imdbRating")))
        mediaAct, mediaDir, mediaWri = self.calcolaMedie(actors, directors, writers)
        vector.append([1, mediaAct, mediaDir, mediaWri])
        labels.append(int(rating))

    data = np.array(vector)
    labels = np.array(labels)
    train_data, test_data, train_labels, test_labels = train_test_split(data, labels, train_size=0.1)
    return train_data, train_labels, test_data, test_labels
def metrics_equal():
    """Train on the generated "equal" dataset and return a classification report.

    Returns:
        The ``jsonify`` response with ``status=200`` and the report text as
        ``message``.
    """
    dataset = dpu.load(dpu.generate_equal_dataset())

    mm = SGDCModelManager()
    split = train_test_split(dataset['inputs'], dataset['outputs'], random_state=42)
    mm.x_train, mm.x_test, mm.y_train, mm.y_test = split
    mm.train()

    predicted = mm.predict(mm.x_test)
    return jsonify(status=200, message=classification_report(mm.y_test, predicted))
def probabilities_equal():
    """Train on the generated "equal" dataset and return per-sample probabilities.

    Returns:
        The ``jsonify`` response with ``status=200`` and ``result``, a list of
        ``{'probabilities': [...], 'category': ...}`` dicts, one per test sample.
    """
    dataset = dpu.load(dpu.generate_equal_dataset())

    mm = SGDCModelManager()
    split = train_test_split(dataset['inputs'], dataset['outputs'], random_state=42)
    mm.x_train, mm.x_test, mm.y_train, mm.y_test = split
    mm.train()

    probabilities = mm.probabilities(mm.x_test)
    # Pair each probability row with the true category at the same position.
    result = [
        {'probabilities': list(probabilities[i]), 'category': mm.y_test[i]}
        for i in range(len(mm.y_test))
    ]
    return jsonify(status=200, result=result)
def _preload_files_single_volunteer(dataset_dir, speaker_id, view_id, utterance_types):
    """Select one speaker/view's videos from ``splits/allVideos.txt`` and split them.

    A listed line is kept when it contains ``'s<speaker_id>_'``, contains
    ``'v<view_id>'`` and contains at least one of the utterance markers from
    ``_gen_utterance_list(utterance_types)``.

    Returns:
        tuple: ``(train, test)`` lists of paths joined onto ``dataset_dir``,
        from a 70/30 split with ``random_state=0``.
    """
    all_videos = path.join(_current_path, 'splits/allVideos.txt')
    u_list = _gen_utterance_list(utterance_types)

    with open(all_videos, 'r') as f:
        contents = f.read().splitlines()

    speaker_tag = 's' + str(speaker_id) + '_'
    view_tag = 'v' + str(view_id)
    video_list = []
    for line in contents:
        if speaker_tag not in line or view_tag not in line:
            continue
        if any(u in line for u in u_list):
            video_list.append(path.join(dataset_dir, line))

    from sklearn.model_selection import train_test_split
    train, test = train_test_split(video_list, test_size=0.30, random_state=0)
    return train, test
def get_dataset(dataset_path='Data/Train_Data'):
    """Load the image dataset (cached ``.npy`` first) and split it 90/10.

    When the cached arrays under ``Data/npy_train_data/`` cannot be loaded,
    the images are read from ``dataset_path`` (one sub-directory per integer
    label), scaled by ``1/255``, one-hot encoded into two classes and cached.

    Args:
        dataset_path: directory with one sub-directory per label.

    Returns:
        tuple: ``(X, X_test, Y, Y_test)`` from ``train_test_split`` with
        ``test_size=0.1`` and ``random_state=42``.
    """
    try:
        X = np.load('Data/npy_train_data/X.npy')
        Y = np.load('Data/npy_train_data/Y.npy')
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit. Any ordinary load failure still rebuilds the cache.
        labels = listdir(dataset_path)  # one sub-directory per label
        X = []
        Y = []
        for label in labels:
            datas_path = dataset_path+'/'+label
            for data in listdir(datas_path):
                img = get_img(datas_path+'/'+data)
                X.append(img)
                Y.append(int(label))
        # Build the arrays: pixel values divided by 255, labels one-hot (2 classes).
        X = np.array(X).astype('float32')/255.
        Y = np.array(Y).astype('float32')
        Y = to_categorical(Y, 2)
        if not os.path.exists('Data/npy_train_data/'):
            os.makedirs('Data/npy_train_data/')
        np.save('Data/npy_train_data/X.npy', X)
        np.save('Data/npy_train_data/Y.npy', Y)
    X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)
    return X, X_test, Y, Y_test
def _preprocess_PBEs(self, PBE_idx=None):
    """Bin the spike train into PBEs and divide them into train and test sets.

    Used for most types of shuffles.

    Args:
        PBE_idx: optional ``(train_idx, test_idx)`` pair. When omitted, the
            epoch indices are split with ``self._test_size`` and
            ``self._random_state`` (seed ``1`` if the latter is ``None``).

    Raises:
        ValueError: when binning yields exactly one epoch.
    """
    self.PBEs = self._st.bin(ds=self._ds)
    if self.PBEs.n_epochs == 1:
        raise ValueError("spike train is continuous, and does not have more than one event!")

    if PBE_idx is None:
        # Fall back to a fixed seed so the split stays reproducible.
        seed = 1 if self._random_state is None else self._random_state
        self._trainidx, self._testidx = train_test_split(
            np.arange(self.PBEs.n_epochs),
            test_size=self._test_size,
            random_state=seed,
        )
    else:
        self._trainidx, self._testidx = PBE_idx

    # Sort in place so the selected events are taken in ascending index order.
    self._trainidx.sort()
    self._testidx.sort()

    self.PBEs_train = self.PBEs[self._trainidx]
    self.PBEs_test = self.PBEs[self._testidx]
def load_data():
    """Read the UCI yeast files and return a stratified 70/30 split.

    ``yeast.label`` supplies ``<id> <name>`` pairs. In ``yeast.data`` columns
    1..8 are read as float features and the last column is the label name.

    Returns:
        tuple: ``((X_train, y_train), (X_test, y_test))``.
    """
    base_dir = osp.join(get_dataset_base(), "uci_yeast", "yeast.label")
    label_path = osp.abspath(base_dir)
    id2label = {}
    label2id = {}
    with open(label_path) as f:
        for row in f:
            cols = row.strip().split(" ")
            label_id = int(cols[0])
            id2label[label_id] = cols[1]
            label2id[cols[1]] = int(cols[0])

    data_path = osp.abspath(osp.join(get_dataset_base(), "uci_yeast", "yeast.data"))
    with open(data_path) as f:
        rows = f.readlines()

    n_datas = len(rows)
    X = np.zeros((n_datas, 8), dtype=np.float32)
    y = np.zeros(n_datas, dtype=np.int32)
    for i, row in enumerate(rows):
        # Fields are separated by one or more spaces.
        cols = re.split(" +", row.strip())
        X[i, :] = [float(value) for value in cols[1:1 + 8]]
        y[i] = label2id[cols[-1]]

    train_idx, test_idx = train_test_split(range(n_datas), random_state=0, train_size=0.7, stratify=y)
    return (X[train_idx], y[train_idx]), (X[test_idx], y[test_idx])
def get_train_data(corpus, count=None, **kwargs):
    """Extract per-document features and labels from ``corpus`` and split them.

    Args:
        corpus: object exposing ``iter_documents()``.
        count: optional cap on the number of documents read. It was referenced
            in the body without being a parameter; it is now an explicit
            keyword, matching the sibling loaders.
        **kwargs: forwarded unchanged to ``train_test_split``.

    Returns:
        The result of ``train_test_split(X, y, **kwargs)``.
    """
    X = []
    y = []
    documents = corpus.iter_documents()
    if count:
        documents = islice(documents, count)

    for document in tqdm(documents):
        try:
            text = document.raw()
            sents = document.raw_sents()
            labels = text2labels(text, sents)
            features = sent2features(text)
            X.append(features)
            y.append(labels)
        except Exception:
            # TODO: decide how failing documents should be reported; for now
            # they are skipped (best effort), as before.
            continue

    return train_test_split(X, y, **kwargs)
def get_pos_train_data(corpus, count=None, **kwargs):
    """Build per-sentence POS features and labels from ``corpus`` and split them.

    For every tagged sentence the label of a token is the first
    comma-separated item of its tag string.

    Args:
        corpus: object exposing ``iter_documents()``.
        count: optional cap on the number of documents read.
        **kwargs: forwarded unchanged to ``train_test_split``.

    Returns:
        The result of ``train_test_split(X, y, **kwargs)``.
    """
    X = []
    y = []
    documents = corpus.iter_documents()
    if count:
        documents = islice(documents, count)

    for document in tqdm(documents):
        for sent in document.iter_tagged_sents():
            tokens = []
            labels = []
            for token, tags in sent:
                first_tag = tags.split(',')[0]
                tokens.append(token)
                labels.append(first_tag)
            # TODO:
            X.append(sent2posfeatures(tokens))
            y.append(labels)

    return train_test_split(X, y, **kwargs)
def get_train_data(corpus, count=None, **kwargs):
    """Extract per-document features and labels from ``corpus`` and split them.

    Documents that raise any exception during extraction are skipped.

    Args:
        corpus: object exposing ``iter_documents()``.
        count: optional cap on the number of documents read.
        **kwargs: forwarded unchanged to ``train_test_split``.

    Returns:
        The result of ``train_test_split(X, y, **kwargs)``.
    """
    X, y = [], []
    documents = corpus.iter_documents()
    if count:
        documents = islice(documents, count)

    for document in tqdm(documents):
        try:
            text = document.raw()
            words = document.words()
            labels = text2labels(text, words)
            features = list(text2features(text))
        except Exception as exc:
            # TODO:
            continue
        X.append(features)
        y.append(labels)

    return train_test_split(X, y, **kwargs)
def train_test_split_per_class(X, y, train_size=None, test_size=None):
    """Split every class separately and concatenate the per-class results.

    Classes are the integers ``0 .. len(np.bincount(y)) - 1``, so ``y`` must
    hold non-negative integers. Class ids in that range that have no samples
    are skipped; previously they made ``train_test_split`` raise on an empty
    selection.

    Args:
        X: array of samples, first axis aligned with ``y``.
        y: 1-D array of non-negative integer class labels.
        train_size: forwarded to ``train_test_split`` for every class.
        test_size: forwarded to ``train_test_split`` for every class.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``, ordered by class id.
    """
    empty_shape = np.array(X.shape)
    empty_shape[0] = 0

    # Seed each list with an empty array so the result keeps the input dtype
    # and trailing shape even when no class contributes anything.
    X_train_parts = [np.zeros(empty_shape, dtype=X.dtype)]
    X_test_parts = [np.zeros(empty_shape, dtype=X.dtype)]
    y_train_parts = [np.zeros((0), dtype=y.dtype)]
    y_test_parts = [np.zeros((0), dtype=y.dtype)]

    num_classes = len(np.bincount(y))
    for label in range(num_classes):
        members = y == label
        if not members.any():
            continue  # gap in the label range: nothing to split
        X_train, X_test, y_train, y_test = train_test_split(
            X[members], y[members], train_size=train_size, test_size=test_size)
        X_train_parts.append(X_train)
        X_test_parts.append(X_test)
        y_train_parts.append(y_train)
        y_test_parts.append(y_test)

    # One concatenation per output instead of re-copying on every iteration.
    return (np.concatenate(X_train_parts, axis=0),
            np.concatenate(X_test_parts, axis=0),
            np.concatenate(y_train_parts),
            np.concatenate(y_test_parts))
def outlier_identification(self, model, x_train, y_train):
    """Remove hold-out samples the model predicts worse than the hold-out RMSE.

    ``model`` is fitted on one part of a fresh ``train_test_split`` and
    evaluated on the other. A hold-out sample is an outlier when its absolute
    residual is at least ``self.rmse(predicted, actual)``. Samples the model
    was fitted on are never removed.

    Args:
        model: estimator exposing ``fit`` and ``predict``.
        x_train: feature data.
        y_train: target data.

    Returns:
        tuple: ``(x, y)`` made of the fit part followed by the surviving
        hold-out samples (order follows the random split).
    """
    parts = train_test_split(x_train, y_train)
    x_fit_part, x_eval_part, y_fit_part, y_eval_part = parts
    print('\nOutlier shapes')
    print(np.shape(x_fit_part), np.shape(x_eval_part), np.shape(y_fit_part), np.shape(y_eval_part))

    model.fit(x_fit_part, y_fit_part)
    y_predicted = model.predict(x_eval_part)
    residuals = np.absolute(y_predicted - y_eval_part)
    cutoff = self.rmse(y_predicted, y_eval_part)

    n_fit_rows = np.shape(y_fit_part)[0]
    # False for every fitted row, then the per-row outlier verdict.
    outliers_mask = np.concatenate([np.zeros((n_fit_rows,), dtype=bool), residuals >= cutoff])
    not_an_outlier = outliers_mask == 0

    # Re-assemble the data in split order so it lines up with the mask.
    x_out = np.insert(x_fit_part, np.shape(x_fit_part)[0], x_eval_part, axis=0)
    y_out = np.insert(y_fit_part, n_fit_rows, y_eval_part, axis=0)
    return x_out[not_an_outlier, ], y_out[not_an_outlier, ]
def predicted_vs_actual_sale_price(self, x_train, y_train, title_name):
    """Fit a ``LassoCV`` on a fresh split and scatter predicted vs. actual prices.

    Args:
        x_train: feature data.
        y_train: target data.
        title_name: prefix of the plot title.

    The figure is created and laid out but neither shown nor returned.
    """
    x_fit, x_eval, y_fit, y_eval = train_test_split(x_train, y_train)
    print(np.shape(x_fit), np.shape(x_eval), np.shape(y_fit), np.shape(y_eval))

    alpha_grid = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
                  0.3, 0.6, 1]
    lasso = LassoCV(alphas=alpha_grid, max_iter=50000, cv=10)
    lasso.fit(x_fit, y_fit)
    y_predicted = lasso.predict(X=x_eval)

    plt.figure(figsize=(10, 5))
    plt.scatter(y_eval, y_predicted, s=20)
    rmse_value = self.rmse(y_predicted, y_eval)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_value)]))
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')

    # Identity line spanning the range of the actual values.
    lowest, highest = min(y_eval), max(y_eval)
    plt.plot([lowest, highest], [lowest, highest])
    plt.tight_layout()
def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
    """Cross-validate, train an xgboost model and scatter predicted vs. actual prices.

    The number of boosting rounds is the row count of the ``xgb.cv`` result
    minus one.

    Args:
        xgb_params: parameters for ``xgb.cv`` and ``xgb.train``.
        x_train: feature data.
        y_train: target data.
        seed: seed passed to ``xgb.cv``.
        title_name: prefix of the plot title.

    The figure is created and laid out but neither shown nor returned.
    """
    x_fit, x_eval, y_fit, y_eval = train_test_split(x_train, y_train)
    fit_matrix = xgb.DMatrix(x_fit, label=y_fit)
    eval_matrix = xgb.DMatrix(x_eval)

    cv_result = xgb.cv(
        xgb_params,
        fit_matrix,
        num_boost_round=1000,
        nfold=4,
        seed=seed,
        stratified=False,
        early_stopping_rounds=25,
        verbose_eval=10,
        show_stdv=True,
    )
    best_nrounds = cv_result.shape[0] - 1
    print(np.shape(x_fit), np.shape(x_eval), np.shape(y_fit), np.shape(y_eval))

    booster = xgb.train(xgb_params, fit_matrix, best_nrounds)
    y_predicted = booster.predict(eval_matrix)

    plt.figure(figsize=(10, 5))
    plt.scatter(y_eval, y_predicted, s=20)
    rmse_value = self.rmse(y_predicted, y_eval)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_value)]))
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')

    # Identity line spanning the range of the actual values.
    lowest, highest = min(y_eval), max(y_eval)
    plt.plot([lowest, highest], [lowest, highest])
    plt.tight_layout()
def fit(self, X, y):
    """Fit the underlying LightGBM model.

    With ``self.use_mspe`` set, a booster is trained through ``lgb.train``
    with the custom ``mspe`` objective and ``evalerror_lgbm`` metric; the
    validation set in that branch is the training data itself. Otherwise
    ``self.gbm.fit`` is used with early stopping.

    Args:
        X: feature data (``X.shape[0]`` is read in the ``use_mspe`` branch).
        y: target data.
    """
    if self.use_mspe:
        lgb_train = lgb.Dataset(X, y,
                                weight=np.ones(X.shape[0]),
                                free_raw_data=False)
        lgb_test = lgb.Dataset(X, y, reference=lgb_train,
                               weight=np.ones(X.shape[0]),
                               free_raw_data=False)
        self.gbm = lgb.train(
            self.kwargs,
            lgb_train,
            num_boost_round=10,
            fobj=mspe,
            feval=evalerror_lgbm,
            valid_sets=lgb_test)
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3)
        # Early stopping must be judged on data the model is not trained on.
        # The hold-out split used to be computed and then ignored, with the
        # full data serving as both training and evaluation set.
        self.gbm.fit(X_train, y_train, early_stopping_rounds=10,
                     eval_set=[(X_test, y_test)], verbose=False)
def validate_formula(formula, training_data, column_being_predicted, cross_val_n=3, validation_size=.10):
    """Repeatedly fit an OLS formula on random splits and score each hold-out.

    Args:
        formula: formula in the ``statsmodels.formula.api`` style.
        training_data: data containing every column the formula uses,
            including ``column_being_predicted``.
        column_being_predicted: name of the target column; its hold-out values
            are the reference for scoring.
        cross_val_n: number of random train/validation repetitions.
        validation_size: ``test_size`` given to ``train_test_split``.

    Returns:
        tuple: ``(model, cross_val_scores)`` - the model fitted in the LAST
        repetition (``None`` when ``cross_val_n`` is 0) and the list of
        ``root_mean_log_squared_error`` scores, one per repetition.
    """
    cross_val_scores = []
    # Defined up front so cross_val_n == 0 returns (None, []) instead of
    # failing on an unbound local.
    model = None

    # range works on Python 2 and 3; xrange exists on Python 2 only.
    for _ in range(cross_val_n):
        # Only the row split of the frame is needed; the target column
        # travels inside it.
        X_train, X_test, _, _ = train_test_split(
            training_data,
            training_data[column_being_predicted],
            test_size=validation_size
        )
        model = smf.ols(formula=formula, data=X_train).fit()
        test_values = X_test[column_being_predicted]
        score = root_mean_log_squared_error(model, X_test, test_values)
        cross_val_scores.append(score)

    return (model, cross_val_scores)
def test_predict_uncertainty_returns_dict_for_one_value():
    """Check ``predict_uncertainty`` returns a dict for each single test record."""
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()
    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    # Half of the training rows are reserved for the uncertainty model.
    train_df, uncertainty_data = train_test_split(train_df, test_size=0.5)

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)
    ml_predictor.train(
        train_df,
        perform_feature_selection=True,
        train_uncertainty_model=True,
        uncertainty_data=uncertainty_data,
    )

    for record in test_df.to_dict('records'):
        prediction = ml_predictor.predict_uncertainty(record)
        assert isinstance(prediction, dict)
def test_score_uncertainty():
    """Check the uncertainty score on the boston test data exceeds -0.2."""
    np.random.seed(0)

    train_df, test_df = utils.get_boston_regression_dataset()
    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    # Half of the training rows are reserved for the uncertainty model.
    train_df, uncertainty_data = train_test_split(train_df, test_size=0.5)

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)
    ml_predictor.train(
        train_df,
        perform_feature_selection=True,
        train_uncertainty_model=True,
        uncertainty_data=uncertainty_data,
    )

    uncertainty_score = ml_predictor.score_uncertainty(test_df, test_df.MEDV)
    print('uncertainty_score')
    print(uncertainty_score)

    assert uncertainty_score > -0.2
def get_twitter_sentiment_multilabel_classification_dataset():
    """Load a 10% sample of the twitter airline sentiment data and split it.

    The CSV is read from ``tests/twitter_sentiment.csv`` (latin-1) when
    possible; otherwise it is downloaded and written to that path.

    Returns:
        tuple: ``(train_df, test_df)`` from a 67/33 split with
        ``random_state=42``. The 10% sample itself is not seeded.
    """
    file_name = os.path.join('tests', 'twitter_sentiment.csv')

    try:
        # Pass the path rather than an open handle: pandas then applies the
        # latin-1 decoding itself and closes the file. The previous
        # open(file_name, 'rU') leaked the handle, and the 'U' mode is rejected
        # by Python 3.11+, which made this branch always fall through.
        df_twitter = pd.read_csv(file_name, encoding='latin-1', engine='python')
    except Exception as e:
        # Best effort: any failure reading the cache triggers a fresh download.
        print('Error')
        print(e)
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        df_twitter = pd.read_csv(dataset_url, encoding='latin-1')
        # index=False: do not persist the index pandas creates automatically.
        df_twitter.to_csv(file_name, index=False, encoding='latin-1')

    # Grab only 10% of the dataset - runs much faster this way.
    df_twitter = df_twitter.sample(frac=0.1)
    df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)

    df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
    return df_twitter_train, df_twitter_test
def get_titanic_binary_classification_dataset(basic=True):
    """Load the titanic data from next to this module (or download it) and split it.

    Args:
        basic: when equal to ``True`` the ``name``, ``ticket``, ``cabin`` and
            ``home.dest`` columns are dropped in addition to ``boat``/``body``.

    Returns:
        tuple: ``(train_df, test_df)`` from a 67/33 split with ``random_state=42``.
    """
    dir_name = os.path.abspath(os.path.dirname(__file__))
    file_name = os.path.join(dir_name, 'titanic.csv')
    for caption, value in (('file_name', file_name), ('dir_name', dir_name)):
        print(caption)
        print(value)

    try:
        titanic = pd.read_csv(file_name)
    except Exception as e:
        # Any failure reading the cache falls back to downloading the data.
        print('Error')
        print(e)
        dataset_url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
        titanic = pd.read_csv(dataset_url)
        # index=False: do not persist the index pandas creates automatically.
        titanic.to_csv(file_name, index=False)

    titanic = titanic.drop(['boat', 'body'], axis=1)
    if basic == True:
        titanic = titanic.drop(['name', 'ticket', 'cabin', 'home.dest'], axis=1)

    train_part, test_part = train_test_split(titanic, test_size=0.33, random_state=42)
    return train_part, test_part
def get_twitter_sentiment_multilabel_classification_dataset():
    """Load a 10% sample of the twitter airline sentiment data and split it.

    The data is read from the HDF cache ``tests/twitter_sentiment.h5`` when
    possible; otherwise the CSV is downloaded and stored in that cache.

    Returns:
        tuple: ``(train_df, test_df)`` from a 67/33 split with
        ``random_state=42``. The 10% sample itself is not seeded.
    """
    cache_path = os.path.join('tests', 'twitter_sentiment.h5')

    try:
        tweets = pd.read_hdf(cache_path)
    except Exception as e:
        # Any failure reading the cache falls back to downloading the data.
        print('Error')
        print(e)
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        tweets = pd.read_csv(dataset_url, encoding='latin-1')
        tweets.to_hdf(cache_path, key='df', format='fixed')

    # Keep only 10% of the rows - runs much faster this way.
    tweets = tweets.sample(frac=0.1)
    tweets['tweet_created'] = pd.to_datetime(tweets.tweet_created)

    train_part, test_part = train_test_split(tweets, test_size=0.33, random_state=42)
    return train_part, test_part
def get_data(iris):
    """Select feature columns 2 and 3 of ``iris`` and make a 70/30 split.

    Prints the head of the selected data, the distinct classes and the
    sizes of both parts.

    Args:
        iris: dataset object with ``data``, ``target`` and ``feature_names``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, iris_df, X, y)``.
    """
    # Columns 2 and 3 only (petal length and petal width per the original note).
    X = iris.data[:, [2, 3]]
    y = iris.target

    # The same two columns as a labelled frame, for display.
    iris_df = pd.DataFrame(iris.data[:, [2, 3]], columns=iris.feature_names[2:])
    print(iris_df.head())

    print('\n' + 'The classes in this data are ' + str(np.unique(y)))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)

    size_message = 'Training set are {} samples and Test set are {} samples'
    print(size_message.format(X_train.shape[0], X_test.shape[0]))
    print()

    return (X_train, X_test, y_train, y_test, iris_df, X, y)
##scale the training data before training
def get_data(iris):
    """Extract two iris features, report on them and split train/test.

    Feature columns 2 and 3 are used. The split keeps 30% for testing with
    ``random_state=0``.

    Args:
        iris: dataset object with ``data``, ``target`` and ``feature_names``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, iris_df, X, y)``.
    """
    selected_columns = [2, 3]
    X = iris.data[:, selected_columns]
    y = iris.target

    # Labelled copy of the selected columns, printed for a quick look.
    iris_df = pd.DataFrame(iris.data[:, [2, 3]], columns=iris.feature_names[2:])
    print(iris_df.head())

    # Report which class labels occur.
    print('\n' + 'The classes in this data are ' + str(np.unique(y)))

    split = train_test_split(X, y, test_size=.3, random_state=0)
    X_train, X_test, y_train, y_test = split

    print('Training set are {} samples and Test set are {} samples'.format(
        X_train.shape[0], X_test.shape[0]))
    print()

    return (X_train, X_test, y_train, y_test, iris_df, X, y)
#scale training data before training
def get_data(iris):
    """Prepare a two-feature iris dataset and its 70/30 train/test split.

    Only feature columns 2 and 3 are kept. Summary information (frame head,
    class labels, split sizes) is printed along the way.

    Args:
        iris: dataset object with ``data``, ``target`` and ``feature_names``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, iris_df, X, y)``.
    """
    X, y = iris.data[:, [2, 3]], iris.target

    # A labelled frame of the same two columns, shown for inspection.
    column_names = iris.feature_names[2:]
    iris_df = pd.DataFrame(iris.data[:, [2, 3]], columns=column_names)
    print(iris_df.head())

    classes_found = str(np.unique(y))
    print('\n' + 'The classes in this data are ' + classes_found)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.3, random_state=0)

    n_train, n_test = X_train.shape[0], X_test.shape[0]
    print('Training set are {} samples and Test set are {} samples'.format(n_train, n_test))
    print()

    return (X_train, X_test, y_train, y_test, iris_df, X, y)
##scale data before training it
def TL():
    """Train a TF-IDF + logistic-regression URL classifier from ``./data/data.csv``.

    Column 0 of the CSV is used as the URL text and column 1 as its label.
    The accuracy on a 20% hold-out is printed.

    Returns:
        tuple: ``(vectorizer, lgs)`` - the fitted ``TfidfVectorizer`` and the
        fitted ``LogisticRegression``.
    """
    allurls = './data/data.csv'  # path to the file holding all urls

    # Malformed rows are skipped. ``error_bad_lines`` was replaced by
    # ``on_bad_lines`` in newer pandas; fall back for older releases.
    try:
        allurlscsv = pd.read_csv(allurls, sep=',', on_bad_lines='warn')
    except TypeError:
        allurlscsv = pd.read_csv(allurls, sep=',', error_bad_lines=False)
    allurlsdata = np.array(pd.DataFrame(allurlscsv))

    # Shuffle rows in place. random.shuffle must not be used on a 2-D array:
    # its swap works on row views and ends up duplicating/losing rows.
    np.random.shuffle(allurlsdata)

    y = [d[1] for d in allurlsdata]  # labels
    corpus = [d[0] for d in allurlsdata]  # url belonging to each label
    vectorizer = TfidfVectorizer(tokenizer=getTokens)  # customised tokenizer
    X = vectorizer.fit_transform(corpus)

    # 80/20 train/test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    lgs = LogisticRegression()
    lgs.fit(X_train, y_train)
    # The original author reports a score of about 98% here.
    print(lgs.score(X_test, y_test))
    return vectorizer, lgs
def de_lda(X,y):
    """Fit an LDA projection on a random 33% sample and return a transformer.

    The number of components is ``min(2000, X.shape[1])``.

    Args:
        X: feature matrix; ``X.shape[1]`` is the feature count.
        y: labels aligned with ``X``.

    Returns:
        callable: ``_func(X1, X2)`` returning both inputs transformed by the
        fitted LDA.
    """
    n_features = X.shape[1]
    n_components = min(2000, n_features)
    lda = LDA(n_components=n_components)

    # Only the "test" part (33% of the rows) of the split is used for fitting.
    _, x_sample, _, y_sample = train_test_split(X, y, test_size=0.33)
    lda.fit(x_sample, y_sample)

    def _func(X1, X2):
        first = lda.transform(X1)
        second = lda.transform(X2)
        return first, second

    return _func
# def de_ps(X,y):
#     """ pearsonr method """
#     dim = X.shape[1]
#     de = min(2000,dim)
#     clf = SelectKBest(Pearsonr , k=de)
#     clf.fit(X,y)
#     def _func(X1,X2):
#         return clf.transform(X1),clf.transform(X2)
#     return _func
def prepare_train_data(self):
    """Load the corpus, index its words and return an 80/20 train/validation split.

    Side effects: sets the vocabulary through ``self.set_volcabulary`` and
    pickles ``(words_index, labels)`` to ``output/zh_comments.pkl``.

    Returns:
        tuple: ``(train_data, train_labels, valid_data, valid_labels)``.
    """
    texts, labels = load_corpus()
    volcabulary, train_words = get_volcabulary_and_list_words(texts)
    self.set_volcabulary(volcabulary)
    # Drop the local references once the vocabulary is stored on self.
    del volcabulary, texts

    words_index = self.get_word_index(train_words, self.volcabulary, self.max_words, self.max_length)

    # Split row indices so data and labels are selected consistently.
    index = np.arange(words_index.shape[0])
    train_index, valid_index = train_test_split(
        index, train_size=0.8, random_state=520)
    train_data = words_index[train_index]
    valid_data = words_index[valid_index]
    labels = np.asarray(labels)
    train_labels = labels[train_index]
    valid_labels = labels[valid_index]
    print(train_data.shape)
    print(valid_data.shape)

    # Context manager: the handle used to be left open, so the data was not
    # guaranteed to be flushed to disk.
    with open("output/zh_comments.pkl", 'wb') as handle:
        pickle.dump((words_index, labels), handle)
    return train_data, train_labels, valid_data, valid_labels
def get_train_test_sets(X, y):
    """Split ``X`` and ``y`` into train and test subsets with default settings.

    Args:
        X: the TF-IDF matrix, one row per document and one column per word
            (typically the output of ``transform_text()`` from TP2).
        y: binary vector whose i-th value tells whether the i-th document is
            spam or ham.

    Returns:
        The value of ``train_test_split(X, y)``: ``X_train``, ``X_test``,
        ``y_train``, ``y_test`` in that order.
    """
    subsets = train_test_split(X, y)
    return subsets
# Ex4.2, 4.3, 4.4
def GDBT_regression(X=train_df_munged,Y=label_df['SalePrice']):
    """Fit a gradient-boosting regressor and plot its hold-out performance.

    The data is split 70/30 (``random_state=0``). Two figures are shown - a
    residual plot and a predicted-vs-real plot - both for the 30% hold-out,
    and the hold-out RMSE is printed.

    Args:
        X: feature data (default bound to ``train_df_munged`` at definition time).
        Y: target data (default bound to ``label_df['SalePrice']`` at
            definition time).

    Returns:
        The fitted ``GradientBoostingRegressor``.
    """
    est = GradientBoostingRegressor(n_estimators=50, max_depth=3, learning_rate=0.1)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    est.fit(X_train, Y_train)
    # Predictions are made on the hold-out part, so they are named and
    # labelled as test data; they used to be labelled "training data".
    y_test_pred = est.predict(X_test)

    # Residual plot: prediction on x, prediction minus real value on y.
    plt.scatter(y_test_pred, y_test_pred - Y_test, c='blue', marker='s', label='error on test data')
    plt.title("Linear regression with GDBT")
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()

    # Predictions plot: real values on x, predictions on y. The axis labels
    # used to be swapped relative to the plotted data.
    plt.scatter(Y_test, y_test_pred, c="blue", marker="s", label="Test data")
    plt.title("Linear regression with GDBT")
    plt.xlabel("Real values")
    plt.ylabel("Predicted values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()

    print('rmse value:', rmse(Y_test, y_test_pred))
    return est
def best_window(self, X_train, y_train, w_min, w_max, t_min,t_max,f_min,f_max):
    """Return the window and forest settings with the highest accuracy.

    For every window ``w`` in ``w_min..w_max`` (inclusive),
    ``self.best_forrest`` is called and the setting with the strictly highest
    accuracy so far is remembered.

    NOTE(review): the data passed to ``best_forrest`` does not depend on
    ``w``; the per-window preprocessing appears to have been disabled.

    Args:
        X_train: feature data forwarded to ``best_forrest``.
        y_train: target data forwarded to ``best_forrest``.
        w_min, w_max: inclusive window range.
        t_min, t_max: tree-count bounds forwarded to ``best_forrest``.
        f_min, f_max: feature-count bounds forwarded to ``best_forrest``.

    Returns:
        tuple: ``(w_opt, t_opt, f_opt)``; all zeros when no window had an
        accuracy above 0.
    """
    best = (0, 0, 0, 0.)  # window, trees, features, accuracy

    for window in range(w_min, w_max + 1):
        trees, features, accuracy = self.best_forrest(X_train, y_train, 10, t_min, t_max, f_min, f_max)
        print(''.join([
            'Window = ', str(window),
            ' days --> Best Forrest : number of trees : ', str(trees),
            ', maximum of features : ', str(features),
            ', with accuracy :', str(accuracy),
        ]))
        # Strict comparison: on ties the earliest window wins.
        if accuracy > best[3]:
            best = (window, trees, features, accuracy)

    w_opt, t_opt, f_opt, accur_opt = best
    print(''.join([
        'Best window : w = ', str(w_opt),
        '. Best Forrest : number of trees : ', str(t_opt),
        ', maximum of features : ', str(f_opt),
        ', with accuracy :', str(accur_opt),
    ]))
    return w_opt, t_opt, f_opt
def prepare_train_test_data(self, data_feature, LabelColumnName):
    """Split every ticker's data 70/30 and stack the parts across tickers.

    Args:
        data_feature: mapping of ticker to a sequence whose first element is
            passed to ``preprocessing_data``.
        LabelColumnName: label column name passed to ``preprocessing_data``.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``. With a single ticker
        its split is returned as is; with several the parts are concatenated
        along axis 0 in iteration order.

    Raises:
        ValueError: when ``data_feature`` is empty (this used to surface as
            an UnboundLocalError).
    """
    X_train_parts, X_test_parts, y_train_parts, y_test_parts = [], [], [], []

    for _ticker, data in data_feature.items():
        X, y = preprocessing_data(self.paras, data[0], LabelColumnName, one_hot_label_proc=False)
        X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X, y, test_size=0.3)
        X_train_parts.append(X_train_temp)
        X_test_parts.append(X_test_temp)
        y_train_parts.append(y_train_temp)
        y_test_parts.append(y_test_temp)

    if not X_train_parts:
        raise ValueError('data_feature is empty: there is nothing to split')

    def _stack(parts):
        # Concatenate once at the end instead of re-copying the growing
        # array on every iteration with np.append.
        return parts[0] if len(parts) == 1 else np.concatenate(parts, axis=0)

    return _stack(X_train_parts), _stack(y_train_parts), _stack(X_test_parts), _stack(y_test_parts)
def prepare_train_test_data(self, data_feature, LabelColumnName):
    """Split every ticker's data 80/20 and stack the parts across tickers.

    Args:
        data_feature: mapping of ticker to a sequence whose first element is
            passed to ``preprocessing_data``.
        LabelColumnName: label column name passed to ``preprocessing_data``.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``. With a single ticker
        its split is returned as is; with several the parts are concatenated
        along axis 0 in iteration order.

    Raises:
        ValueError: when ``data_feature`` is empty (this used to surface as
            an UnboundLocalError).
    """
    collected = {'X_train': [], 'X_test': [], 'y_train': [], 'y_test': []}

    for _ticker, data in data_feature.items():
        X, y = preprocessing_data(self.paras, data[0], LabelColumnName, one_hot_label_proc=False)
        X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X, y, test_size=0.2)
        collected['X_train'].append(X_train_temp)
        collected['X_test'].append(X_test_temp)
        collected['y_train'].append(y_train_temp)
        collected['y_test'].append(y_test_temp)

    if not collected['X_train']:
        raise ValueError('data_feature is empty: there is nothing to split')

    def _stack(parts):
        # Concatenate once at the end instead of re-copying the growing
        # array on every iteration with np.append.
        return parts[0] if len(parts) == 1 else np.concatenate(parts, axis=0)

    return (_stack(collected['X_train']), _stack(collected['y_train']),
            _stack(collected['X_test']), _stack(collected['y_test']))
def prepare_train_test_data(self, data_feature, LabelColumnName):
    """Split every ticker's data 80/20 and stack the parts across tickers.

    ``preprocessing_data`` is called with ``array_format=False``.
    NOTE(review): X is presumably a pandas DataFrame in that mode, since the
    previous code called ``.append(..., ignore_index=True)`` on it - confirm.

    Args:
        data_feature: mapping of ticker to a sequence whose first element is
            passed to ``preprocessing_data``.
        LabelColumnName: label column name passed to ``preprocessing_data``.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``. With a single ticker
        its split is returned as is; with several, the X parts are
        concatenated with a fresh index and the y parts along axis 0.

    Raises:
        ValueError: when ``data_feature`` is empty (this used to surface as
            an UnboundLocalError).
    """
    import pandas as pd

    X_train_parts, X_test_parts, y_train_parts, y_test_parts = [], [], [], []

    for _ticker, data in data_feature.items():
        X, y = preprocessing_data(self.paras, data[0], LabelColumnName, one_hot_label_proc=False, array_format=False)
        X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X, y, test_size=0.2)
        X_train_parts.append(X_train_temp)
        X_test_parts.append(X_test_temp)
        y_train_parts.append(y_train_temp)
        y_test_parts.append(y_test_temp)

    if not X_train_parts:
        raise ValueError('data_feature is empty: there is nothing to split')

    if len(X_train_parts) == 1:
        return X_train_parts[0], y_train_parts[0], X_test_parts[0], y_test_parts[0]

    # DataFrame.append returns a NEW frame; its result used to be discarded,
    # so X kept only the first ticker while y grew with every ticker. The
    # method is also gone from pandas 2, so pd.concat is used instead.
    X_train = pd.concat(X_train_parts, ignore_index=True)
    X_test = pd.concat(X_test_parts, ignore_index=True)
    y_train = np.concatenate(y_train_parts, axis=0)
    y_test = np.concatenate(y_test_parts, axis=0)
    return X_train, y_train, X_test, y_test
def return_train_dataset(self):
    """Return the training features and labels.

    The main dataset is fetched with ``self.return_main_dataset()``. When
    ``self.test_dataset['method']`` is ``'split_from_main'``, a stratified
    test part (sized by ``split_ratio``, seeded by ``split_seed``) is removed
    and only the remaining training part is returned.

    Returns:
        tuple: ``(X, y)`` - features and labels.
    """
    X, y = self.return_main_dataset()

    if self.test_dataset['method'] != 'split_from_main':
        return X, y

    # The test part is carved out of the main data; keep only the rest.
    X_kept, _, y_kept, _ = train_test_split(
        X,
        y,
        test_size=self.test_dataset['split_ratio'],
        random_state=self.test_dataset['split_seed'],
        stratify=y
    )
    return X_kept, y_kept
def gen_train_data(dataset_paths):
    """Load images and density maps, keep 80% and augment them into training slices.

    The kept samples go through, in order: ``multiscale_pyramidal``,
    ``generate_slices`` (module-level ``patch_w``/``patch_h``, offset 8),
    ``flip_slices``, ``samples_distribution`` and ``shuffle_slices``. The
    sample count is printed before the first stage and after each of the
    first four stages.

    Args:
        dataset_paths: iterable of paths given to ``load_images_and_gts``.

    Returns:
        tuple: ``(X_train, Y_train)`` after all stages.
    """
    all_images = []
    all_densities = []
    for path in dataset_paths:
        images, gts, densities = load_images_and_gts(path)
        all_images.extend(images)
        all_densities.extend(densities)

    from sklearn.model_selection import train_test_split
    # The 20% test part of this split is not used any further here.
    X_train, _, Y_train, _ = train_test_split(all_images, all_densities, test_size = 0.2)

    print(len(X_train))
    X_train, Y_train = multiscale_pyramidal(X_train, Y_train)
    print(len(X_train))
    X_train, Y_train = generate_slices(X_train, Y_train, slice_w = patch_w, slice_h = patch_h, offset = 8)
    print(len(X_train))
    X_train, Y_train = flip_slices(X_train, Y_train)
    print(len(X_train))
    X_train, Y_train = samples_distribution(X_train,Y_train)
    print(len(X_train))
    X_train, Y_train = shuffle_slices(X_train, Y_train)

    return X_train, Y_train
def main(unused_argv):
    """Convert the comma-separated train/test files into three TFRecord sets.

    Column 0 of each file is the integer label and the remaining columns are
    the features. The training file is further split into train and
    validation parts (``FLAGS.validation_ratio``, ``random_state=100``).

    Args:
        unused_argv: ignored.
    """
    train_table = np.loadtxt(os.path.join(FLAGS.input_directory, 'train'), delimiter=',')
    test_table = np.loadtxt(os.path.join(FLAGS.input_directory, 'test'), delimiter=',')

    train_features = train_table[:, 1:]
    train_labels = train_table[:, 0].astype(np.int32)
    X_train, X_val, y_train, y_val = train_test_split(
        train_features, train_labels, test_size=FLAGS.validation_ratio, random_state=100)

    X_test = test_table[:, 1:]
    y_test = test_table[:, 0].astype(np.int32)

    # Write each part under its own suffix.
    convert_to((X_train, y_train), PREFIX + '_train')
    convert_to((X_val, y_val), PREFIX + '_validation')
    convert_to((X_test, y_test), PREFIX + '_test')
def classification():
    """Train a gradient-boosting classifier on synthetic data and print its ROC AUC.

    A random binary problem (350 samples, 15 features, 10 informative) is
    generated, 15% is held out, and the hold-out predictions, their minimum
    and maximum, and the ROC AUC are printed.
    """
    features, targets = make_classification(
        n_samples=350,
        n_features=15,
        n_informative=10,
        random_state=1111,
        n_classes=2,
        class_sep=1.,
        n_redundant=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.15, random_state=1111)

    model = GradientBoostingClassifier(
        n_estimators=50,
        max_depth=4,
        max_features=8,
        learning_rate=0.1,
    )
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    print(predictions)
    print(predictions.min())
    print(predictions.max())

    auc = roc_auc_score(y_test, predictions)
    print('classification, roc auc score: %s' % auc)