Python sklearn.model_selection 模块，train_test_split() 实例源码

我们从Python开源项目中，提取了以下50个代码示例，用于说明如何使用sklearn.model_selection.train_test_split()。

项目：triage 作者：dssg | 项目源码 | 文件源码

def trained_models():
    """Fit several scikit-learn classifiers on the breast-cancer dataset.

    The data is split 70/30 with a fixed seed and every estimator is fitted
    on the training part only; the test part is not used here.

    Returns:
        dict mapping a short label to the corresponding fitted estimator.
    """
    cancer = datasets.load_breast_cancer()
    features_train, _features_test, target_train, _target_test = train_test_split(
        cancer.data, cancer.target, test_size=0.3, random_state=12345)

    def fit_on_train(estimator):
        # Fit in place and hand the estimator back so calls can be chained.
        estimator.fit(features_train, target_train)
        return estimator

    forest = fit_on_train(RandomForestClassifier())
    logistic = fit_on_train(LogisticRegression())
    linear_svc = fit_on_train(SVC(kernel='linear'))
    default_svc = fit_on_train(SVC())
    baseline = fit_on_train(DummyClassifier())

    return {
        'RF': forest,
        'LR': logistic,
        'SVC_w_linear_kernel': linear_svc,
        'Dummy': baseline,
        'SVC_wo_linear_kernel': default_svc,
    }

项目：texta 作者：texta-tk | 项目源码 | 文件源码

def train_model_with_cv(model, params, X, y):
    """Tune ``model`` with a grid search and score the winner on held-out data.

    20% of the samples are set aside for the final evaluation; the remaining
    samples feed a 5-fold ``GridSearchCV`` over ``params``.

    Args:
        model: estimator handed to ``GridSearchCV``.
        params: parameter grid handed to ``GridSearchCV``.
        X: feature data.
        y: target data.

    Returns:
        tuple ``(best_estimator, statistics)``; ``statistics`` is a dict with
        the keys ``f1_score``, ``confusion_matrix``, ``precision`` and
        ``recall`` computed on the held-out samples.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(X, y, test_size=0.20)

    # Parameter selection only ever sees the fitting part of the data.
    search = GridSearchCV(model, params, n_jobs=1, cv=5).fit(X_fit, y_fit)
    best_model = search.best_estimator_

    # Final evaluation of the selected model on the untouched part.
    predictions = best_model.predict(X_holdout)

    # F1 is micro-averaged; precision and recall use the scorers' defaults.
    statistics = {
        'f1_score': f1_score(y_holdout, predictions, average='micro'),
        'confusion_matrix': confusion_matrix(y_holdout, predictions),
        'precision': precision_score(y_holdout, predictions),
        'recall': recall_score(y_holdout, predictions),
    }

    return best_model, statistics

项目：PortfolioTimeSeriesAnalysis 作者：MizioAnd | 项目源码 | 文件源码

def outlier_identification(self, model, x_train, y_train):
        """Drop held-out samples whose absolute residual reaches the hold-out RMSE.

        The training data is split once, ``model`` is fitted on the first part
        and used to predict the second. Samples used for fitting are always
        kept; a held-out sample is removed when its absolute residual is at
        least ``self.rmse(predictions, held-out targets)``.

        Returns:
            tuple ``(x, y)`` with the retained samples, ordered as the fitting
            part followed by the surviving held-out part.
        """
        # Carve an extra evaluation set out of the training data.
        x_fit, x_held, y_fit, y_held = train_test_split(x_train, y_train)
        print('\nOutlier shapes')
        print(np.shape(x_fit), np.shape(x_held), np.shape(y_fit), np.shape(y_held))

        model.fit(x_fit, y_fit)
        predictions = model.predict(x_held)
        abs_errors = np.absolute(predictions - y_held)
        rmse_threshold = self.rmse(predictions, y_held)

        n_fit = np.shape(y_fit)[0]
        held_is_outlier = abs_errors >= rmse_threshold
        # Fitting samples get an all-False prefix: they can never be flagged.
        is_outlier = np.concatenate([np.zeros((n_fit,), dtype=bool), held_is_outlier])
        keep = is_outlier == 0

        # The split was random, so rebuild the data in the same
        # (fitting part, held-out part) order the mask was built in.
        x_all = np.insert(x_fit, np.shape(x_fit)[0], x_held, axis=0)
        y_all = np.insert(y_fit, n_fit, y_held, axis=0)
        return x_all[keep, ], y_all[keep, ]

项目：PortfolioTimeSeriesAnalysis 作者：MizioAnd | 项目源码 | 文件源码

def predicted_vs_actual_y_xgb(self, xgb, best_nrounds, xgb_params, x_train_split, x_test_split, y_train_split,
                                  y_test_split, title_name):
        """Train a booster on a ready-made split and plot predictions against truth.

        The caller supplies the train/test split; no splitting happens here.
        A scatter plot of actual versus predicted targets is drawn on a new
        figure, with the RMSE from ``self.rmse`` in the title and a diagonal
        reference line spanning the range of the actual values.

        Args:
            xgb: object providing ``DMatrix`` and ``train`` (presumably the
                xgboost module - confirm against the caller).
            best_nrounds: number of boosting rounds passed to ``xgb.train``.
            xgb_params: booster parameters passed to ``xgb.train``.
            title_name: prefix for the plot title.
        """
        train_matrix = xgb.DMatrix(x_train_split, label=y_train_split)
        test_matrix = xgb.DMatrix(x_test_split)
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))

        booster = xgb.train(xgb_params, train_matrix, best_nrounds)
        y_predicted = booster.predict(test_matrix)

        plt.figure(figsize=(10, 5))
        plt.scatter(y_test_split, y_predicted, s=20)
        rmse_value = self.rmse(y_predicted, y_test_split)
        plt.title(title_name + ', Predicted vs. Actual.' + ' rmse = ' + str(rmse_value))
        plt.xlabel('Actual y')
        plt.ylabel('Predicted y')

        # Diagonal on which a perfect prediction would fall.
        y_low, y_high = min(y_test_split), max(y_test_split)
        plt.plot([y_low, y_high], [y_low, y_high])
        plt.tight_layout()

项目：auto_ml 作者：doordash | 项目源码 | 文件源码

def test_calibrate_final_model_classification():
    """Train a calibrated titanic classifier and check its score band."""
    np.random.seed(0)

    train_df, holdout_df = utils.get_titanic_binary_classification_dataset()

    # Reserve 33% of the held-out rows for calibrating the final model.
    holdout_df, calibration_df = train_test_split(holdout_df, test_size=0.33, random_state=42)

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical',
    }

    predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)
    predictor.train(
        train_df,
        calibrate_final_model=True,
        X_test=calibration_df,
        y_test=calibration_df.survived,
    )

    test_score = predictor.score(holdout_df, holdout_df.survived)

    print('test_score')
    print(test_score)

    lower_bound, upper_bound = -0.215, -0.17
    assert lower_bound < test_score < upper_bound

项目：auto_ml 作者：doordash | 项目源码 | 文件源码

def get_titanic_binary_classification_dataset(basic=True):
    """Load the titanic data and return a train/test split of it.

    The CSV is read from ``tests/titanic.csv``; if that fails it is
    downloaded and written back to that path for later runs.

    Args:
        basic: when equal to ``True``, the ``name``, ``ticket``, ``cabin`` and
            ``home.dest`` columns are dropped in addition to ``boat`` and
            ``body``.

    Returns:
        tuple ``(train_frame, test_frame)`` from a seeded 67/33 split.
    """
    local_csv = os.path.join('tests', 'titanic.csv')
    try:
        titanic = pd.read_csv(local_csv)
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
        titanic = pd.read_csv(dataset_url)
        # index=False keeps pandas' automatically created index out of the cache.
        titanic.to_csv(local_csv, index=False)

    titanic = titanic.drop(['boat', 'body'], axis=1)

    if basic == True:
        extra_columns = ['name', 'ticket', 'cabin', 'home.dest']
        titanic = titanic.drop(extra_columns, axis=1)

    return tuple(train_test_split(titanic, test_size=0.33, random_state=42))

项目：auto_ml 作者：doordash | 项目源码 | 文件源码

def get_twitter_sentiment_multilabel_classification_dataset():
    """Load the airline-sentiment tweets and return a train/test split.

    The CSV is read from ``tests/twitter_sentiment.csv``; if that fails it is
    downloaded and written back to that path for later runs. Only a random
    10% sample of the rows is kept, and ``tweet_created`` is parsed into
    datetimes before splitting.

    Returns:
        tuple ``(df_twitter_train, df_twitter_test)`` from a seeded 67/33
        split of the sampled rows.
    """
    file_name = os.path.join('tests', 'twitter_sentiment.csv')

    try:
        # Pass the path rather than an ``open(file_name, 'rU')`` handle: the
        # handle was never closed, and the 'U' mode flag is rejected by
        # Python 3.11+, which sent every call down the download branch.
        df_twitter = pd.read_csv(file_name, encoding='utf-8', engine='python')
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        df_twitter = pd.read_csv(dataset_url)
        # index=False keeps pandas' automatically created index out of the cache.
        df_twitter.to_csv(file_name, index=False)

    # Grab only 10% of the dataset - runs much faster this way. No
    # random_state is passed, so the subset depends on NumPy's global RNG.
    df_twitter = df_twitter.sample(frac=0.1)

    df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)

    df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
    return df_twitter_train, df_twitter_test

项目：stacker 作者：bamine | 项目源码 | 文件源码

def __init__(self, name, X, y, task, test_size=None, cv=None, random_state=42):
        """Store the dataset and configure how it will be validated.

        Args:
            name: label stored on the instance.
            X: feature data.
            y: target data.
            task: ``"regression"`` or ``"classification"``; selects the fold
                generator when ``cv`` is used.
            test_size: if not ``None``, a single hold-out split is created
                and ``cv`` is ignored.
            cv: number of folds, used only when ``test_size`` is ``None``.
            random_state: seed for the hold-out split and for the shuffled
                stratified folds.

        When both ``test_size`` and ``cv`` are ``None`` no validation
        attributes are set. When ``cv`` is given with any other ``task``
        value, ``self.kfold`` is left unset.
        """
        self.name = name
        self.X = X
        self.y = y
        self.task = task
        self.random_state = random_state
        if test_size is not None:
            self.test_size = test_size
            self.validation_method = "train_test_split"
            self.X_train, self.X_test, self.y_train, self.y_test = \
                model_selection.train_test_split(self.X, self.y, test_size=test_size, random_state=random_state)
        elif cv is not None:
            self.validation_method = "cv"
            if task == "regression":
                # The folds stay unshuffled, exactly as before. random_state
                # is not forwarded: it only matters together with
                # shuffle=True, and current scikit-learn raises ValueError
                # when it is set on an unshuffled KFold.
                self.kfold = model_selection.KFold(n_splits=cv)
            elif task == "classification":
                self.kfold = model_selection.StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)

项目：tianchi_power 作者：lvniqi | 项目源码 | 文件源码

def crate_pre_train_model(x_, y_):
    """Train an xgboost regressor with early stopping on a 10% evaluation set.

    Features and labels are split by two separate calls that share the same
    ``random_state``. Training runs for up to 5000 rounds, stops after 100
    rounds without improvement, and uses ``evalerror`` as the evaluation
    function.

    Returns:
        the object returned by ``xgb.train``.
    """
    features_train, features_eval = train_test_split(x_, test_size=0.1, random_state=1)
    labels_train, labels_eval = train_test_split(y_, test_size=0.1, random_state=1)

    train_matrix = xgb.DMatrix(features_train, label=labels_train)
    eval_matrix = xgb.DMatrix(features_eval, label=labels_eval)
    watchlist = [(train_matrix, 'train'), (eval_matrix, 'eval')]

    booster_params = {'objective': 'reg:linear', 'max_depth': 3, 'nthread': 64}
    max_rounds = 5000

    return xgb.train(
        booster_params.items(),
        train_matrix,
        max_rounds,
        watchlist,
        early_stopping_rounds=100,
        feval=evalerror,
    )

# %% main

项目：software-suite-movie-market-analysis 作者：93lorenzo | 项目源码 | 文件源码

def readData():
    """Build feature rows and integer rating labels from ``gson`` and split them.

    Each entry of the module-level ``gson`` mapping contributes one row
    ``[1, mean_actors, mean_directors, mean_writers]`` (the means come from
    ``calcolaMedie``) and one label, the IMDb rating truncated to an int.
    Entries that raise any exception while being processed are skipped.

    Returns:
        tuple ``(train_data, train_labels, test_data, test_labels)`` with 40%
        of the samples in the training part.
    """
    feature_rows = []
    ratings = []
    for movie_id in gson:
        try:
            # assumes gson.get is a plain lookup, so one call per entry is
            # equivalent to repeating it for every field
            record = gson.get(movie_id)
            cast = record.get("actors")
            directors = record.get("director")
            writers = record.get("writer")
            rating = int(float(record.get("imdbRating")))
            mean_act, mean_dir, mean_wri = calcolaMedie(cast, directors, writers)
            feature_rows.append([1, mean_act, mean_dir, mean_wri])
            # The rating was cast to int above: labels are discrete classes.
            ratings.append(rating)
        except Exception:
            continue
    data = np.array(feature_rows)
    labels = np.array(ratings)
    train_data, test_data, train_labels, test_labels = train_test_split(data, labels, train_size=0.4)
    return train_data, train_labels, test_data, test_labels

项目：software-suite-movie-market-analysis 作者：93lorenzo | 项目源码 | 文件源码

def readData(self):
        """Build feature rows and integer rating labels from ``gson`` and split them.

        Each entry of the module-level ``gson`` mapping contributes one row
        ``[1, mean_actors, mean_directors, mean_writers]`` (the means come
        from ``self.calcolaMedie``) and one label, the IMDb rating truncated
        to an int. Errors while processing an entry are not caught.

        Returns:
            tuple ``(train_data, train_labels, test_data, test_labels)`` with
            10% of the samples in the training part.
        """
        feature_rows = []
        ratings = []
        for movie_id in gson:
            # assumes gson.get is a plain lookup, so one call per entry is
            # equivalent to repeating it for every field
            record = gson.get(movie_id)
            cast = record.get("actors")
            directors = record.get("director")
            writers = record.get("writer")
            rating = int(float(record.get("imdbRating")))
            mean_act, mean_dir, mean_wri = self.calcolaMedie(cast, directors, writers)
            feature_rows.append([1, mean_act, mean_dir, mean_wri])
            # The rating was cast to int above: labels are discrete classes.
            ratings.append(rating)
        data = np.array(feature_rows)
        labels = np.array(ratings)
        train_data, test_data, train_labels, test_labels = train_test_split(data, labels, train_size=0.1)
        return train_data, train_labels, test_data, test_labels

项目：UrbanSearch 作者：urbansearchTUD | 项目源码 | 文件源码

def metrics_equal():
    """Train on the generated "equal" dataset and return a classification report.

    The dataset is produced and loaded through ``dpu``, split with a fixed
    seed, and used to train an ``SGDCModelManager``. The report for the test
    part is returned through ``jsonify`` with ``status=200``.
    """
    dataset = dpu.load(dpu.generate_equal_dataset())
    manager = SGDCModelManager()

    split = train_test_split(dataset['inputs'], dataset['outputs'], random_state=42)
    manager.x_train, manager.x_test, manager.y_train, manager.y_test = split
    manager.train()

    predicts = manager.predict(manager.x_test)
    report = classification_report(manager.y_test, predicts)

    return jsonify(status=200, message=report)

项目：UrbanSearch 作者：urbansearchTUD | 项目源码 | 文件源码

def probabilities_equal():
    """Train on the generated "equal" dataset and return test-set probabilities.

    The dataset is produced and loaded through ``dpu``, split with a fixed
    seed, and used to train an ``SGDCModelManager``. For every test sample
    the response lists the probabilities from ``manager.probabilities`` next
    to the sample's category, returned through ``jsonify`` with
    ``status=200``.
    """
    dataset = dpu.load(dpu.generate_equal_dataset())
    manager = SGDCModelManager()

    split = train_test_split(dataset['inputs'], dataset['outputs'], random_state=42)
    manager.x_train, manager.x_test, manager.y_train, manager.y_test = split
    manager.train()
    probabilities = manager.probabilities(manager.x_test)

    # Positional indexing is kept on purpose: it matches row i of the
    # probabilities with entry i of the test targets.
    result = [
        {
            'probabilities': list(probabilities[i]),
            'category': manager.y_test[i]
        }
        for i in range(len(manager.y_test))
    ]

    return jsonify(status=200, result=result)

项目：pyVSR 作者：georgesterpu | 项目源码 | 文件源码

def _preload_files_single_volunteer(dataset_dir, speaker_id, view_id, utterance_types):
    """Select one speaker's videos for one view and split them 70/30.

    Lines of ``splits/allVideos.txt`` (relative to ``_current_path``) are
    kept when they contain ``s<speaker_id>_``, ``v<view_id>`` and at least
    one entry of the list built by ``_gen_utterance_list``. Each kept line is
    joined onto ``dataset_dir``.

    Returns:
        tuple ``(train, test)`` of path lists from a seeded split.
    """
    listing_file = path.join(_current_path, 'splits/allVideos.txt')

    utterances = _gen_utterance_list(utterance_types)

    with open(listing_file, 'r') as f:
        entries = f.read().splitlines()

    speaker_tag = 's' + str(speaker_id) + '_'
    view_tag = 'v' + str(view_id)

    video_list = []
    for entry in entries:
        if speaker_tag not in entry:
            continue
        if view_tag not in entry:
            continue
        if not any(u in entry for u in utterances):
            continue
        video_list.append(path.join(dataset_dir, entry))

    from sklearn.model_selection import train_test_split
    train, test = train_test_split(video_list, test_size=0.30, random_state=0)

    return train, test

项目：Controller-Hand 作者：ardamavi | 项目源码 | 文件源码

def get_dataset(dataset_path='Data/Train_Data'):
    """Load the image dataset, using cached ``.npy`` files when they exist.

    The arrays are first read from ``Data/npy_train_data``. If that fails
    they are rebuilt from ``dataset_path``, where every sub-directory name is
    an integer label and every file inside it is read with ``get_img``; the
    rebuilt arrays are then written to the cache directory.

    Args:
        dataset_path: directory containing one sub-directory per label.

    Returns:
        tuple ``(X, X_test, Y, Y_test)`` from a seeded 90/10 split.
    """
    cache_dir = 'Data/npy_train_data/'
    try:
        X = np.load('Data/npy_train_data/X.npy')
        Y = np.load('Data/npy_train_data/Y.npy')
    except Exception:
        # Any failure to read the cache triggers a rebuild. This was a bare
        # ``except:``, which also swallowed KeyboardInterrupt and SystemExit.
        labels = listdir(dataset_path)  # Getting labels
        X = []
        Y = []
        for label in labels:
            datas_path = dataset_path+'/'+label
            for data in listdir(datas_path):
                img = get_img(datas_path+'/'+data)
                X.append(img)
                Y.append(int(label))
        # Create dataset: pixel values are scaled by 1/255.
        X = np.array(X).astype('float32')/255.
        Y = np.array(Y).astype('float32')
        # presumably one-hot encoding into 2 classes - confirm against the
        # to_categorical import
        Y = to_categorical(Y, 2)
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        np.save('Data/npy_train_data/X.npy', X)
        np.save('Data/npy_train_data/Y.npy', Y)
    X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)
    return X, X_test, Y, Y_test

项目：nelpy 作者：nelpy | 项目源码 | 文件源码

def _preprocess_PBEs(self, PBE_idx=None):
        """Bin the spike train into PBEs and divide them into train and test sets.

        Used for most types of shuffles.

        Args:
            PBE_idx: optional pair ``(train_indices, test_indices)``. When
                omitted, the epoch indices are split with
                ``train_test_split`` using ``self._test_size`` and
                ``self._random_state`` (seed 1 if that is ``None``).

        Raises:
            ValueError: if the binned data has exactly one epoch.
        """
        # compute PBEs
        self.PBEs = self._st.bin(ds=self._ds)

        if self.PBEs.n_epochs == 1:
            raise ValueError("spike train is continuous, and does not have more than one event!")

        if PBE_idx is None:
            # No indices supplied: draw a seeded split over all epoch indices.
            seed = 1 if self._random_state is None else self._random_state
            self._trainidx, self._testidx = train_test_split(
                np.arange(self.PBEs.n_epochs),
                test_size=self._test_size,
                random_state=seed)
        else:
            self._trainidx, self._testidx = PBE_idx  # tuple unpacking

        # Both index sets are sorted in place before being used to index.
        self._trainidx.sort()
        self._testidx.sort()

        self.PBEs_train = self.PBEs[self._trainidx]
        self.PBEs_test = self.PBEs[self._testidx]

项目：gcForest 作者：kingfengji | 项目源码 | 文件源码

def load_data():
    """Read the UCI yeast files and return a stratified 70/30 split.

    ``yeast.label`` supplies an ``id name`` pair per line. ``yeast.data`` is
    split on runs of spaces: fields 1 to 8 of each line become the features
    and the last field is the label name.

    Returns:
        tuple ``((X_train, y_train), (X_test, y_test))``.
    """
    def yeast_file(file_name):
        # Absolute path of a file inside the yeast dataset directory.
        return osp.abspath(osp.join(get_dataset_base(), "uci_yeast", file_name))

    id2label = {}
    label2id = {}
    with open(yeast_file("yeast.label")) as f:
        for line in f:
            fields = line.strip().split(" ")
            label_name = fields[1]
            label_id = int(fields[0])
            id2label[label_id] = label_name
            label2id[label_name] = label_id

    with open(yeast_file("yeast.data")) as f:
        lines = f.readlines()

    n_datas = len(lines)
    X = np.zeros((n_datas, 8), dtype=np.float32)
    y = np.zeros(n_datas, dtype=np.int32)
    for row_no, line in enumerate(lines):
        fields = re.split(" +", line.strip())
        # Field 0 is not a feature; the next eight fields are.
        X[row_no, :] = [float(value) for value in fields[1:9]]
        y[row_no] = label2id[fields[-1]]

    train_idx, test_idx = train_test_split(range(n_datas), random_state=0, train_size=0.7, stratify=y)
    train_part = (X[train_idx], y[train_idx])
    test_part = (X[test_idx], y[test_idx])
    return train_part, test_part

项目：models 作者：bureaucratic-labs | 项目源码 | 文件源码

def get_train_data(corpus, count=None, **kwargs):
    """Build features and labels from a corpus and split them.

    Args:
        corpus: object whose ``iter_documents()`` yields documents providing
            ``raw()`` and ``raw_sents()``.
        count: optional cap on the number of documents read. A falsy value
            means no cap. It was previously read without being a parameter,
            so the function depended on an unrelated global name.
        **kwargs: forwarded unchanged to ``train_test_split``.

    Returns:
        whatever ``train_test_split(X, y, **kwargs)`` returns.
    """
    X = []
    y = []

    documents = corpus.iter_documents()

    if count:
        documents = islice(documents, count)

    for document in tqdm(documents):
        try:
            text = document.raw()
            sents = document.raw_sents()

            labels = text2labels(text, sents)
            features = sent2features(text)

            X.append(features)
            y.append(labels)
        except Exception:
            # TODO: decide how failures should be reported. For now a
            # document that cannot be processed is skipped, as before.
            continue

    return train_test_split(X, y, **kwargs)

项目：models 作者：bureaucratic-labs | 项目源码 | 文件源码

def get_pos_train_data(corpus, count=None, **kwargs):
    """Build per-sentence POS features and labels from a corpus and split them.

    Args:
        corpus: object whose ``iter_documents()`` yields documents providing
            ``iter_tagged_sents()``; each sentence yields ``(token, tags)``
            pairs with ``tags`` a comma-separated string.
        count: optional cap on the number of documents read. A falsy value
            means no cap.
        **kwargs: forwarded unchanged to ``train_test_split``.

    Returns:
        whatever ``train_test_split(X, y, **kwargs)`` returns.
    """
    X = []
    y = []

    documents = corpus.iter_documents()
    if count:
        documents = islice(documents, count)

    for document in tqdm(documents):
        for sentence in document.iter_tagged_sents():
            words = []
            first_tags = []
            for word, tag_string in sentence:
                tag_list = tag_string.split(',')
                words.append(word)
                # TODO: only the first of the comma-separated tags is used.
                first_tags.append(tag_list[0])
            X.append(sent2posfeatures(words))
            y.append(first_tags)

    return train_test_split(X, y, **kwargs)

项目：models 作者：bureaucratic-labs | 项目源码 | 文件源码

def get_train_data(corpus, count=None, **kwargs):
    """Build features and labels from a corpus and split them.

    Args:
        corpus: object whose ``iter_documents()`` yields documents providing
            ``raw()`` and ``words()``.
        count: optional cap on the number of documents read. A falsy value
            means no cap.
        **kwargs: forwarded unchanged to ``train_test_split``.

    Returns:
        whatever ``train_test_split(X, y, **kwargs)`` returns.
    """
    X = []
    y = []

    source = corpus.iter_documents()
    if count:
        source = islice(source, count)

    for doc in tqdm(source):
        try:
            raw_text = doc.raw()
            doc_words = doc.words()

            doc_labels = text2labels(raw_text, doc_words)
            doc_features = list(text2features(raw_text))

            X.append(doc_features)
            y.append(doc_labels)
        except Exception as exc:
            # TODO: a document that fails to process is skipped silently.
            continue

    return train_test_split(X, y, **kwargs)

项目：main 作者：rmkemker | 项目源码 | 文件源码

def train_test_split_per_class(X, y, train_size=None, test_size=None):
    """Split every class separately and stack the per-class results.

    Class ids run from 0 to ``max(y)``, as counted by ``np.bincount``. Each
    class present in ``y`` is split on its own with ``train_test_split``, and
    the pieces are concatenated in ascending class order. Class ids with no
    samples (gaps in the labels) are skipped instead of being handed to
    ``train_test_split``, which rejects empty input.

    Args:
        X: array whose first axis indexes samples.
        y: 1-D array of non-negative integer class ids, one per sample.
        train_size: forwarded to ``train_test_split``.
        test_size: forwarded to ``train_test_split``.

    Returns:
        tuple ``(X_train, X_test, y_train, y_test)`` of arrays with the
        dtypes of ``X`` and ``y``.
    """
    num_classes = len(np.bincount(y))

    # Seed each list with an empty array of the right shape and dtype so the
    # result is well-formed even when no class contributes anything.
    empty_X = np.zeros((0,) + tuple(X.shape[1:]), dtype=X.dtype)
    empty_y = np.zeros(0, dtype=y.dtype)
    X_train_parts, X_test_parts = [empty_X], [empty_X]
    y_train_parts, y_test_parts = [empty_y], [empty_y]

    for class_id in range(num_classes):
        in_class = y == class_id
        if not in_class.any():
            # Gap in the label range: nothing to split for this id.
            continue

        X_train, X_test, y_train, y_test = train_test_split(X[in_class], y[in_class],
                                                            train_size=train_size,
                                                            test_size=test_size)
        X_train_parts.append(X_train)
        X_test_parts.append(X_test)
        y_train_parts.append(y_train)
        y_test_parts.append(y_test)

    # One concatenation per output instead of re-copying inside the loop.
    return (np.concatenate(X_train_parts, axis=0),
            np.concatenate(X_test_parts, axis=0),
            np.concatenate(y_train_parts),
            np.concatenate(y_test_parts))

项目：HousePrices 作者：MizioAnd | 项目源码 | 文件源码

def outlier_identification(self, model, x_train, y_train):
        """Drop held-out samples whose absolute residual reaches the hold-out RMSE.

        The training data is split once, ``model`` is fitted on the first part
        and used to predict the second. Samples used for fitting are always
        kept; a held-out sample is removed when its absolute residual is at
        least ``self.rmse(predictions, held-out targets)``.

        Returns:
            tuple ``(x, y)`` with the retained samples, ordered as the fitting
            part followed by the surviving held-out part.
        """
        # Carve an extra evaluation set out of the training data.
        x_fit, x_held, y_fit, y_held = train_test_split(x_train, y_train)
        print('\nOutlier shapes')
        print(np.shape(x_fit), np.shape(x_held), np.shape(y_fit), np.shape(y_held))

        model.fit(x_fit, y_fit)
        predictions = model.predict(x_held)
        abs_errors = np.absolute(predictions - y_held)
        rmse_threshold = self.rmse(predictions, y_held)

        n_fit = np.shape(y_fit)[0]
        held_is_outlier = abs_errors >= rmse_threshold
        # Fitting samples get an all-False prefix: they can never be flagged.
        never_flagged = np.zeros((n_fit,), dtype=bool)
        is_outlier = np.concatenate([never_flagged, held_is_outlier])
        keep = is_outlier == 0

        # The split was random, so rebuild the data in the same
        # (fitting part, held-out part) order the mask was built in.
        x_all = np.insert(x_fit, np.shape(x_fit)[0], x_held, axis=0)
        y_all = np.insert(y_fit, n_fit, y_held, axis=0)
        return x_all[keep, ], y_all[keep, ]

项目：HousePrices 作者：MizioAnd | 项目源码 | 文件源码

def predicted_vs_actual_sale_price(self, x_train, y_train, title_name):
        """Fit a cross-validated lasso on a fresh split and plot predictions.

        The training data is split once, a ``LassoCV`` (10 folds, fixed alpha
        grid) is fitted on the first part and used to predict the second. A
        scatter plot of actual versus predicted values is drawn on a new
        figure, with the RMSE from ``self.rmse`` in the title and a diagonal
        reference line spanning the range of the actual values.

        Args:
            x_train: feature data to split.
            y_train: target data to split.
            title_name: prefix for the plot title.
        """
        # Carve an extra evaluation set out of the training data.
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))

        alpha_grid = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
                      0.3, 0.6, 1]
        lasso = LassoCV(alphas=alpha_grid, max_iter=50000, cv=10)

        lasso.fit(x_train_split, y_train_split)
        y_predicted = lasso.predict(X=x_test_split)

        plt.figure(figsize=(10, 5))
        plt.scatter(y_test_split, y_predicted, s=20)
        rmse_value = self.rmse(y_predicted, y_test_split)
        plt.title(title_name + ', Predicted vs. Actual.' + ' rmse = ' + str(rmse_value))
        plt.xlabel('Actual Sale Price')
        plt.ylabel('Predicted Sale Price')

        # Diagonal on which a perfect prediction would fall.
        price_low, price_high = min(y_test_split), max(y_test_split)
        plt.plot([price_low, price_high], [price_low, price_high])
        plt.tight_layout()

项目：HousePrices 作者：MizioAnd | 项目源码 | 文件源码

def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
        """Train an xgboost model on a fresh split and plot its predictions.

        The training data is split once. The number of boosting rounds is
        taken from a 4-fold ``xgb.cv`` run with early stopping (number of
        result rows minus one). A booster trained with that many rounds
        predicts the held-out part, and a scatter plot of actual versus
        predicted values is drawn on a new figure with the RMSE from
        ``self.rmse`` in the title.

        Args:
            xgb_params: booster parameters for ``xgb.cv`` and ``xgb.train``.
            x_train: feature data to split.
            y_train: target data to split.
            seed: seed passed to ``xgb.cv``.
            title_name: prefix for the plot title.
        """
        # Carve an extra evaluation set out of the training data.
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
        train_matrix = xgb.DMatrix(x_train_split, label=y_train_split)
        test_matrix = xgb.DMatrix(x_test_split)

        cv_history = xgb.cv(
            xgb_params,
            train_matrix,
            num_boost_round=1000,
            nfold=4,
            seed=seed,
            stratified=False,
            early_stopping_rounds=25,
            verbose_eval=10,
            show_stdv=True,
        )

        best_nrounds = cv_history.shape[0] - 1
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
        booster = xgb.train(xgb_params, train_matrix, best_nrounds)
        y_predicted = booster.predict(test_matrix)

        plt.figure(figsize=(10, 5))
        plt.scatter(y_test_split, y_predicted, s=20)
        rmse_value = self.rmse(y_predicted, y_test_split)
        plt.title(title_name + ', Predicted vs. Actual.' + ' rmse = ' + str(rmse_value))
        plt.xlabel('Actual Sale Price')
        plt.ylabel('Predicted Sale Price')

        # Diagonal on which a perfect prediction would fall.
        price_low, price_high = min(y_test_split), max(y_test_split)
        plt.plot([price_low, price_high], [price_low, price_high])
        plt.tight_layout()

项目：kdd2017 作者：JinpengLI | 项目源码 | 文件源码

def fit(self, X, y):
        """Fit the underlying LightGBM model on ``X`` and ``y``.

        With ``self.use_mspe`` set, a booster is trained for 10 rounds through
        ``lgb.train`` using the custom ``mspe`` objective and the
        ``evalerror_lgbm`` metric, and stored in ``self.gbm``. Otherwise the
        existing ``self.gbm`` estimator is fitted with early stopping.

        In both branches the validation data is the training data itself.

        Args:
            X: feature matrix; ``X.shape[0]`` is used as the sample count.
            y: targets.
        """
        if self.use_mspe:
            n_rows = X.shape[0]
            # Every sample gets unit weight.
            lgb_train = lgb.Dataset(X, y,
                        weight=np.ones(n_rows),
                        free_raw_data=False)
            lgb_test = lgb.Dataset(X, y, reference=lgb_train,
                        weight=np.ones(n_rows),
                        free_raw_data=False)
            self.gbm = lgb.train(
                self.kwargs,
                lgb_train,
                num_boost_round=10,
                fobj=mspe,
                feval=evalerror_lgbm,
                valid_sets=lgb_test)
        else:
            # A train_test_split(X, y, test_size=0.3) used to run here, but
            # its result was never used; the dead split has been removed and
            # the model is still fitted on the full data.
            # NOTE(review): early stopping is monitored on the training data
            # itself - confirm whether a hold-out set was intended.
            self.gbm.fit(X, y, early_stopping_rounds=10, eval_set=[(X, y)], verbose=False)
            #print "gbm best_iteration=", self.gbm.best_iteration

项目：DSI-personal-reference-kit 作者：teb311 | 项目源码 | 文件源码

def validate_formula(formula, training_data, column_being_predicted, cross_val_n=3, validation_size=.10):
    '''
        Fit an OLS model from a StatsModels formula on repeated random splits
        of the training data and score it on each held-out part.

        formula: formula string in the StatsModels.formula.api style.
        training_data: data passed to the split and to ``smf.ols``; it must
            contain ``column_being_predicted``.
        column_being_predicted: name of the column holding the true values
            used for scoring.
        cross_val_n: number of random splits to evaluate; must be at least 1.
        validation_size: share of rows held out in each split.

        returns: (trained_model, cross_scores), where trained_model is the
            model fitted in the last round.
        raises: ValueError if cross_val_n is smaller than 1.
    '''
    if cross_val_n < 1:
        # Without a single round there is no model to return.
        raise ValueError('cross_val_n must be at least 1, got %r' % (cross_val_n,))

    cross_val_scores = []
    # range() works on Python 2 and 3; xrange() exists only on Python 2.
    for _ in range(cross_val_n):
        fit_rows, held_rows = train_test_split(
            training_data,
            test_size=validation_size
        )

        model = smf.ols(formula=formula, data=fit_rows).fit()
        held_targets = held_rows[column_being_predicted]
        score = root_mean_log_squared_error(model, held_rows, held_targets)
        cross_val_scores.append(score)

    return (model, cross_val_scores)