Python sklearn.model_selection module: train_test_split() example source code

We have extracted the following 50 code examples from open-source Python projects to show how to use sklearn.model_selection.train_test_split().

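For quick reference, the basic call shuffles the rows and returns matching train/test partitions of every array it is given. A minimal sketch (not one of the 50 project examples below; the toy X and y are made up for illustration):

import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(20).reshape(10, 2)   # toy feature matrix
y = np.array([0, 1] * 5)           # toy binary labels

# 80/20 split; random_state fixes the shuffle for reproducibility,
# stratify keeps the label distribution the same in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)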
Project: triage    Author: dssg    | project source | file source
def trained_models():
    dataset = datasets.load_breast_cancer()
    X = dataset.data
    y = dataset.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=12345)

    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)

    lr = LogisticRegression()
    lr.fit(X_train, y_train)

    svc_w_linear_kernel = SVC(kernel='linear')
    svc_w_linear_kernel.fit(X_train, y_train)

    svc_wo_linear_kernel = SVC()
    svc_wo_linear_kernel.fit(X_train, y_train)

    dummy = DummyClassifier()
    dummy.fit(X_train, y_train)

    return {'RF':rf, 'LR':lr, 'SVC_w_linear_kernel':svc_w_linear_kernel,
            'Dummy':dummy, 'SVC_wo_linear_kernel':svc_wo_linear_kernel}
Project: texta    Author: texta-tk    | project source | file source
def train_model_with_cv(model, params, X, y):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    # Use the training data for parameter selection in a grid search
    gs_clf = GridSearchCV(model, params, n_jobs=1, cv=5)
    gs_clf = gs_clf.fit(X_train, y_train)
    model = gs_clf.best_estimator_

    # Use best model and test data for final evaluation
    y_pred = model.predict(X_test)

    _f1 = f1_score(y_test, y_pred, average='micro')
    _confusion = confusion_matrix(y_test, y_pred)
    _precision = precision_score(y_test, y_pred)
    _recall = recall_score(y_test, y_pred)
    _statistics = {'f1_score': _f1,
                   'confusion_matrix': _confusion,
                   'precision': _precision,
                   'recall': _recall
                   }

    return model, _statistics
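Note that the snippet above passes average='micro' to f1_score but not to precision_score or recall_score; with more than two classes those two calls fail, because scikit-learn defaults to average='binary'. A hedged adjustment for the multiclass case, reusing y_test and y_pred from the function above:

    _precision = precision_score(y_test, y_pred, average='micro')
    _recall = recall_score(y_test, y_pred, average='micro')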
Project: PortfolioTimeSeriesAnalysis    Author: MizioAnd    | project source | file source
def outlier_identification(self, model, x_train, y_train):
        # Split the training data into an extra set of test
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
        print('\nOutlier shapes')
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
        model.fit(x_train_split, y_train_split)
        y_predicted = model.predict(x_test_split)
        residuals = np.absolute(y_predicted - y_test_split)
        rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
        outliers_mask = residuals >= rmse_pred_vs_actual
        outliers_mask = np.concatenate([np.zeros((np.shape(y_train_split)[0],), dtype=bool), outliers_mask])
        not_an_outlier = outliers_mask == 0
        # Resample the training set from split, since the set was randomly split
        x_out = np.insert(x_train_split, np.shape(x_train_split)[0], x_test_split, axis=0)
        y_out = np.insert(y_train_split, np.shape(y_train_split)[0], y_test_split, axis=0)
        return x_out[not_an_outlier, ], y_out[not_an_outlier, ]
Project: PortfolioTimeSeriesAnalysis    Author: MizioAnd    | project source | file source
def predicted_vs_actual_y_xgb(self, xgb, best_nrounds, xgb_params, x_train_split, x_test_split, y_train_split,
                                  y_test_split, title_name):
        # Split the training data into an extra set of test
        # x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
        dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
        dtest_split = xgb.DMatrix(x_test_split)
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
        gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
        y_predicted = gbdt.predict(dtest_split)
        plt.figure(figsize=(10, 5))
        plt.scatter(y_test_split, y_predicted, s=20)
        rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
        plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
        plt.xlabel('Actual y')
        plt.ylabel('Predicted y')
        plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
        plt.tight_layout()
Project: auto_ml    Author: doordash    | project source | file source
def test_calibrate_final_model_classification():
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    # Take a third of our test data (a tenth of our overall data) for calibration
    df_titanic_test, df_titanic_calibration = train_test_split(df_titanic_test, test_size=0.33, random_state=42)

    column_descriptions = {
        'survived': 'output'
        , 'embarked': 'categorical'
        , 'pclass': 'categorical'
    }


    ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions)

    ml_predictor.train(df_titanic_train, calibrate_final_model=True, X_test=df_titanic_calibration, y_test=df_titanic_calibration.survived)

    test_score = ml_predictor.score(df_titanic_test, df_titanic_test.survived)

    print('test_score')
    print(test_score)

    assert -0.215 < test_score < -0.17
Project: auto_ml    Author: doordash    | project source | file source
def get_titanic_binary_classification_dataset(basic=True):
    try:
        df_titanic = pd.read_csv(os.path.join('tests', 'titanic.csv'))
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
        df_titanic = pd.read_csv(dataset_url)
        # Do not write the index that pandas automatically creates
        df_titanic.to_csv(os.path.join('tests', 'titanic.csv'), index=False)

    df_titanic = df_titanic.drop(['boat', 'body'], axis=1)

    if basic == True:
        df_titanic = df_titanic.drop(['name', 'ticket', 'cabin', 'home.dest'], axis=1)

    df_titanic_train, df_titanic_test = train_test_split(df_titanic, test_size=0.33, random_state=42)
    return df_titanic_train, df_titanic_test
Project: auto_ml    Author: doordash    | project source | file source
def get_twitter_sentiment_multilabel_classification_dataset():

    file_name = os.path.join('tests', 'twitter_sentiment.csv')

    try:
        df_twitter = pd.read_csv(open(file_name,'rU'), encoding='utf-8', engine='python')
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        df_twitter = pd.read_csv(dataset_url)
        # Do not write the index that pandas automatically creates

        df_twitter.to_csv(file_name, index=False)

    # Grab only 10% of the dataset- runs much faster this way
    df_twitter = df_twitter.sample(frac=0.1)

    df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)

    df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
    return df_twitter_train, df_twitter_test
Project: stacker    Author: bamine    | project source | file source
def __init__(self, name, X, y, task, test_size=None, cv=None, random_state=42):
        self.name = name
        self.X = X
        self.y = y
        self.task = task
        self.random_state = random_state
        if test_size is not None:
            self.test_size = test_size
            self.validation_method = "train_test_split"
            self.X_train, self.X_test, self.y_train, self.y_test = \
                model_selection.train_test_split(self.X, self.y, test_size=test_size, random_state=random_state)
        elif cv is not None:
            self.validation_method = "cv"
            if task == "regression":
                self.kfold = model_selection.KFold(n_splits=cv, random_state=random_state)
            elif task == "classification":
                self.kfold = model_selection.StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
Project: tianchi_power    Author: lvniqi    | project source | file source
def crate_pre_train_model(x_,y_):
    (x_train,x_test) = train_test_split(x_,test_size=0.1,random_state=1)
    (y_train,y_test) = train_test_split(y_,test_size=0.1,random_state=1)
    dtrain = xgb.DMatrix( x_train, label=y_train)
    dtest = xgb.DMatrix( x_test, label=y_test)
    evallist  = [(dtrain,'train'),(dtest,'eval')]
    param = {'objective':'reg:linear','max_depth':3 }
    param['nthread'] = 64
    #param['min_child_weight'] = 15
    #param['subsample'] = 1
    #param['num_class'] = 7
    plst = param.items()
    num_round = 5000
    bst = xgb.train( plst, dtrain, num_round,
                    evallist,early_stopping_rounds=100,
                    #obj=logregobj,
                    feval=evalerror
                    )
    return bst

# %% main
Project: software-suite-movie-market-analysis    Author: 93lorenzo    | project source | file source
def readData():
    vector = []
    labels = []
    indice = 0
    for elem in gson:
        try:
            actors = gson.get(elem).get("actors")
            directors = gson.get(elem).get("director")
            writers = gson.get(elem).get("writer")
            imdbRating = int(float(gson.get(elem).get("imdbRating")))
            mediaAct, mediaDir, mediaWri = calcolaMedie(actors, directors, writers)
            vect = [1,mediaAct, mediaDir, mediaWri]
            vector.append(vect)
            labels.append(int(imdbRating))  ## cast to discrete classes ##
        except Exception:
            continue
    data = np.array(vector)
    labels = np.array(labels)
    train_data, test_data, train_labels, test_labels = train_test_split(data, labels, train_size=0.4)
    return train_data, train_labels, test_data, test_labels
Project: software-suite-movie-market-analysis    Author: 93lorenzo    | project source | file source
def readData(self):
        vector = []
        labels = []
        indice = 0
        for elem in gson:
            actors = gson.get(elem).get("actors")
            directors = gson.get(elem).get("director")
            writers = gson.get(elem).get("writer")
            imdbRating = int(float(gson.get(elem).get("imdbRating")))
            mediaAct, mediaDir, mediaWri = self.calcolaMedie(actors, directors, writers)
            vect = [1,mediaAct, mediaDir, mediaWri]
            vector.append(vect)
            labels.append(int(imdbRating))  ## cast to discrete classes ##
        data = np.array(vector)
        labels = np.array(labels)
        train_data,test_data,train_labels,test_labels = train_test_split(data,labels, train_size= 0.1)
        return train_data, train_labels,test_data,test_labels
Project: UrbanSearch    Author: urbansearchTUD    | project source | file source
def metrics_equal():
    dataset_path = dpu.generate_equal_dataset()
    dataset = dpu.load(dataset_path)
    mm = SGDCModelManager()

    mm.x_train, mm.x_test, mm.y_train, mm.y_test = train_test_split(dataset['inputs'], dataset['outputs'], random_state=42)
    mm.train()
    predicts = mm.predict(mm.x_test)

    report = classification_report(mm.y_test, predicts)

    return jsonify(status=200, message=report)
Project: UrbanSearch    Author: urbansearchTUD    | project source | file source
def probabilities_equal():
    dataset_path = dpu.generate_equal_dataset()
    dataset = dpu.load(dataset_path)
    mm = SGDCModelManager()

    mm.x_train, mm.x_test, mm.y_train, mm.y_test = train_test_split(dataset['inputs'], dataset['outputs'], random_state=42)
    mm.train()
    probabilities = mm.probabilities(mm.x_test)

    result = []
    for i in range(len(mm.y_test)):
        result.append({
            'probabilities': list(probabilities[i]),
            'category': mm.y_test[i]
        })

    return jsonify(status=200, result=result)
Project: pyVSR    Author: georgesterpu    | project source | file source
def _preload_files_single_volunteer(dataset_dir, speaker_id, view_id, utterance_types):

    all_videos = path.join(_current_path, 'splits/allVideos.txt')

    u_list = _gen_utterance_list(utterance_types)

    with open(all_videos, 'r') as f:
        contents = f.read().splitlines()

    video_list = [path.join(dataset_dir, line)
                  for line in contents
                  if 's' + str(speaker_id) + '_' in line
                  if 'v' + str(view_id) in line
                  if any(u in line for u in u_list)]

    from sklearn.model_selection import train_test_split
    train, test = train_test_split(video_list, test_size=0.30, random_state=0)

    return train, test
Project: Controller-Hand    Author: ardamavi    | project source | file source
def get_dataset(dataset_path='Data/Train_Data'):
    # Getting all data from data path:
    try:
        X = np.load('Data/npy_train_data/X.npy')
        Y = np.load('Data/npy_train_data/Y.npy')
    except:
        labels = listdir(dataset_path) # Getting labels
        X = []
        Y = []
        for label in labels:
            datas_path = dataset_path+'/'+label
            for data in listdir(datas_path):
                img = get_img(datas_path+'/'+data)
                X.append(img)
                Y.append(int(label))
        # Create dataset:
        X = np.array(X).astype('float32')/255.
        Y = np.array(Y).astype('float32')
        Y = to_categorical(Y, 2)
        if not os.path.exists('Data/npy_train_data/'):
            os.makedirs('Data/npy_train_data/')
        np.save('Data/npy_train_data/X.npy', X)
        np.save('Data/npy_train_data/Y.npy', Y)
    X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)
    return X, X_test, Y, Y_test
Project: nelpy    Author: nelpy    | project source | file source
def _preprocess_PBEs(self, PBE_idx=None):
        """used for most types of shuffles"""
        # compute PBEs
        self.PBEs = self._st.bin(ds=self._ds)

        if self.PBEs.n_epochs == 1:
            raise ValueError("spike train is continuous, and does not have more than one event!")

        if PBE_idx is not None:
            self._trainidx, self._testidx = PBE_idx # tuple unpacking
        else:
            # split into train and test data
            if self._random_state is not None:
                self._trainidx, self._testidx = train_test_split(np.arange(self.PBEs.n_epochs), test_size=self._test_size, random_state=self._random_state)
            else:
                self._trainidx, self._testidx = train_test_split(np.arange(self.PBEs.n_epochs), test_size=self._test_size, random_state=1)

        self._trainidx.sort()
        self._testidx.sort()

        self.PBEs_train = self.PBEs[self._trainidx]
        self.PBEs_test = self.PBEs[self._testidx]
Project: gcForest    Author: kingfengji    | project source | file source
def load_data():
    id2label = {}
    label2id = {}
    label_path = osp.abspath( osp.join(get_dataset_base(), "uci_yeast", "yeast.label") )
    with open(label_path) as f:
        for row in f:
            cols = row.strip().split(" ")
            id2label[int(cols[0])] = cols[1]
            label2id[cols[1]] = int(cols[0])

    data_path = osp.abspath( osp.join(get_dataset_base(), "uci_yeast", "yeast.data") )
    with open(data_path) as f:
        rows = f.readlines()
    n_datas = len(rows)
    X = np.zeros((n_datas, 8), dtype=np.float32)
    y = np.zeros(n_datas, dtype=np.int32)
    for i, row in enumerate(rows):
        cols = re.split(" +", row.strip())
        #print(list(map(float, cols[1:1+8])))
        X[i,:] = list(map(float, cols[1:1+8]))
        y[i] = label2id[cols[-1]]
    train_idx, test_idx = train_test_split(range(n_datas), random_state=0, train_size=0.7, stratify=y)
    return (X[train_idx], y[train_idx]), (X[test_idx], y[test_idx])
Project: models    Author: bureaucratic-labs    | project source | file source
def get_train_data(corpus, count=None, **kwargs):
    X = []
    y = []

    documents = corpus.iter_documents()

    if count:
        documents = islice(documents, count)

    for document in tqdm(documents):
        try:
            text = document.raw()
            sents = document.raw_sents()

            labels = text2labels(text, sents)
            features = sent2features(text)

            X.append(features)
            y.append(labels)
        except Exception as exc:
            # TODO:
            pass

    return train_test_split(X, y, **kwargs)
Project: models    Author: bureaucratic-labs    | project source | file source
def get_pos_train_data(corpus, count=None, **kwargs):
    X = []
    y = []

    documents = corpus.iter_documents()
    if count:
        documents = islice(documents, count)

    for document in tqdm(documents):
        sents = document.iter_tagged_sents()
        for sent in sents:
            tokens = []
            labels = []
            for token, tags in sent:
                tags = tags.split(',')
                tokens.append(token)
                labels.append(tags[0])  # TODO:
            X.append(sent2posfeatures(tokens))
            y.append(labels)

    return train_test_split(X, y, **kwargs)
Project: models    Author: bureaucratic-labs    | project source | file source
def get_train_data(corpus, count=None, **kwargs):
    X = []
    y = []

    documents = corpus.iter_documents()
    if count:
        documents = islice(documents, count)

    for document in tqdm(documents):
        try:
            text = document.raw()
            words = document.words()

            labels = text2labels(text, words)
            features = list(text2features(text))

            X.append(features)
            y.append(labels)
        except Exception as exc:
            # TODO:
            continue

    return train_test_split(X, y, **kwargs)
Project: main    Author: rmkemker    | project source | file source
def train_test_split_per_class(X, y, train_size=None, test_size=None):

    sh = np.array(X.shape)

    num_classes = len(np.bincount(y))

    sh[0] = 0
    X_train_arr =  np.zeros(sh, dtype=X.dtype)
    X_test_arr = np.zeros(sh, dtype=X.dtype)
    y_train_arr = np.zeros((0), dtype=y.dtype)
    y_test_arr = np.zeros((0), dtype=y.dtype)

    for i in range(num_classes):
        X_train, X_test, y_train, y_test = train_test_split(X[y==i], y[y==i],
                                                            train_size=train_size,
                                                            test_size=test_size)

        X_train_arr =  np.append(X_train_arr, X_train, axis=0)
        X_test_arr = np.append(X_test_arr, X_test, axis=0)
        y_train_arr = np.append(y_train_arr, y_train)
        y_test_arr = np.append(y_test_arr, y_test)

    return X_train_arr, X_test_arr, y_train_arr, y_test_arr
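The helper above splits each class separately and concatenates the pieces. When the goal is simply to keep class proportions the same in the train and test sets, the stratify argument of train_test_split gives the same effect in one call; a minimal sketch (not from the project above, with made-up X and y):

import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(40).reshape(20, 2)
y = np.array([0] * 10 + [1] * 10)

# Stratified split: the class frequencies of y are preserved in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=0)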
Project: HousePrices    Author: MizioAnd    | project source | file source
def outlier_identification(self, model, x_train, y_train):
        # Split the training data into an extra set of test
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
        print('\nOutlier shapes')
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
        model.fit(x_train_split, y_train_split)
        y_predicted = model.predict(x_test_split)
        residuals = np.absolute(y_predicted - y_test_split)
        rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
        outliers_mask = residuals >= rmse_pred_vs_actual
        # outliers_mask = np.insert(np.zeros((np.shape(y_train_split)[0],), dtype=np.int), np.shape(y_train_split)[0],
        #                           outliers_mask)
        outliers_mask = np.concatenate([np.zeros((np.shape(y_train_split)[0],), dtype=bool), outliers_mask])
        not_an_outlier = outliers_mask == 0
        # Resample the training set from split, since the set was randomly split
        x_out = np.insert(x_train_split, np.shape(x_train_split)[0], x_test_split, axis=0)
        y_out = np.insert(y_train_split, np.shape(y_train_split)[0], y_test_split, axis=0)
        return x_out[not_an_outlier, ], y_out[not_an_outlier, ]
Project: HousePrices    Author: MizioAnd    | project source | file source
def predicted_vs_actual_sale_price(self, x_train, y_train, title_name):
        # Split the training data into an extra set of test
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
        lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
                                0.3, 0.6, 1],
                        max_iter=50000, cv=10)
        # lasso = RidgeCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
        #                         0.3, 0.6, 1], cv=10)

        lasso.fit(x_train_split, y_train_split)
        y_predicted = lasso.predict(X=x_test_split)
        plt.figure(figsize=(10, 5))
        plt.scatter(y_test_split, y_predicted, s=20)
        rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
        plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
        plt.xlabel('Actual Sale Price')
        plt.ylabel('Predicted Sale Price')
        plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
        plt.tight_layout()
Project: HousePrices    Author: MizioAnd    | project source | file source
def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
        # Split the training data into an extra set of test
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
        dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
        dtest_split = xgb.DMatrix(x_test_split)

        res = xgb.cv(xgb_params, dtrain_split, num_boost_round=1000, nfold=4, seed=seed, stratified=False,
                     early_stopping_rounds=25, verbose_eval=10, show_stdv=True)

        best_nrounds = res.shape[0] - 1
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
        gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
        y_predicted = gbdt.predict(dtest_split)
        plt.figure(figsize=(10, 5))
        plt.scatter(y_test_split, y_predicted, s=20)
        rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
        plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
        plt.xlabel('Actual Sale Price')
        plt.ylabel('Predicted Sale Price')
        plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
        plt.tight_layout()
Project: kdd2017    Author: JinpengLI    | project source | file source
def fit(self, X, y):
        if self.use_mspe:
            lgb_train = lgb.Dataset(X, y,
                        weight=np.ones(X.shape[0]), 
                        free_raw_data=False)
            lgb_test = lgb.Dataset(X, y, reference=lgb_train,
                        weight=np.ones(X.shape[0]), 
                        free_raw_data=False)
            self.gbm = lgb.train(
                self.kwargs,
                lgb_train,
                num_boost_round=10,
                fobj=mspe,
                feval=evalerror_lgbm,
                valid_sets=lgb_test)
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.3)
            #lgb_test = lgb.Dataset(X, y, reference=lgb_train,
            #            weight=np.ones(X.shape[0]), 
            #            free_raw_data=False) 
            self.gbm.fit(X, y, early_stopping_rounds=10, eval_set=[(X, y)], verbose=False)
            #print "gbm best_iteration=", self.gbm.best_iteration
Project: DSI-personal-reference-kit    Author: teb311    | project source | file source
def validate_formula(formula, training_data, column_being_predicted, cross_val_n=3, validation_size=.10):
    '''
        Accept a formula in the StatsModels.formula.api style, some training data and
        some test values that must match the value being predicted by the formula.

        returns: (model, cross_val_scores)
    '''
    cross_val_scores = []
    for _ in xrange(cross_val_n):
        X_train, X_test, _, _ = train_test_split(
            training_data,
            training_data[column_being_predicted],
            test_size=validation_size
        )

        model = smf.ols(formula=formula, data=X_train).fit()
        test_values = X_test[column_being_predicted]
        score = root_mean_log_squared_error(model, X_test, test_values)
        cross_val_scores.append(score)

    return (model, cross_val_scores)
Project: auto_ml    Author: ClimbsRocks    | project source | file source
def get_titanic_binary_classification_dataset(basic=True):
    try:
        df_titanic = pd.read_csv(os.path.join('tests', 'titanic.csv'))
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
        df_titanic = pd.read_csv(dataset_url)
        # Do not write the index that pandas automatically creates
        df_titanic.to_csv(os.path.join('tests', 'titanic.csv'), index=False)

    df_titanic = df_titanic.drop(['boat', 'body'], axis=1)

    if basic == True:
        df_titanic = df_titanic.drop(['name', 'ticket', 'cabin', 'home.dest'], axis=1)

    df_titanic_train, df_titanic_test = train_test_split(df_titanic, test_size=0.33, random_state=42)
    return df_titanic_train, df_titanic_test
Project: auto_ml    Author: ClimbsRocks    | project source | file source
def test_predict_uncertainty_returns_dict_for_one_value():
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {
        'MEDV': 'output'
        , 'CHAS': 'categorical'
    }

    df_boston_train, uncertainty_data = train_test_split(df_boston_train, test_size=0.5)

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, perform_feature_selection=True, train_uncertainty_model=True, uncertainty_data=uncertainty_data)

    test_list = df_boston_test.to_dict('records')

    for item in test_list:
        prediction = ml_predictor.predict_uncertainty(item)
        assert isinstance(prediction, dict)
Project: auto_ml    Author: ClimbsRocks    | project source | file source
def test_score_uncertainty():
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {
        'MEDV': 'output'
        , 'CHAS': 'categorical'
    }

    df_boston_train, uncertainty_data = train_test_split(df_boston_train, test_size=0.5)

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    ml_predictor.train(df_boston_train, perform_feature_selection=True, train_uncertainty_model=True, uncertainty_data=uncertainty_data)

    uncertainty_score = ml_predictor.score_uncertainty(df_boston_test, df_boston_test.MEDV)

    print('uncertainty_score')
    print(uncertainty_score)

    assert uncertainty_score > -0.2
Project: auto_ml    Author: ClimbsRocks    | project source | file source
def get_titanic_binary_classification_dataset(basic=True):
    try:
        df_titanic = pd.read_csv(os.path.join('tests', 'titanic.csv'))
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
        df_titanic = pd.read_csv(dataset_url)
        # Do not write the index that pandas automatically creates
        df_titanic.to_csv(os.path.join('tests', 'titanic.csv'), index=False)

    df_titanic = df_titanic.drop(['boat', 'body'], axis=1)

    if basic == True:
        df_titanic = df_titanic.drop(['name', 'ticket', 'cabin', 'home.dest'], axis=1)

    df_titanic_train, df_titanic_test = train_test_split(df_titanic, test_size=0.33, random_state=42)
    return df_titanic_train, df_titanic_test
Project: auto_ml    Author: ClimbsRocks    | project source | file source
def get_twitter_sentiment_multilabel_classification_dataset():

    file_name = os.path.join('tests', 'twitter_sentiment.csv')

    try:
        df_twitter = pd.read_csv(open(file_name,'rU'), encoding='latin-1', engine='python')
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        df_twitter = pd.read_csv(dataset_url, encoding='latin-1')
        # Do not write the index that pandas automatically creates

        df_twitter.to_csv(file_name, index=False, encoding='latin-1')

    # Grab only 10% of the dataset- runs much faster this way
    df_twitter = df_twitter.sample(frac=0.1)

    df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)

    df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
    return df_twitter_train, df_twitter_test
Project: auto_ml    Author: ClimbsRocks    | project source | file source
def get_titanic_binary_classification_dataset(basic=True):

    dir_name = os.path.abspath(os.path.dirname(__file__))
    file_name = os.path.join(dir_name, 'titanic.csv')
    print('file_name')
    print(file_name)
    print('dir_name')
    print(dir_name)
    try:
        df_titanic = pd.read_csv(file_name)
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
        df_titanic = pd.read_csv(dataset_url)
        # Do not write the index that pandas automatically creates
        df_titanic.to_csv(file_name, index=False)

    df_titanic = df_titanic.drop(['boat', 'body'], axis=1)

    if basic == True:
        df_titanic = df_titanic.drop(['name', 'ticket', 'cabin', 'home.dest'], axis=1)

    df_titanic_train, df_titanic_test = train_test_split(df_titanic, test_size=0.33, random_state=42)
    return df_titanic_train, df_titanic_test
Project: auto_ml    Author: ClimbsRocks    | project source | file source
def get_twitter_sentiment_multilabel_classification_dataset():

    file_name = os.path.join('tests', 'twitter_sentiment.h5')

    try:
        df_twitter = pd.read_hdf(file_name)
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        df_twitter = pd.read_csv(dataset_url, encoding='latin-1')
        # Do not write the index that pandas automatically creates

        df_twitter.to_hdf(file_name, key='df', format='fixed')

    # Grab only 10% of the dataset- runs much faster this way
    df_twitter = df_twitter.sample(frac=0.1)

    df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)

    df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
    return df_twitter_train, df_twitter_test
Project: Machine-Learning-Tools-on-Iris-Dataset    Author: debjitpaul    | project source | file source
def get_data(iris):
# Only petal length and petal width considered
    X = iris.data[:, [2, 3]]
    y = iris.target

# Place the iris data into a pandas dataframe
    iris_df = pd.DataFrame(iris.data[:, [2, 3]], columns=iris.feature_names[2:])

# View the data
    print(iris_df.head())

# Print the classes of the dataset
    print('\n' + 'The classes in this data are ' + str(np.unique(y)))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)

    print('Training set are {} samples  and Test set are {} samples'.format(
    X_train.shape[0], X_test.shape[0]))
    print()
    return(X_train, X_test, y_train, y_test,iris_df, X,y)
##scale the training data before training
Project: Machine-Learning-Tools-on-Iris-Dataset    Author: debjitpaul    | project source | file source
def get_data(iris):
# Only petal length and petal width considered
    X = iris.data[:, [2, 3]]
    y = iris.target

# Place the iris data into a pandas dataframe
    iris_df = pd.DataFrame(iris.data[:, [2, 3]], columns=iris.feature_names[2:])

# View the data
    print(iris_df.head())

# Print the classes of the dataset
    print('\n' + 'The classes in this data are ' + str(np.unique(y)))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)

    print('Training set are {} samples  and Test set are {} samples'.format(
    X_train.shape[0], X_test.shape[0]))
    print()
    return(X_train, X_test, y_train, y_test,iris_df, X,y)
#scale training data before training
Project: Machine-Learning-Tools-on-Iris-Dataset    Author: debjitpaul    | project source | file source
def get_data(iris):
# Only petal length and petal width considered
    X = iris.data[:, [2, 3]]
    y = iris.target

# Place the iris data into a pandas dataframe
    iris_df = pd.DataFrame(iris.data[:, [2, 3]], columns=iris.feature_names[2:])

# View the data
    print(iris_df.head())

# Print the classes of the dataset
    print('\n' + 'The classes in this data are ' + str(np.unique(y)))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)

    print('Training set are {} samples  and Test set are {} samples'.format(
    X_train.shape[0], X_test.shape[0]))
    print()
    return(X_train, X_test, y_train, y_test,iris_df, X,y)
##scale the training data before training
项目:Machine-Learning-Tools-on-Iris-Dataset    作者:debjitpaul    | 项目源码 | 文件源码
def get_data(iris):
# Only petal length and petal width considered
    X = iris.data[:, [2, 3]]
    y = iris.target

# Place the iris data into a pandas dataframe
    iris_df = pd.DataFrame(iris.data[:, [2, 3]], columns=iris.feature_names[2:])

# View the data
    print(iris_df.head())

# Print the classes of the dataset
    print('\n' + 'The classes in this data are ' + str(np.unique(y)))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)

    print('Training set are {} samples  and Test set are {} samples'.format(
    X_train.shape[0], X_test.shape[0]))
    print()
    return(X_train, X_test, y_train, y_test,iris_df, X,y)
##scale data before training it
Project: Using-machine-learning-to-detect-malicious-URLs    Author: faizann24    | project source | file source
def TL():
    allurls = './data/data.csv' #path to our all urls file
    allurlscsv = pd.read_csv(allurls,',',error_bad_lines=False) #reading file
    allurlsdata = pd.DataFrame(allurlscsv)  #converting to a dataframe

    allurlsdata = np.array(allurlsdata) #converting it into an array
    random.shuffle(allurlsdata) #shuffling

    y = [d[1] for d in allurlsdata] #all labels 
    corpus = [d[0] for d in allurlsdata]    #all urls corresponding to a label (either good or bad)
    vectorizer = TfidfVectorizer(tokenizer=getTokens)   #get a vector for each url but use our customized tokenizer
    X = vectorizer.fit_transform(corpus)    #get the X vector

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)   #split into training and testing set 80/20 ratio

    lgs = LogisticRegression()  #using logistic regression
    lgs.fit(X_train, y_train)
    print(lgs.score(X_test, y_test))    #print the score. It comes out to be 98%
    return vectorizer, lgs
Project: NLPWorks    Author: thautwarm    | project source | file source
def de_lda(X,y):
    """ lda """
    dim = X.shape[1]
    de  = min(2000,dim)
    clf = LDA(n_components = de)
    _,x_mini,_,y_mini = train_test_split(X,y,test_size = 0.33)
    clf.fit(x_mini,y_mini)
    def _func(X1,X2):
        return clf.transform(X1), clf.transform(X2)
    return _func

# def de_ps(X,y):
#     """ pearsonr method """
#     dim = X.shape[1]
#     de = min(2000,dim)
#     clf = SelectKBest(Pearsonr , k=de)
#     clf.fit(X,y)
#     def _func(X1,X2):
#         return clf.transform(X1),clf.transform(X2)
#     return _func
Project: sentiment_comments_zh    Author: zhouhoo    | project source | file source
def prepare_train_data(self):
        texts,labels = load_corpus()
        volcabulary, train_words = get_volcabulary_and_list_words(texts)

        self.set_volcabulary(volcabulary)

        del volcabulary,texts
        words_index = self.get_word_index(train_words, self.volcabulary, self.max_words, self.max_length)

        # del reviews_words, volcabulary

        index = np.arange(words_index.shape[0])
        train_index, valid_index = train_test_split(
            index, train_size=0.8, random_state=520)
        train_data = words_index[train_index]
        valid_data = words_index[valid_index]
        labels = np.asarray(labels)
        train_labels = labels[train_index]
        valid_labels = labels[valid_index]
        print(train_data.shape)
        print(valid_data.shape)

        pickle.dump((words_index, labels), open("output/zh_comments.pkl", 'wb'))

        return train_data, train_labels, valid_data, valid_labels
Project: TPs    Author: DataMiningP7    | project source | file source
def get_train_test_sets(X, y):
    """ Split X and y into a train and a test sets.

    Args:
        X: the TF-IDF matrix where each line represents a document and each
           column represents a word, typically obtained by running
           transform_text() from the TP2.
        y: a binary vector where the i-th value indicates whether the i-th is a
           spam or a ham.
    Returns:
        X_train: train subset of X
        X_test: test subset of X
        y_train: train subset of y
        y_test: test subset of y
    """
    return train_test_split(X, y)

# Ex4.2, 4.3, 4.4
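Note: called with neither test_size nor train_size, as in get_train_test_sets above, train_test_split falls back to its default and holds out 25% of the samples for the test set (current scikit-learn behavior).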
Project: House-Pricing    Author: playing-kaggle    | project source | file source
def GDBT_regression(X=train_df_munged,Y=label_df['SalePrice']):
    est = GradientBoostingRegressor(n_estimators=50,max_depth=3,learning_rate=0.1)
    X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=0)
    est.fit(X_train,Y_train)
    y_train_pred = est.predict(X_test)
    plt.scatter(y_train_pred,y_train_pred - Y_test,c = 'blue',marker='s', label='error on training data')

    plt.title("Linear regression with  GDBT")
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()
    # Plot predictions
    plt.scatter(Y_test, y_train_pred, c="blue", marker="s", label="Training data")

    plt.title("Linear regression with  GDBT")
    plt.xlabel("Predicted values")
    plt.ylabel("Real values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()
    print('rmse value:',rmse(Y_test,y_train_pred))

    return est
Project: StockRecommendSystem    Author: doncat99    | project source | file source
def best_window(self, X_train, y_train, w_min, w_max, t_min,t_max,f_min,f_max):
        w_opt = 0
        t_opt = 0
        f_opt = 0
        accur_opt = 0.

        x_w = []
        y_accu= []

        # range of window : w_min --> w_max     
        for w in range(w_min,w_max+1):
            #X,y = preprocess_data(w)
            #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
            t, f, accur = self.best_forrest(X_train,y_train,10,t_min,t_max,f_min,f_max)
            print('Window = '+str(w)+' days --> Best Forrest : number of trees : ' + str(t) + ', maximum of features : ' + str(f) + ', with accuracy :' + str(accur))

            if (accur > accur_opt) : w_opt, t_opt, f_opt, accur_opt = w, t, f, accur
            x_w.append(w), y_accu.append(accur)

        print('Best window : w = '+str(w_opt)+'. Best Forrest : number of trees : ' + str(t_opt) + ', maximum of features : ' + str(f_opt) + ', with accuracy :' + str(accur_opt))
        return w_opt, t_opt, f_opt
Project: StockRecommendSystem    Author: doncat99    | project source | file source
def prepare_train_test_data(self, data_feature, LabelColumnName):
        firstloop = 1
        for ticker, data in data_feature.items():
            X, y = preprocessing_data(self.paras, data[0], LabelColumnName, one_hot_label_proc=False)
            X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X, y, test_size=0.3)
            # print('Train shape X:', X_train_temp.shape, ',y:', y_train_temp.shape)
            # print('Test shape X:', X_test_temp.shape, ',y:', y_test_temp.shape)

            if firstloop == 1:
                firstloop = 0
                X_train = X_train_temp
                X_test = X_test_temp
                y_train = y_train_temp
                y_test = y_test_temp
            else:
                X_train = np.append(X_train, X_train_temp, 0)
                X_test = np.append(X_test, X_test_temp, 0)
                y_train = np.append(y_train, y_train_temp, 0)
                y_test = np.append(y_test, y_test_temp, 0)

        #print('Train shape X:', X_train.shape, ',y:', y_train.shape)
        #print('Test shape X:', X_test.shape, ',y:', y_test.shape)
        return X_train, y_train, X_test, y_test
Project: StockRecommendSystem    Author: doncat99    | project source | file source
def prepare_train_test_data(self, data_feature, LabelColumnName):
        firstloop = 1
        for ticker, data in data_feature.items():
            X, y = preprocessing_data(self.paras, data[0], LabelColumnName, one_hot_label_proc=False)
            X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X, y, test_size=0.2)

            if firstloop == 1:
                firstloop = 0
                X_train = X_train_temp
                X_test = X_test_temp
                y_train = y_train_temp
                y_test = y_test_temp
            else:
                X_train = np.append(X_train, X_train_temp, 0)
                X_test = np.append(X_test, X_test_temp, 0)
                y_train = np.append(y_train, y_train_temp, 0)
                y_test = np.append(y_test, y_test_temp, 0)

        return X_train, y_train, X_test, y_test
Project: StockRecommendSystem    Author: doncat99    | project source | file source
def prepare_train_test_data(self, data_feature, LabelColumnName):

        firstloop = 1
        for ticker, data in data_feature.items():
            X, y = preprocessing_data(self.paras, data[0], LabelColumnName, one_hot_label_proc=False, array_format=False)
            X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X, y, test_size=0.2)
            # print('Train shape X:', X_train_temp.shape, ',y:', y_train_temp.shape)
            # print('Test shape X:', X_test_temp.shape, ',y:', y_test_temp.shape)
            if firstloop == 1:
                firstloop = 0
                X_train = X_train_temp
                X_test = X_test_temp
                y_train = y_train_temp
                y_test = y_test_temp
            else:
                # DataFrame.append returns a new frame, so reassign the result
                X_train = X_train.append(X_train_temp, ignore_index=True)
                X_test = X_test.append(X_test_temp, ignore_index=True)
                y_train = np.append(y_train, y_train_temp, 0)
                y_test = np.append(y_test, y_test_temp, 0)

        # print('Train shape X:', X_train.shape, ',y:', y_train.shape)
        # print('Test shape X:', X_test.shape, ',y:', y_test.shape)
        return X_train, y_train, X_test, y_test
Project: xcessiv    Author: reiinakano    | project source | file source
def return_train_dataset(self):
        """Returns train data set

        Returns:
            X (numpy.ndarray): Features

            y (numpy.ndarray): Labels
        """
        X, y = self.return_main_dataset()

        if self.test_dataset['method'] == 'split_from_main':
            X, X_test, y, y_test = train_test_split(
                X,
                y,
                test_size=self.test_dataset['split_ratio'],
                random_state=self.test_dataset['split_seed'],
                stratify=y
            )

        return X, y
Project: pytorch_crowd_count    Author: BingzheWu    | project source | file source
def gen_train_data(dataset_paths):
    X_fs = []
    Y_fs = []

    for path in dataset_paths:
        images, gts, densities = load_images_and_gts(path)
        X_fs += images
        Y_fs += densities
    from sklearn.model_selection import train_test_split
    X_fs_train, X_fs_test, Y_fs_train, Y_fs_test = train_test_split(X_fs, Y_fs, test_size = 0.2)
    X_train, Y_train = X_fs_train, Y_fs_train
    X_test, Y_test = X_fs_test, Y_fs_test
    print(len(X_train))
    X_train, Y_train = multiscale_pyramidal(X_train, Y_train)
    #X_train, Y_train = adapt_images_and_densities(X_train, Y_train, slice_w, slice_h)
    print(len(X_train))
    X_train, Y_train = generate_slices(X_train, Y_train, slice_w = patch_w, slice_h = patch_h, offset = 8)
    print(len(X_train))
    #X_train, Y_train = crop_slices(X_train, Y_train)
    X_train, Y_train = flip_slices(X_train, Y_train)
    print(len(X_train))
    X_train, Y_train = samples_distribution(X_train,Y_train)
    print(len(X_train))
    X_train,Y_train = shuffle_slices(X_train, Y_train)
    return X_train, Y_train
Project: GestureRecognition    Author: gkchai    | project source | file source
def main(unused_argv):

    # Get the data.
    data_train = np.loadtxt(os.path.join(FLAGS.input_directory,'train'), delimiter=',')
    data_test = np.loadtxt(os.path.join(FLAGS.input_directory, 'test'), delimiter=',')

    X_train, X_val, y_train, y_val = train_test_split(data_train[:,1:], data_train[:,0].astype(np.int32),
                                                          test_size=FLAGS.validation_ratio,
                                                          random_state=100)
    X_test = data_test[:, 1:]
    y_test = data_test[:, 0].astype(np.int32)

    # Convert to Examples and write the result to TFRecords.
    convert_to((X_train, y_train), PREFIX + '_train')
    convert_to((X_val, y_val), PREFIX + '_validation')
    convert_to((X_test, y_test), PREFIX + '_test')
Project: MLAlgorithms    Author: rushter    | project source | file source
def classification():
    # Generate a random binary classification problem.
    X, y = make_classification(n_samples=350, n_features=15, n_informative=10,
                               random_state=1111, n_classes=2,
                               class_sep=1., n_redundant=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,
                                                        random_state=1111)

    model = GradientBoostingClassifier(n_estimators=50, max_depth=4,
                                       max_features=8, learning_rate=0.1)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(predictions)
    print(predictions.min())
    print(predictions.max())
    print('classification, roc auc score: %s'
          % roc_auc_score(y_test, predictions))