Python sklearn.ensemble module: RandomForestRegressor() code examples

We extracted the following 50 code examples from open-source Python projects to illustrate how to use sklearn.ensemble.RandomForestRegressor().

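Before the project extracts, here is a minimal, self-contained usage sketch. The dataset and every parameter value below are illustrative choices, not recommendations taken from any of the projects:

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Synthetic regression data; any (X, y) pair works the same way
X, y = make_regression(n_samples=500, n_features=10, noise=0.5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

reg = RandomForestRegressor(n_estimators=100, random_state=0)
reg.fit(X_train, y_train)
print(reg.score(X_test, y_test))    # R^2 on the held-out split
print(reg.feature_importances_)     # impurity-based feature importances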
Project: DSI-personal-reference-kit    Author: teb311
def cross_validate_best_known():
    '''
        import and clean the tractor data, then run cross validation on each of the three models we are
        training here: a RandomForest, a GradientBoost, and an AdaBoost backed by a DecisionTree. Print
        the scores.

        The parameters we're using here are the "best" that we've found so far using a grid search.
    '''
    tractor_data = pd.read_csv('data/train.csv')
    tractor_data = cln.clean_all(tractor_data)
    X = tractor_data
    y = tractor_data.pop('SalePrice')

    rf = RandomForestRegressor(max_features=2, min_samples_split=4, n_estimators=50, min_samples_leaf=2)
    gb = GradientBoostingRegressor(loss='quantile', learning_rate=0.0001, n_estimators=50, max_features='log2', min_samples_split=2, max_depth=1)
    ada_tree_backing = DecisionTreeRegressor(max_features='sqrt', splitter='random', min_samples_split=4, max_depth=3)
    ab = AdaBoostRegressor(ada_tree_backing, learning_rate=0.1, loss='square', n_estimators=1000)

    validate.cross_v_scores([rf, gb, ab], X, y)
    # RandomForestRegressor -- RMLSE: -0.596797712098, R2: 0.0272065373946
    # GradientBoostingRegressor -- RMLSE: -0.996134592541, R2: -2.37202164829
    # AdaBoostRegressor -- RMLSE: -0.706385708459, R2: -0.103966980393
Project: stacked_generalization    Author: fukatani
def test_stacked_regressor(self):
        bclf = LinearRegression()
        clfs = [RandomForestRegressor(n_estimators=50, random_state=1),
                GradientBoostingRegressor(n_estimators=25, random_state=1),
                Ridge(random_state=1)]

        # Friedman1
        X, y = datasets.make_friedman1(n_samples=1200,
                                       random_state=1,
                                       noise=1.0)
        X_train, y_train = X[:200], y[:200]
        X_test, y_test = X[200:], y[200:]

        sr = StackedRegressor(bclf,
                              clfs,
                              n_folds=3,
                              verbose=0,
                              oob_score_flag=True)
        sr.fit(X_train, y_train)
        mse = mean_squared_error(y_test, sr.predict(X_test))
        assert_less(mse, 6.0)
Project: stacked_generalization    Author: fukatani
def test_fwls_regressor(self):
        feature_func = lambda x: np.ones(x.shape)
        bclf = LinearRegression()
        clfs = [RandomForestRegressor(n_estimators=50, random_state=1),
                GradientBoostingRegressor(n_estimators=25, random_state=1),
                Ridge(random_state=1)]

        # Friedman1
        X, y = datasets.make_friedman1(n_samples=1200,
                                       random_state=1,
                                       noise=1.0)
        X_train, y_train = X[:200], y[:200]
        X_test, y_test = X[200:], y[200:]

        sr = FWLSRegressor(bclf,
                           clfs,
                           feature_func,
                           n_folds=3,
                           verbose=0,
                           oob_score_flag=True)
        sr.fit(X_train, y_train)
        mse = mean_squared_error(y_test, sr.predict(X_test))
        assert_less(mse, 6.0)
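StackedRegressor and FWLSRegressor above come from the third-party stacked_generalization package. For readers on plain scikit-learn (0.22 or later), the built-in StackingRegressor expresses a similar blend; a hedged sketch with the same base learners (cv=3 mirrors n_folds=3, though the two libraries' internals differ):

from sklearn.datasets import make_friedman1
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,
                              StackingRegressor)
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error

X, y = make_friedman1(n_samples=1200, random_state=1, noise=1.0)
X_train, y_train = X[:200], y[:200]
X_test, y_test = X[200:], y[200:]

# Same base learners as above, blended by a LinearRegression meta-model
stack = StackingRegressor(
    estimators=[('rf', RandomForestRegressor(n_estimators=50, random_state=1)),
                ('gb', GradientBoostingRegressor(n_estimators=25, random_state=1)),
                ('ridge', Ridge(random_state=1))],
    final_estimator=LinearRegression(),
    cv=3)
stack.fit(X_train, y_train)
print(mean_squared_error(y_test, stack.predict(X_test)))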
Project: auto_ml    Author: ClimbsRocks
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15), threshold='20*mean'),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15), threshold='0.7*mean'),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'KeepAll': 'KeepAll'
        }
    }

    return model_map[type_of_estimator][model_name]
Project: HousePricePredictionKaggle    Author: Nuwantha
def model_random_forecast(Xtrain,Xtest,ytrain):

    X_train = Xtrain
    y_train = ytrain
    rfr = RandomForestRegressor(n_jobs=1, random_state=0)
    param_grid = {'n_estimators': [1000]}
    # 'n_estimators': [1000], 'max_features': [10,15,20,25], 'max_depth':[20,20,25,25,]}
    model = GridSearchCV(estimator=rfr, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
    model.fit(X_train, y_train)
    print('Random forest regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return y_pred, -model.best_score_
Project: python_utils    Author: Jayhello
def rfr_feature_select():
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.cross_validation import cross_val_score, ShuffleSplit

    boston = load_boston()
    X = boston["data"]
    Y = boston["target"]
    names = boston["feature_names"]

    rf = RandomForestRegressor(n_estimators=20, max_depth=4)
    scores = []
    for i in range(X.shape[1]):
        score = cross_val_score(rf, X[:, i:i + 1],
                                Y, scoring="r2", cv=ShuffleSplit(len(X), 3, .3))
        scores.append((round(np.mean(score), 3), names[i]))

    print(sorted(scores, reverse=True))
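The snippet above targets the pre-0.18 sklearn.cross_validation module, which has since been removed, and load_boston, which was dropped in scikit-learn 1.2. A sketch of the same per-feature ranking against the current API, with load_diabetes standing in for the Boston data:

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ShuffleSplit, cross_val_score

data = load_diabetes()
X, Y, names = data.data, data.target, data.feature_names

rf = RandomForestRegressor(n_estimators=20, max_depth=4)
cv = ShuffleSplit(n_splits=3, test_size=0.3)  # replaces ShuffleSplit(len(X), 3, .3)
scores = []
for i in range(X.shape[1]):
    # Score each feature on its own; a higher r2 means more individually predictive
    score = cross_val_score(rf, X[:, i:i + 1], Y, scoring="r2", cv=cv)
    scores.append((round(np.mean(score), 3), names[i]))

print(sorted(scores, reverse=True))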
Project: auto_ml    Author: doordash
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15), threshold='20*mean'),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLogisticRegression(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15), threshold='0.7*mean'),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLasso(),
            'KeepAll': 'KeepAll'
        }
    }

    return model_map[type_of_estimator][model_name]
Project: XTREE    Author: ai-se
def rforest2(train, test, tunings=None, smoteit=True, duplicate=True):
  "RF "
  # Apply a random forest regressor to predict the number of bugs.
  if smoteit:
    train = SMOTE(train, atleast=50, atmost=101, resample=duplicate)
  if not tunings:
    clf = RandomForestRegressor(n_estimators=100, random_state=1)
  else:
    clf = RandomForestRegressor(n_estimators=int(tunings[0]),
                                max_features=tunings[1] / 100,
                                min_samples_leaf=int(tunings[2]),
                                min_samples_split=int(tunings[3])
                                )
  train_DF = formatData(train)
  test_DF = formatData(test)
  features = train_DF.columns[:-2]
  klass = train_DF[train_DF.columns[-2]]
  # set_trace()
  clf.fit(train_DF[features], klass)
  preds = clf.predict(test_DF[test_DF.columns[:-2]])
  return preds
Project: stacked_generalization    Author: fukatani
def test_regressor(self):
        X, y = datasets.make_friedman1(n_samples=1200,
                                       random_state=1,
                                       noise=1.0)
        X_train, y_train = X[:200], y[:200]
        index = [i for i in range(200)]

        rf = RandomForestRegressor()
        jrf = JoblibedRegressor(rf, "rfr", cache_dir='')
        jrf.fit(X_train, y_train, index)
        prediction = jrf.predict(X_train, index)
        mse = mean_squared_error(y_train, prediction)
        assert_less(mse, 6.0)

        rf = RandomForestRegressor(n_estimators=20)
        jrf = JoblibedRegressor(rf, "rfr", cache_dir='')
        jrf.fit(X_train, y_train, index)
        prediction2 = jrf.predict(X_train, index)
        assert_allclose(prediction, prediction2)
Project: time_series_modeling    Author: rheineke
def unscaled_pipelines():
    # Random forest parameters
    random_forest_kwargs = {
        'n_estimators': 10,
        'criterion': 'mse',
        'random_state': _RANDOM_STATE,
        'n_jobs': cpu_count(),
        'verbose': True,
    }
    # Gradient boosting parameters
    gradient_boost_kwargs = {
        'random_state': _RANDOM_STATE,
        'verbose': 1,
    }
    models = [
        DecisionTreeRegressor(max_depth=3, random_state=_RANDOM_STATE),
        # RandomForestRegressor(**random_forest_kwargs),
        # GradientBoostingRegressor(**gradient_boost_kwargs),
    ]
    pipelines = []
    for m in models:
        # Steps
        pipelines.append(make_pipeline(m))
    return pipelines
Project: AirTicketPredicting    Author: junlulocky
def parameterChoosing(self):
        #Set the parameters by cross-validation
        tuned_parameters = [{'max_depth': range(20,60),
                             'n_estimators': range(10,40),
                             'max_features': ['sqrt', 'log2', None]
                             }
                            ]

        clf = GridSearchCV(RandomForestRegressor(n_estimators=30), tuned_parameters, cv=5, scoring='mean_squared_error')
        clf.fit(self.X_train, self.y_train.ravel())

        print "Best parameters set found on development set:\n"
        print clf.best_params_

        print "Grid scores on development set:\n"
        for params, mean_score, scores in clf.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

        print "MSE for test data set:\n"
        y_true, y_pred = self.y_test, clf.predict(self.X_test)
        print mean_squared_error(y_true, y_pred)
Project: Brain_Tumor_Segmentation    Author: KarthikRevanuru
def train_xgboost():
    df = pd.read_csv('survival_data.csv', index_col=0, encoding = 'UTF-7')
    p = np.array([np.mean(np.load('training/%s_flair.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    q = np.array([np.mean(np.load('training/%s_t1.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    r = np.array([np.mean(np.load('training/%s_t1ce.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])
    s = np.array([np.mean(np.load('training/%s_t2.nii.gz.npy' % str(id)), axis=0) for id in folder_names_train])

    y=np.array([])
    t=0
    z=np.array([])
    for ind in range(len(folder_names_train)):
        try:
            temp = df.get_value(str(folder_names_train[ind]),'Survival')
            y=np.append(y,temp)
            temp = df.get_value(str(folder_names_train[ind]),'Age')
            z=np.append(z,np.array([temp]))
        except Exception as e:
            t+=1 
            print (t,str(e),"Label Not found, deleting entry")
            y=np.append(y,0)

    z=np.array([[v] for v in z])

    t=np.concatenate((p,q),axis=1)
    u=np.concatenate((r,s),axis=1)
    x=np.concatenate((t,u),axis=1) 
    #print(x.shape)
    #print (x)
    #print (x.shape,z.shape)
    x=np.concatenate((x,z),axis=1)
    #print (x)
    #clf=linear_model.LogisticRegression(C=1e5)
    #clf = RandomForestRegressor()
    clf = xgb.XGBRegressor()
    clf.fit(x,y)
    return clf
Project: pyGPGO    Author: hawk31
def fit(self, X, y):
        """
        Fit a Random Forest model to data `X` and targets `y`.

        Parameters
        ----------
        X : array-like
            Input values.
        y: array-like
            Target values.
        """
        self.X = X
        self.y = y
        self.n = self.X.shape[0]
        self.model = RandomForestRegressor(**self.params)
        self.model.fit(X, y)
Project: coremltools    Author: apple
def test_random_forest_regressor(self):
        for dtype in self.number_data_type.keys():
            scikit_model = RandomForestRegressor(random_state=1)
            data = self.scikit_data['data'].astype(dtype)
            target = self.scikit_data['target'].astype(dtype)
            scikit_model, spec = self._sklearn_setup(scikit_model, dtype, data, target)
            test_data = data[0].reshape(1, -1)
            self._check_tree_model(spec, 'multiArrayType', 'doubleType', 1)
            coreml_model = create_model(spec)
            try:
                self.assertEqual(scikit_model.predict(test_data)[0].dtype,
                                 type(coreml_model.predict({'data': test_data})['target']))
                self.assertAlmostEqual(scikit_model.predict(test_data)[0],
                                       coreml_model.predict({'data': test_data})['target'],
                                       msg="{} != {} for Dtype: {}".format(
                                           scikit_model.predict(test_data)[0],
                                           coreml_model.predict({'data': test_data})['target'],
                                           dtype
                                       )
                                       )
            except RuntimeError:
                print("{} not supported. ".format(dtype))
Project: coremltools    Author: apple
def _train_convert_evaluate(self, **scikit_params):
        """
        Train a scikit-learn model, convert it and then evaluate it with CoreML
        """
        scikit_model = RandomForestRegressor(random_state = 1, **scikit_params)
        scikit_model.fit(self.X, self.target)

        # Convert the model
        spec = skl_converter.convert(scikit_model, self.feature_names, self.output_name)

        # Get predictions
        df = pd.DataFrame(self.X, columns=self.feature_names)
        df['prediction'] = scikit_model.predict(self.X)

        # Evaluate it
        metrics = evaluate_regressor(spec, df, verbose = False)
        return metrics
Project: mlens    Author: flennerhag
def build_ensemble(**kwargs):
    """Generate ensemble."""

    ens = SuperLearner(**kwargs)
    prep = {'Standard Scaling': [StandardScaler()],
            'Min Max Scaling': [MinMaxScaler()],
            'No Preprocessing': []}

    est = {'Standard Scaling':
               [ElasticNet(), Lasso(), KNeighborsRegressor()],
           'Min Max Scaling':
               [SVR()],
           'No Preprocessing':
               [RandomForestRegressor(random_state=SEED),
                GradientBoostingRegressor()]}

    ens.add(est, prep)

    ens.add(GradientBoostingRegressor(), meta=True)

    return ens
Project: Semantic-Texual-Similarity-Toolkits    Author: rgtjf
def train_model(self, train_file_path, model_path):
        print("==> Load the data ...")
        X_train, Y_train = self.load_file(train_file_path)
        print(train_file_path, shape(X_train))

        print("==> Train the model ...")
        min_max_scaler = preprocessing.MaxAbsScaler()
        X_train_minmax = min_max_scaler.fit_transform(X_train)
        clf = RandomForestRegressor(n_estimators=self.n_estimators)
        clf.fit(X_train_minmax.toarray(), Y_train)

        print("==> Save the model ...")
        pickle.dump(clf, open(model_path, 'wb'))

        scaler_path = model_path.replace('.pkl', '.scaler.pkl')
        pickle.dump(min_max_scaler, open(scaler_path, 'wb'))
        return clf
Project: libskeletal    Author: bobbybee
def trainModel(featureCount, imageCount, save):
    clf = RandomForestRegressor(n_estimators=1, n_jobs=-1)

    features = generateFeatures(featureCount)

    for image in range(0, imageCount):
        print "Image " + str(image)
        train(clf, features, image)

    clf = clf.fit(X, Y)
    model = (clf, features)

    if save:
        joblib.dump(model, "model.pkl")

    return model
Project: DataAnalysis    Author: IMYin
def set_missing_ages(df):
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    known_age = age_df[age_df.Age.notnull()].as_matrix()
    unknown_age = age_df[age_df.Age.isnull()].as_matrix()

    y = known_age[:, 0]
    X = known_age[:, 1:]

    # fit by RandomForestRegressor
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # predict the unknown age
    predictedAges = rfr.predict(unknown_age[:, 1:])
    # backfill the value of unknown age
    df.loc[(df.Age.isnull()), 'Age'] = predictedAges

    return df, rfr
Project: DataAnalysis    Author: IMYin
def set_missing_ages(df):
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    known_age = age_df[age_df.Age.notnull()].as_matrix()
    unknown_age = age_df[age_df.Age.isnull()].as_matrix()

    y = known_age[:, 0]
    X = known_age[:, 1:]

    # fit by RandomForestRegressor
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # predict the unknown age
    predictedAges = rfr.predict(unknown_age[:, 1:])
    # backfill the value of unknown age
    df.loc[(df.Age.isnull()), 'Age'] = predictedAges

    return df, rfr


# processing the column : Cabin
Project: scikit-optimize    Author: scikit-optimize
def test_gbrt_base_estimator():
    rng = np.random.RandomState(1)
    N = 10000
    X = np.ones((N, 1))
    y = rng.normal(size=N)

    base = RandomForestRegressor()
    rgr = GradientBoostingQuantileRegressor(base_estimator=base)
    assert_raise_message(ValueError, 'type GradientBoostingRegressor',
                         rgr.fit, X, y)

    base = GradientBoostingRegressor()
    rgr = GradientBoostingQuantileRegressor(base_estimator=base)
    assert_raise_message(ValueError, 'quantile loss', rgr.fit, X, y)

    base = GradientBoostingRegressor(loss='quantile', n_estimators=20)
    rgr = GradientBoostingQuantileRegressor(base_estimator=base)
    rgr.fit(X, y)

    estimates = rgr.predict(X, return_quantiles=True)
    assert_almost_equal(stats.norm.ppf(rgr.quantiles),
                        np.mean(estimates, axis=0),
                        decimal=2)
Project: mlbootcamp_5    Author: ivan-filonov
def rf1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 300
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    for n, (itrain, ival) in enumerate(skf.split(train2, y)):
        print('step %d of %d'%(n+1, skf.n_splits), now())
        clf = ensemble.RandomForestRegressor(n_estimators=1000,
                                             max_depth=3,
                                             random_state=13)
        clf.fit(train2[itrain], y[itrain])

        p = clf.predict(train2[ival])
        v.loc[ival, cname] += p
        score = metrics.log_loss(y[ival], p)
        z[cname]  += np.log1p(clf.predict(test2))
        print(cname, 'step %d: score'%(n+1), score, now())
        scores.append(score)

    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits
Project: ML-note    Author: JasonK93
def test_RandomForestRegressor_num(*data):
    '''
    test the performance with different n_estimators
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train,X_test,y_train,y_test=data
    nums=np.arange(1,100,step=2)
    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    testing_scores=[]
    training_scores=[]
    for num in nums:
        regr=ensemble.RandomForestRegressor(n_estimators=num)
        regr.fit(X_train,y_train)
        training_scores.append(regr.score(X_train,y_train))
        testing_scores.append(regr.score(X_test,y_test))
    ax.plot(nums,training_scores,label="Training Score")
    ax.plot(nums,testing_scores,label="Testing Score")
    ax.set_xlabel("estimator num")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(-1,1)
    plt.suptitle("RandomForestRegressor")
    plt.show()
Project: ML-note    Author: JasonK93
def test_RandomForestRegressor_max_depth(*data):
    '''
    test the performance with different max_depth
    :param data:  train_data, test_data, train_value, test_value
    :return:  None
    '''
    X_train,X_test,y_train,y_test=data
    maxdepths=range(1,20)
    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    testing_scores=[]
    training_scores=[]
    for max_depth in maxdepths:
        regr=ensemble.RandomForestRegressor(max_depth=max_depth)
        regr.fit(X_train,y_train)
        training_scores.append(regr.score(X_train,y_train))
        testing_scores.append(regr.score(X_test,y_test))
    ax.plot(maxdepths,training_scores,label="Training Score")
    ax.plot(maxdepths,testing_scores,label="Testing Score")
    ax.set_xlabel("max_depth")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0,1.05)
    plt.suptitle("RandomForestRegressor")
    plt.show()
Project: ML-note    Author: JasonK93
def test_RandomForestRegressor_max_features(*data):
    '''
    test the performance with different max_features
    :param data:  train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train,X_test,y_train,y_test=data
    max_features=np.linspace(0.01,1.0)
    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    testing_scores=[]
    training_scores=[]
    for max_feature in max_features:
        regr=ensemble.RandomForestRegressor(max_features=max_feature)
        regr.fit(X_train,y_train)
        training_scores.append(regr.score(X_train,y_train))
        testing_scores.append(regr.score(X_test,y_test))
    ax.plot(max_features,training_scores,label="Training Score")
    ax.plot(max_features,testing_scores,label="Testing Score")
    ax.set_xlabel("max_feature")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0,1.05)
    plt.suptitle("RandomForestRegressor")
    plt.show()
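All three sweeps above unpack their argument as X_train, X_test, y_train, y_test, which matches the return order of train_test_split. A hedged driver sketch (load_diabetes is an illustrative choice, not the dataset the ML-note project used):

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

diabetes = load_diabetes()
# Returns [X_train, X_test, y_train, y_test], the order the sweeps expect
data = train_test_split(diabetes.data, diabetes.target,
                        test_size=0.25, random_state=0)
test_RandomForestRegressor_num(*data)
test_RandomForestRegressor_max_depth(*data)
test_RandomForestRegressor_max_features(*data)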
Project: Black-Swan    Author: 12190143
def rf(train_sample, validation_sample, features, seed):
    log_base = np.e
    rf_est = RandomForestRegressor(n_estimators=500,
                                   criterion='mse',
                                   max_features=4,
                                   max_depth=None,
                                   bootstrap=True,
                                   min_samples_split=4,
                                   min_samples_leaf=1,
                                   min_weight_fraction_leaf=0,
                                   max_leaf_nodes=None,
                                   random_state=seed
                                   ).fit(
        train_sample[features], np.log1p(train_sample['volume']) / np.log(log_base))
    rf_prob = np.power(log_base, rf_est.predict(validation_sample[features])) - 1
    print_mape(validation_sample['volume'], rf_prob, 'RF')
    return rf_prob
Project: Parallel-SGD    Author: angadgill
def test_check_consistent_length():
    check_consistent_length([1], [2], [3], [4], [5])
    check_consistent_length([[1, 2], [[1, 2]]], [1, 2], ['a', 'b'])
    check_consistent_length([1], (2,), np.array([3]), sp.csr_matrix((1, 2)))
    assert_raises_regexp(ValueError, 'inconsistent numbers of samples',
                         check_consistent_length, [1, 2], [1])
    assert_raises_regexp(TypeError, 'got <\w+ \'int\'>',
                         check_consistent_length, [1, 2], 1)
    assert_raises_regexp(TypeError, 'got <\w+ \'object\'>',
                         check_consistent_length, [1, 2], object())

    assert_raises(TypeError, check_consistent_length, [1, 2], np.array(1))
    # Despite ensembles having __len__ they must raise TypeError
    assert_raises_regexp(TypeError, 'estimator', check_consistent_length,
                         [1, 2], RandomForestRegressor())
    # XXX: We should have a test with a string, but what is correct behaviour?
Project: hyperband    Author: zygmuntz
def try_params( n_iterations, params ):

    n_estimators = int( round( n_iterations * trees_per_iteration ))
    print "n_estimators:", n_estimators
    pprint( params )

    clf = RF( n_estimators = n_estimators, verbose = 0, n_jobs = -1, **params )

    return train_and_eval_sklearn_regressor( clf, data )
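try_params leans on module-level names defined elsewhere in the hyperband repo: RF, trees_per_iteration, data, and train_and_eval_sklearn_regressor. A hedged sketch of plausible stand-ins so the snippet can be exercised on its own; these definitions are assumptions for illustration, not the project's actual code:

from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.metrics import mean_squared_error

trees_per_iteration = 5  # assumed scaling constant, not the project's value

def train_and_eval_sklearn_regressor(clf, data):
    # data assumed to be a dict holding train/test arrays
    clf.fit(data['x_train'], data['y_train'])
    mse = mean_squared_error(data['y_test'], clf.predict(data['x_test']))
    return {'loss': mse}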
Project: stacker    Author: bamine
def __init__(self, task: Task, scorer: Scorer, opt_logger: OptimizationLogger=VoidLogger(None)):
        if task.task == "classification":
            space = RandomForestOptimizer.Params.classification_space
            model = ensemble.RandomForestClassifier()
        else:
            space = RandomForestOptimizer.Params.regression_space
            model = ensemble.RandomForestRegressor()
        super().__init__(model, task, space, scorer, opt_logger)
Project: Supply-demand-forecasting    Author: LevinJ
def setClf(self):
#         min_samples_split = 3
#         self.clf = RandomForestRegressor(n_estimators = 100, max_features = 0.3, min_samples_split =1, verbose=100, n_jobs=-1)
        self.clf = RandomForestRegressor(n_estimators = 100, max_features = 0.8)
        return
Project: POWER    Author: pennelise
def machine_learning_RF(x_train,y_train,x_test,y_test):
    import numpy as np
    from functools import reduce  # reduce is no longer a builtin in Python 3
    mask = []

    #Gets rid of NaNs
    for i in range(np.shape(x_train)[1]):
        mask.append(~np.isnan(x_train[:,i]))
    mask.append(~np.isnan(np.transpose(y_train)))  
    mask = np.transpose(reduce(np.logical_and, mask))
    mask = mask.reshape(len(mask),)

    inputs = x_train[mask,:]
    targets = y_train[mask]

    mask2 = []
    for i in range(np.shape(x_test)[1]):
        mask2.append(~np.isnan(x_test[:,i]))  
    mask2 = np.transpose(reduce(np.logical_and, mask2))
    inputs_test = x_test[mask2,:]
    #End getting rid of NaNs

    #Sets up forest
    #n-estimators is how many "trees" (samples) you will take
    from sklearn.ensemble import RandomForestRegressor
    rfc_new = RandomForestRegressor(n_estimators=100,random_state=42,max_features=2)
    #Training
    rfc_new = rfc_new.fit(inputs,targets)
    #Predicting
    predicted_y = rfc_new.predict(inputs_test)
    print(rfc_new.feature_importances_)
    return y_test[mask2], predicted_y
Project: coremltools    Author: gsabran
def convert(model, feature_names, target):
    """Convert a boosted tree model to protobuf format.

    Parameters
    ----------
    decision_tree : RandomForestRegressor
        A trained scikit-learn tree model.

    feature_names: [str]
        Name of the input columns.

    target: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not(_HAS_SKLEARN):
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model, _ensemble.RandomForestRegressor)
    def is_rf_model(m):
        if len(m.estimators_) == 0:
            return False
        if hasattr(m, 'estimators_') and m.estimators_ is not None:
            for t in m.estimators_:
                if not hasattr(t, 'tree_') or t.tree_ is None:
                    return False
            return True
        else:
            return False
    _sklearn_util.check_fitted(model, is_rf_model)
    return _MLModel(_convert_tree_ensemble(model, feature_names, target))
Project: time_series_modeling    Author: rheineke
def persist_pipelines(pipelines):
    Path('models').mkdir(exist_ok=True)
    fp_fmt = 'models/{}-{:%y-%m-%d}.pkl'
    now = dt.datetime.now()
    for pipe in pipelines:
        print(utils.pipeline_name(pipe))
        fp_name = fp_fmt.format(utils.pipeline_name(pipe), now)
        joblib.dump(pipe, fp_name)
        # Pickle fails to work on RandomForestRegressor
        # with open(fp_name, 'wb') as fp:
        #     pickle.dump(pipe, fp)
Project: strategy    Author: kanghua309
def model_cross_valid(X,Y):
    seed = 7
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    def build_model(model_name):
        model = model_name()
        return model
    scoring = 'neg_mean_squared_error'
    # + random forest, boost, lstm, gbdt

    for model_name in [LinearRegression,ElasticNet]:
    #for model_name in [LinearRegression,Ridge,Lasso,ElasticNet,KNeighborsRegressor,DecisionTreeRegressor,SVR,RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor]:
        model = build_model(model_name)
        results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        print(model_name,results.mean())
Project: AirTicketPredicting    Author: junlulocky
def __init__(self, isTrain):
        super(RegressionRandomForest, self).__init__(isTrain)
        # data preprocessing
        #self.dataPreprocessing()

        # Create random forest regression object
        self.model = RandomForestRegressor(max_features='sqrt', n_estimators=32, max_depth=39)
Project: nirdizati-runtime    Author: nirdizati
def __init__(self, nr_events, case_id_col, encoder_kwargs, cls_kwargs, cls_method="rf"):

        self.case_id_col = case_id_col
        self.nr_events = nr_events

        self.encoder = SequenceEncoder(nr_events=nr_events, case_id_col=case_id_col, **encoder_kwargs)

        if cls_method == "gbm":
            self.cls = GradientBoostingRegressor(**cls_kwargs)
        elif cls_method == "rf":
            self.cls = RandomForestRegressor(**cls_kwargs)
        else:
            print("Classifier method not known")
Project: f1_2017    Author: aflaisler
def fastLapModel(xList, labels, names, multiple=0, full_set=0):
    X = numpy.array(xList)
    y = numpy.array(labels)
    featureNames = []
    featureNames = numpy.array(names)
    # take fixed holdout set 30% of data rows
    xTrain, xTest, yTrain, yTest = train_test_split(
        X, y, test_size=0.30, random_state=531)
    # for final model (no CV)
    if full_set:
        xTrain = X
        yTrain = y
    check_set(xTrain, xTest, yTrain, yTest)
    print "Fitting the model to the data set..."
    # train random forest at a range of ensemble sizes in order to see how the
    # mse changes
    mseOos = []
    m = 10 ** multiple
    nTreeList = range(500 * m, 1000 * m, 100 * m)
    # iTrees = 10000
    for iTrees in nTreeList:
        depth = None
        maxFeat = int(np.sqrt(np.shape(xTrain)[1])) + 1  # try tweaking
        RFmd = ensemble.RandomForestRegressor(n_estimators=iTrees, max_depth=depth, max_features=maxFeat,
                                              oob_score=False, random_state=531, n_jobs=-1)
        # RFmd.n_features = 5
        RFmd.fit(xTrain, yTrain)

        # Accumulate mse on test set
        prediction = RFmd.predict(xTest)
        mseOos.append(mean_squared_error(yTest, prediction))
    # plot training and test errors vs number of trees in ensemble
    plot.plot(nTreeList, mseOos)
    plot.xlabel('Number of Trees in Ensemble')
    plot.ylabel('Mean Squared Error')
    #plot.ylim([0.0, 1.1*max(mseOob)])
    plot.show()
    print("MSE")
    print(mseOos[-1])
    return xTrain, xTest, yTrain, yTest, RFmd
Project: CryptoBot    Author: AdeelMufti
def fit_forest(X, y, window=100000, estimators=100,
               samples_leaf=250, validate=True):
    '''
    Fits Random Forest
    '''
    model = RandomForestRegressor(n_estimators=estimators,
                                  min_samples_leaf=samples_leaf,
                                  random_state=42,
                                  n_jobs=-1)
    if validate:
        return cross_validate(X, y, model, window)
    return model.fit(X, y)
Project: pyGPGO    Author: hawk31
def __init__(self, **params):
        """
        Wrapper around sklearn's Random Forest implementation for pyGPGO.
        Random Forests can also be used for surrogate models in Bayesian Optimization.
        An estimate of 'posterior' variance can be obtained by using the `impurity`
        criterion value in each subtree.

        Parameters
        ----------
        params: tuple, optional
            Any parameters to pass to `RandomForestRegressor`. Defaults to sklearn's.

        """
        self.params = params
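The 'posterior' variance mentioned in this docstring can be read straight off a fitted forest by spreading predictions across estimators_, much as the fluentopt predict() snippet further down this page does. A minimal sketch (data and parameters illustrative):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=300, n_features=5, noise=1.0, random_state=0)
model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

# One row per tree, one column per sample
per_tree = np.stack([tree.predict(X) for tree in model.estimators_])
mean, std = per_tree.mean(axis=0), per_tree.std(axis=0)  # surrogate mean / uncertainty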
Project: tpai_comp    Author: luuuyi
def generate_RF_model(file_name):
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby|hometown|residence')
    train_np = selected_train_df.as_matrix()
    y = train_np[:,0]
    X = train_np[:,1:]
    print('Train Random Forest Regression Model...')
    start_time  = datetime.datetime.now()
    rf = RandomForestRegressor(n_estimators=25, n_jobs=-1)#, class_weight='balanced')
    rf.fit(X,y)
    end_time = datetime.datetime.now()
    print('Training Done..., Time Cost: ')
    print((end_time-start_time).seconds)

    print('Save Model...')
    joblib.dump(rf, 'RF.model')
    return rf
Project: SMAC3    Author: automl
def rf_from_cfg(cfg, seed):
    """
        Creates a random forest regressor from sklearn and fits the given data on it.
        This is the function-call we try to optimize. Chosen values are stored in
        the configuration (cfg).

        Parameters:
        -----------
        cfg: Configuration
            configuration chosen by smac
        seed: int or RandomState
            used to initialize the rf's random generator

        Returns:
        -----------
        np.mean(rmses): float
            mean of root mean square errors of random-forest test predictions
            per cv-fold
    """
    rfr = RandomForestRegressor(
        n_estimators=cfg["num_trees"],
        criterion=cfg["criterion"],
        min_samples_split=cfg["min_samples_to_split"],
        min_samples_leaf=cfg["min_samples_in_leaf"],
        min_weight_fraction_leaf=cfg["min_weight_frac_leaf"],
        max_features=cfg["max_features"],
        max_leaf_nodes=cfg["max_leaf_nodes"],
        bootstrap=cfg["do_bootstrapping"],
        random_state=seed)

    def rmse(y, y_pred):
        return np.sqrt(np.mean((y_pred - y)**2))
    # Creating root mean square error for sklearns crossvalidation
    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    score = cross_val_score(rfr, boston.data, boston.target, cv=11, scoring=rmse_scorer)
    return -1 * np.mean(score)  # Because cross_validation sign-flips the score
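rf_from_cfg expects a SMAC Configuration plus a module-level boston dataset. Outside SMAC it can be smoke-tested with a plain dict standing in for cfg, assuming the snippet's module-level imports (numpy, make_scorer, cross_val_score) are in place; the values below are illustrative, and load_diabetes substitutes for load_boston (removed in scikit-learn 1.2):

from sklearn.datasets import load_diabetes

boston = load_diabetes()  # stand-in for the module-level boston dataset

cfg = {  # plain dict mimicking a SMAC Configuration
    "num_trees": 50, "criterion": "squared_error",  # "mse" on older scikit-learn
    "min_samples_to_split": 2, "min_samples_in_leaf": 1,
    "min_weight_frac_leaf": 0.0, "max_features": 1.0,
    "max_leaf_nodes": None, "do_bootstrapping": True,
}
print(rf_from_cfg(cfg, seed=0))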
Project: guacml    Author: guacml
def train(self, x, y,
              n_estimators=10,
              max_depth=None,
              min_samples_leaf=1):
        n_estimators = self.to_int(n_estimators)
        max_depth = self.to_int(max_depth)
        min_samples_leaf = self.pos_int(min_samples_leaf)

        if self.problem_type == ProblemType.BINARY_CLAS:
            self.model = RandomForestClassifier(n_estimators,
                                                max_depth=max_depth,
                                                min_samples_leaf=min_samples_leaf)
        elif self.problem_type == ProblemType.REGRESSION:
            self.model = RandomForestRegressor(n_estimators,
                                               max_depth=max_depth,
                                               min_samples_leaf=min_samples_leaf)
        else:
            raise NotImplementedError('Problem type {0} not implemented'.format(self.problem_type))

        self.model.fit(x, y)
Project: sanergy-public    Author: dssg
def define_model(self):
        #if self.modeltype == "AR" :
        #    return statsmodels.tsa.ar_model.AR(max_order=self.parameters['max_order'])
        if self.modeltype == "RandomForest" :
            return ensemble.RandomForestRegressor(n_estimators=self.parameters['n_estimators'])
            #return ensemble.RandomForestClassifier(
            #    n_estimators=self.parameters['n_estimators'])
        elif self.modeltype == "LinearRegression" :
            return linear_model.LinearRegression()
        elif self.modeltype == "Lasso" :
            return linear_model.Lasso(
            alpha=self.parameters['alpha'])
        elif self.modeltype == "ElasticNet" :
            return linear_model.ElasticNet(
            alpha=self.parameters['alpha'],
            l1_ratio=self.parameters['l1_ratio'])
        elif self.modeltype == "SVR" :
            return SVR(
            C=self.parameters['C'],
            epsilon=self.parameters['epsilon'],
            kernel=self.parameters['kernel'])
        #elif self.modeltype == 'StaticModel':
        #   return StaticModel (
        #      parameters=self.parameters
        #     )
        #elif self.modeltype == 'AdvancedStaticModel':
        #   return AdvancedStaticModel (
        #       parameters=self.parameters
        #        )

        # elif self.modeltype == 'SGDRegressor' :
        #     print(self.parameters)
        #     return linear_model.SGDRegressor(
        #     loss=self.parameters['loss'],
        #     penalty=self.parameters['penalty'],
        #     l1_ratio=self.parameters['l1_ratio'])
        else:
            raise ConfigError("Unsupported model {0}".format(self.modeltype))
Project: fluentopt    Author: mehdidc
def predict(self, X, return_std=False):
        if return_std:
            trees = self.estimators_
            y = np.concatenate([tree.predict(X)[np.newaxis, :] for tree in trees], axis=0)
            mean = y.mean(axis=0)
            std = y.std(axis=0)
            return mean, std
        else:
            return super(RandomForestRegressor, self).predict(X)
Project: datasciences    Author: BenChehade
def greedy_elim(df):

    # do feature selection using RFE (recursive feature elimination)
    X = df[[x for x in df.columns if x!='SalePrice']]
    y = df['SalePrice']
    #model = RandomForestRegressor(n_estimators=50)
    model = GradientBoostingRegressor(n_estimators=50, learning_rate=0.05)
    # 150 features seems to be the best at the moment. Why this is the case is unclear.
    feat_selector = RFE(estimator=model, step=1, n_features_to_select=150)

    # find all relevant features
    feat_selector.fit_transform(X.as_matrix(), y.as_matrix())

    # check selected features
    features_bool = np.array(feat_selector.support_)
    features = np.array(X.columns)
    result = features[features_bool]
    #print(result)

    # check ranking of features
    features_rank = feat_selector.ranking_
    #print(features_rank)
    rank = features_rank[features_bool]
    #print(rank)

    return result
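The commented-out RandomForestRegressor drops into the same RFE slot, since RFE only needs an estimator exposing feature_importances_ (or coef_); a sketch under the same 150-feature assumption:

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

# RFE ranks features by the estimator's feature_importances_
model = RandomForestRegressor(n_estimators=50)
feat_selector = RFE(estimator=model, step=1, n_features_to_select=150)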
Project: strategy    Author: kanghua309
def model_fit_and_test(TrainX,TrainY,TestX,TestY):
    def build_model(model_name):
        model = model_name()
        return model
    #for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR,RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
    for model_name in [LinearRegression, ElasticNet]:
        model = build_model(model_name)
        model.fit(TrainX,TrainY)
        print(model_name)
        resid = model.predict(TestX) - TestY
        #print resid
        print("Residual sum of squares: %f"% np.mean(resid ** 2))
        #print model.predict(TestX)
        #print TestY
        # Explained variance score: 1 is perfect prediction
        plt.scatter(model.predict(TestX), resid);
        plt.axhline(0, color='red')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        #plt.xlim([1, 50])
        plt.show()

        print('Variance score: %.2f' % model.score(TestX, TestY))

        from statsmodels.stats.stattools import jarque_bera
        _, pvalue, _, _ = jarque_bera(resid)
        print ("Test Residuals Normal", pvalue)

        from statsmodels import regression, stats
        import statsmodels.api as sms
        import statsmodels.stats.diagnostic as smd
        # xs_with_constant = sms.add_constant(np.column_stack((X1,X2,X3,X4)))
        xs_with_constant = sms.add_constant(TestX)
        _, pvalue1, _, _ = stats.diagnostic.het_breushpagan(resid, xs_with_constant)
        print ("Test Heteroskedasticity", pvalue1)
        ljung_box = smd.acorr_ljungbox(resid, lags=10)

        #print "Lagrange Multiplier Statistics:", ljung_box[0]
        print "Test Autocorrelation P-values:", ljung_box[1]
        if any(ljung_box[1] < 0.05):
            print "The residuals are autocorrelated."
        else:
            print "The residuals are not autocorrelated."
Project: pyGPGO    Author: hawk31
def __init__(self, **params):
        """
        Wrapper around sklearn's ExtraTreesRegressor implementation for pyGPGO.
        Random Forests can also be used for surrogate models in Bayesian Optimization.
        An estimate of 'posterior' variance can be obtained by using the `impurity`
        criterion value in each subtree.

        Parameters
        ----------
        params: tuple, optional
            Any parameters to pass to `RandomForestRegressor`. Defaults to sklearn's.

        """
        self.params = params
Project: DSI-personal-reference-kit    Author: teb311
def random_forest_grid_search():
    random_forest_grid = {
        'n_estimators': [50, 100, 1000],
        'max_features': ['sqrt', 'log2', 'auto'],
        'min_samples_split': [2, 4],
        'min_samples_leaf': [1, 2],
    }
    rf = RandomForestRegressor()

    return random_forest_grid, rf
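A hedged sketch of wiring the returned grid into GridSearchCV; X_train and y_train are assumed to be prepared upstream, and the scoring choice here is illustrative:

from sklearn.model_selection import GridSearchCV

random_forest_grid, rf = random_forest_grid_search()
search = GridSearchCV(rf, random_forest_grid, cv=5,
                      scoring='neg_mean_squared_error', n_jobs=-1)
search.fit(X_train, y_train)  # X_train / y_train assumed prepared upstream
print(search.best_params_, -search.best_score_)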
Project: coremltools    Author: apple
def convert(model, feature_names, target):
    """Convert a boosted tree model to protobuf format.

    Parameters
    ----------
    decision_tree : RandomForestRegressor
        A trained scikit-learn tree model.

    feature_names: [str]
        Name of the input columns.

    target: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not(_HAS_SKLEARN):
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model, _ensemble.RandomForestRegressor)
    def is_rf_model(m):
        if len(m.estimators_) == 0:
            return False
        if hasattr(m, 'estimators_') and m.estimators_ is not None:
            for t in m.estimators_:
                if not hasattr(t, 'tree_') or t.tree_ is None:
                    return False
            return True
        else:
            return False
    _sklearn_util.check_fitted(model, is_rf_model)
    return _MLModel(_convert_tree_ensemble(model, feature_names, target))
Project: coremltools    Author: apple
def setUpClass(self):
        """
        Set up the unit test by loading the dataset and training a model.
        """
        from sklearn.datasets import load_boston
        from sklearn.ensemble import RandomForestRegressor

        scikit_data = load_boston()
        scikit_model = RandomForestRegressor(random_state = 1)
        scikit_model.fit(scikit_data['data'], scikit_data['target'])

        # Save the data and the model
        self.scikit_data = scikit_data
        self.scikit_model = scikit_model