Python sklearn.ensemble module: GradientBoostingRegressor() example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.ensemble.GradientBoostingRegressor().
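
Before the per-project examples, here is a minimal, self-contained sketch of the basic fit/predict API (the synthetic data and all parameter values are illustrative only, not taken from any of the projects below):

from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# Synthetic regression data, for illustration only.
X, y = make_regression(n_samples=500, n_features=10, noise=5.0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit a small ensemble and report the held-out R^2 score.
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
gbr.fit(X_train, y_train)
print(gbr.score(X_test, y_test))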

Project: DSI-personal-reference-kit    Author: teb311    | project source | source file
def cross_validate_best_known():
    '''
        Import and clean the tractor data, then run cross validation on each of the three models we are
        training here: a RandomForest, a GradientBoost, and an AdaBoost backed by a DecisionTree. Print
        the scores.

        The parameters we're using here are the "best" that we've found so far using a grid search.
    '''
    tractor_data = pd.read_csv('data/train.csv')
    tractor_data = cln.clean_all(tractor_data)
    X = tractor_data
    y = tractor_data.pop('SalePrice')

    rf = RandomForestRegressor(max_features=2, min_samples_split=4, n_estimators=50, min_samples_leaf=2)
    gb = GradientBoostingRegressor(loss='quantile', learning_rate=0.0001, n_estimators=50, max_features='log2', min_samples_split=2, max_depth=1)
    ada_tree_backing = DecisionTreeRegressor(max_features='sqrt', splitter='random', min_samples_split=4, max_depth=3)
    ab = AdaBoostRegressor(ada_tree_backing, learning_rate=0.1, loss='square', n_estimators=1000)

    validate.cross_v_scores([rf, gb, ab], X, y)
    # RandomForestRegressor -- RMLSE: -0.596797712098, R2: 0.0272065373946
    # GradientBoostingRegressor -- RMLSE: -0.996134592541, R2: -2.37202164829
    # AdaBoostRegressor -- RMLSE: -0.706385708459, R2: -0.103966980393
Project: HousePricePredictionKaggle    Author: Nuwantha    | project source | source file
def model_gradient_boosting_tree(Xtrain,Xtest,ytrain):

    X_train = Xtrain
    y_train = ytrain 
    gbr = GradientBoostingRegressor(random_state=0)
    param_grid = {
        'n_estimators': [800,1500],
        'max_features': [20,15],
        'max_depth': [8,10],
        'learning_rate': [0.1],
        'subsample': [1]
    }
    model = GridSearchCV(estimator=gbr, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
    model.fit(X_train, y_train)
    print('Gradient boosted tree regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return y_pred, -model.best_score_


# read data, build model and do prediction
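
The RMSE scoring object passed to GridSearchCV above is defined elsewhere in the project. A minimal sketch of how such a scorer is typically built with sklearn.metrics.make_scorer (the helper name and body here are assumptions, not the project's code):

import numpy as np
from sklearn.metrics import make_scorer, mean_squared_error

def rmse_score(y_true, y_pred):
    # Hypothetical stand-in for the project's RMSE metric.
    return np.sqrt(mean_squared_error(y_true, y_pred))

# greater_is_better=False makes GridSearchCV negate the score, which is
# why the snippet above reports -model.best_score_.
RMSE = make_scorer(rmse_score, greater_is_better=False)
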
Project: time_series_modeling    Author: rheineke    | project source | source file
def unscaled_pipelines():
    # Random forest parameters
    random_forest_kwargs = {
        'n_estimators': 10,
        'criterion': 'mse',
        'random_state': _RANDOM_STATE,
        'n_jobs': cpu_count(),
        'verbose': True,
    }
    # Gradient boosting parameters
    gradient_boost_kwargs = {
        'random_state': _RANDOM_STATE,
        'verbose': 1,
    }
    models = [
        DecisionTreeRegressor(max_depth=3, random_state=_RANDOM_STATE),
        # RandomForestRegressor(**random_forest_kwargs),
        # GradientBoostingRegressor(**gradient_boost_kwargs),
    ]
    pipelines = []
    for m in models:
        # Steps
        pipelines.append(make_pipeline(m))
    return pipelines
Project: mlens    Author: flennerhag    | project source | source file
def build_ensemble(**kwargs):
    """Generate ensemble."""

    ens = SuperLearner(**kwargs)
    prep = {'Standard Scaling': [StandardScaler()],
            'Min Max Scaling': [MinMaxScaler()],
            'No Preprocessing': []}

    est = {'Standard Scaling':
               [ElasticNet(), Lasso(), KNeighborsRegressor()],
           'Min Max Scaling':
               [SVR()],
           'No Preprocessing':
               [RandomForestRegressor(random_state=SEED),
                GradientBoostingRegressor()]}

    ens.add(est, prep)

    ens.add(GradientBoostingRegressor(), meta=True)

    return ens
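
A quick way to exercise build_ensemble on random data (the SEED constant is defined at module level in the project; the value and the data here are assumptions):

import numpy as np

SEED = 42  # assumed value
rng = np.random.RandomState(SEED)
X, y = rng.normal(size=(100, 5)), rng.normal(size=100)

ens = build_ensemble()  # SuperLearner with default arguments
ens.fit(X, y)
print(ens.predict(X)[:5])
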
Project: scikit-optimize    Author: scikit-optimize    | project source | source file
def test_gbrt_base_estimator():
    rng = np.random.RandomState(1)
    N = 10000
    X = np.ones((N, 1))
    y = rng.normal(size=N)

    base = RandomForestRegressor()
    rgr = GradientBoostingQuantileRegressor(base_estimator=base)
    assert_raise_message(ValueError, 'type GradientBoostingRegressor',
                         rgr.fit, X, y)

    base = GradientBoostingRegressor()
    rgr = GradientBoostingQuantileRegressor(base_estimator=base)
    assert_raise_message(ValueError, 'quantile loss', rgr.fit, X, y)

    base = GradientBoostingRegressor(loss='quantile', n_estimators=20)
    rgr = GradientBoostingQuantileRegressor(base_estimator=base)
    rgr.fit(X, y)

    estimates = rgr.predict(X, return_quantiles=True)
    assert_almost_equal(stats.norm.ppf(rgr.quantiles),
                        np.mean(estimates, axis=0),
                        decimal=2)
Project: datasciences    Author: BenChehade    | project source | source file
def fs_boruta(df):
    # do feature selection using boruta
    X = df[[x for x in df.columns if x!='SalePrice']]
    y = df['SalePrice']
    model = GradientBoostingRegressor()
    feat_selector = boruta_py.BorutaPy(model, n_estimators=100, verbose=12)

    # find all relevant features
    feat_selector.fit_transform(X.values, y.values)

    # check selected features
    features_bool = np.array(feat_selector.support_)
    features = np.array(X.columns)
    result = features[features_bool]
    #print(result)

    # check ranking of features
    features_rank = feat_selector.ranking_
    #print(features_rank)
    rank = features_rank[features_bool]
    #print(rank)

    return result
Project: strategy    Author: kanghua309    | project source | source file
def model_cross_valid(X,Y):
    seed = 7
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    def build_model(model_name):
        model = model_name()
        return model
    scoring = 'neg_mean_squared_error'
    # also try: random forest, boost, lstm, gbdt

    for model_name in [LinearRegression,ElasticNet]:
    #for model_name in [LinearRegression,Ridge,Lasso,ElasticNet,KNeighborsRegressor,DecisionTreeRegressor,SVR,RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor]:
        model = build_model(model_name)
        results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        print(model_name,results.mean())
Project: nirdizati-runtime    Author: nirdizati    | project source | source file
def __init__(self, nr_events, case_id_col, encoder_kwargs, cls_kwargs, cls_method="rf"):

        self.case_id_col = case_id_col
        self.nr_events = nr_events

        self.encoder = SequenceEncoder(nr_events=nr_events, case_id_col=case_id_col, **encoder_kwargs)

        if cls_method == "gbm":
            self.cls = GradientBoostingRegressor(**cls_kwargs)
        elif cls_method == "rf":
            self.cls = RandomForestRegressor(**cls_kwargs)
        else:
            print("Classifier method not known")
Project: CryptoBot    Author: AdeelMufti    | project source | source file
def grid_search(X, y, split, learn=[.01], samples_leaf=[250, 350, 500],
                depth=[10, 15]):
    '''
    Runs a grid search for GBM on split data
    '''
    for l in learn:
        for s in samples_leaf:
            for d in depth:
                model = GradientBoostingRegressor(n_estimators=250,
                                                  learning_rate=l,
                                                  min_samples_leaf=s,
                                                  max_depth=d,
                                                  random_state=42)
                model.fit(X.values[:split], y.values[:split])
                in_score = model.score(X.values[:split], y.values[:split])
                out_score = model.score(X.values[split:], y.values[split:])
                print('learning_rate: {}, min_samples_leaf: {}, max_depth: {}'
                      .format(l, s, d))
                print('in-sample score:', in_score)
                print('out-sample score:', out_score)
                print('')
Project: pyGPGO    Author: hawk31    | project source | source file
def __init__(self, q1=.16, q2=.84, **params):
        """
        Gradient boosted trees as surrogate model for Bayesian Optimization.
        Uses quantile regression for an estimate of the 'posterior' variance.
        In practice, the std is computed as (`q2` - `q1`) / 2.
        Relies on `sklearn.ensemble.GradientBoostingRegressor`

        Parameters
        ----------
        q1: float
            First quantile.
        q2: float
            Second quantile.
        params: dict
            Extra parameters to pass to `GradientBoostingRegressor`.

        """
        self.params = params
        self.q1 = q1
        self.q2 = q2
        self.eps = 1e-1
Project: pyGPGO    Author: hawk31    | project source | source file
def fit(self, X, y):
        """
        Fit a GBM model to data `X` and targets `y`.

        Parameters
        ----------
        X : array-like
            Input values.
        y: array-like
            Target values.
        """
        self.X = X
        self.y = y
        self.n = self.X.shape[0]
        self.modq1 = GradientBoostingRegressor(loss='quantile', alpha=self.q1, **self.params)
        self.modq2 = GradientBoostingRegressor(loss='quantile', alpha=self.q2, **self.params)
        self.mod = GradientBoostingRegressor(loss='ls', **self.params)
        self.modq1.fit(self.X, self.y)
        self.modq2.fit(self.X, self.y)
        self.mod.fit(self.X, self.y)
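
A companion predict consistent with the docstring above (mean from the squared-loss model, std computed as (q2 - q1) / 2 from the two quantile models) might look like the following sketch, assuming numpy is imported as np; the floor on the std is an assumption about how self.eps is used, not the project's exact code:

def predict(self, Xstar):
        """Sketch: surrogate mean and 'posterior' std from the three GBMs."""
        Xstar = np.atleast_2d(Xstar)
        mean = self.mod.predict(Xstar)
        # Spread between the upper and lower quantile models.
        std = (self.modq2.predict(Xstar) - self.modq1.predict(Xstar)) / 2
        return mean, np.maximum(std, self.eps)  # assumed use of self.eps
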
Project: coremltools    Author: apple    | project source | source file
def test_boston_OHE_plus_trees(self): 

        data = load_boston()

        pl = Pipeline([
            ("OHE", OneHotEncoder(categorical_features = [8], sparse=False)), 
            ("Trees",GradientBoostingRegressor(random_state = 1))])

        pl.fit(data.data, data.target)

        # Convert the model
        spec = convert(pl, data.feature_names, 'target')

        # Get predictions
        df = pd.DataFrame(data.data, columns=data.feature_names)
        df['prediction'] = pl.predict(data.data)

        # Evaluate it
        result = evaluate_regressor(spec, df, 'target', verbose = False)

        assert result["max_error"] < 0.0001
Project: tpai_comp    Author: luuuyi    | project source | source file
def gbdt_select_model(file_name):
    train_df = read_from_file(file_name)
    # feature 16
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    train_np = selected_train_df.values
    y = train_np[:,0]
    X = train_np[:,1:]

    print('Select Model...')
    start_time = datetime.datetime.now()
    gbdt = GradientBoostingRegressor()
    parameters = {'n_estimators': [100, 120], 'max_depth':[4, 5, 6]}
    grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters=grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print('Select Done..., Time Cost: %d' % ((end_time - start_time).seconds))
Project: tpai_comp    Author: luuuyi    | project source | source file
def test():
    iris = load_iris()  
    #print iris
    #print iris['target'].shape  
    gbdt=GradientBoostingRegressor(n_estimators=1000, max_depth=4) 
    gbdt.fit(iris.data[:120],iris.target[:120])

    #Save GBDT Model
    joblib.dump(gbdt, 'GBDT.model') 

    predict = gbdt.predict(iris.data[:120])
    total_err = 0
    for i in range(len(predict)):
        print(predict[i], iris.target[i])
        err = predict[i] - iris.target[i]
        total_err += err * err
    print('Training Error: %f' % (total_err / len(predict)))

    pred = gbdt.predict(iris.data[120:])
    error = 0
    for i in range(len(pred)):
        print(pred[i], iris.target[i+120])
        err = pred[i] - iris.target[i+120]
        error += err * err
    print('Test Error: %f' % (error / len(pred)))
Project: tpai_comp    Author: luuuyi    | project source | source file
def select_model(file_name):
    train_df = read_from_file(file_name)
    # feature 16
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby')
    train_np = selected_train_df.values
    y = train_np[:,0]
    X = train_np[:,1:]

    print('Select Model...')
    start_time = datetime.datetime.now()
    gbdt = GradientBoostingRegressor()
    parameters = {'n_estimators': [10000, 12000], 'max_depth':[16,15, 14]}
    grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters=grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print('Select Done..., Time Cost: %d' % ((end_time - start_time).seconds))
Project: tpai_comp    Author: luuuyi    | project source | source file
def generate_GBDT_model(file_name):
    train_df = read_from_file(file_name)
    # feature 18
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby|hometown|residence')
    train_np = selected_train_df.values
    y = train_np[:,0]
    X = train_np[:,1:]
    print('Train Gradient Boosting Regression Model...')
    start_time = datetime.datetime.now()
    gbdt = GradientBoostingRegressor(n_estimators=120, max_depth=10)
    gbdt.fit(X,y)
    end_time = datetime.datetime.now()
    print('Training Done..., Time Cost: ')
    print((end_time - start_time).seconds)

    print('Save Model...')
    joblib.dump(gbdt, 'GBDT.model')
    return gbdt
Project: House-Pricing    Author: playing-kaggle    | project source | source file
def GDBT_regression(X=train_df_munged,Y=label_df['SalePrice']):
    est = GradientBoostingRegressor(n_estimators=50,max_depth=3,learning_rate=0.1)
    X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=0)
    est.fit(X_train,Y_train)
    y_test_pred = est.predict(X_test)
    # Plot residuals on the held-out test split
    plt.scatter(y_test_pred, y_test_pred - Y_test, c='blue', marker='s', label='error on test data')

    plt.title("Regression with GBDT")
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()
    # Plot predictions against true values
    plt.scatter(Y_test, y_test_pred, c="blue", marker="s", label="Test data")

    plt.title("Regression with GBDT")
    plt.xlabel("Real values")
    plt.ylabel("Predicted values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()
    print('rmse value:', rmse(Y_test, y_test_pred))

    return est
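
The rmse helper called above (and the rmsle variant used in a later copy of this function) is project code not shown here; a minimal sketch under the assumption that it computes a plain root mean squared error:

import numpy as np
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    # Assumed implementation of the project's rmse helper.
    return np.sqrt(mean_squared_error(y_true, y_pred))
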
Project: Semantic-Texual-Similarity-Toolkits    Author: rgtjf    | project source | source file
def train_model(self, train_file_path, model_path):
        print("==> Load the data ...")
        X_train, Y_train = self.load_file(train_file_path)
        print(train_file_path, shape(X_train))

        print("==> Train the model ...")
        min_max_scaler = preprocessing.MaxAbsScaler()
        X_train_minmax = min_max_scaler.fit_transform(X_train)

        clf = GradientBoostingRegressor(n_estimators=self.n_estimators)
        clf.fit(X_train_minmax.toarray(), Y_train)

        print("==> Save the model ...")
        pickle.dump(clf, open(model_path, 'wb'))

        scaler_path = model_path.replace('.pkl', '.scaler.pkl')
        pickle.dump(min_max_scaler, open(scaler_path, 'wb'))
        return clf
Project: ML-note    Author: JasonK93    | project source | source file
def test_GradientBoostingRegressor_num(*data):
    '''
    test the performance with different n_estimators
    :param data:  train_data, test_data, train_value, test_value
    :return:   None
    '''
    X_train,X_test,y_train,y_test=data
    nums=np.arange(1,200,step=2)
    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    testing_scores=[]
    training_scores=[]
    for num in nums:
        regr=ensemble.GradientBoostingRegressor(n_estimators=num)
        regr.fit(X_train,y_train)
        training_scores.append(regr.score(X_train,y_train))
        testing_scores.append(regr.score(X_test,y_test))
    ax.plot(nums,training_scores,label="Training Score")
    ax.plot(nums,testing_scores,label="Testing Score")
    ax.set_xlabel("estimator num")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0,1.05)
    plt.suptitle("GradientBoostingRegressor")
    plt.show()
Project: ML-note    Author: JasonK93    | project source | source file
def test_GradientBoostingRegressor_maxdepth(*data):
    '''
    test the performance with different max_depth
    :param data:   train_data, test_data, train_value, test_value
    :return:  None
    '''
    X_train,X_test,y_train,y_test=data
    maxdepths=np.arange(1,20)
    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    testing_scores=[]
    training_scores=[]
    for maxdepth in maxdepths:
        regr=ensemble.GradientBoostingRegressor(max_depth=maxdepth,max_leaf_nodes=None)
        regr.fit(X_train,y_train)
        training_scores.append(regr.score(X_train,y_train))
        testing_scores.append(regr.score(X_test,y_test))
    ax.plot(maxdepths,training_scores,label="Training Score")
    ax.plot(maxdepths,testing_scores,label="Testing Score")
    ax.set_xlabel("max_depth")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(-1,1.05)
    plt.suptitle("GradientBoostingRegressor")
    plt.show()
Project: ML-note    Author: JasonK93    | project source | source file
def test_GradientBoostingRegressor_learning(*data):
    '''
    test the performance with different learning rate
    :param data:   train_data, test_data, train_value, test_value
    :return:  None
    '''
    X_train,X_test,y_train,y_test=data
    learnings=np.linspace(0.01,1.0)
    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    testing_scores=[]
    training_scores=[]
    for learning in learnings:
        regr=ensemble.GradientBoostingRegressor(learning_rate=learning)
        regr.fit(X_train,y_train)
        training_scores.append(regr.score(X_train,y_train))
        testing_scores.append(regr.score(X_test,y_test))
    ax.plot(learnings,training_scores,label="Training Score")
    ax.plot(learnings,testing_scores,label="Testing Score")
    ax.set_xlabel("learning_rate")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(-1,1.05)
    plt.suptitle("GradientBoostingRegressor")
    plt.show()
Project: ML-note    Author: JasonK93    | project source | source file
def test_GradientBoostingRegressor_subsample(*data):
    '''
    test the performance with different subsample
    :param data:    train_data, test_data, train_value, test_value
    :return:  None
    '''
    X_train,X_test,y_train,y_test=data
    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    subsamples=np.linspace(0.01,1.0,num=20)
    testing_scores=[]
    training_scores=[]
    for subsample in subsamples:
        regr=ensemble.GradientBoostingRegressor(subsample=subsample)
        regr.fit(X_train,y_train)
        training_scores.append(regr.score(X_train,y_train))
        testing_scores.append(regr.score(X_test,y_test))
    ax.plot(subsamples,training_scores,label="Training Score")
    ax.plot(subsamples,testing_scores,label="Testing Score")
    ax.set_xlabel("subsample")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(-1,1.05)
    plt.suptitle("GradientBoostingRegressor")
    plt.show()
Project: ML-note    Author: JasonK93    | project source | source file
def test_GradientBoostingRegressor_max_features(*data):
    '''
    test the performance with different max_features
    :param data:  train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train,X_test,y_train,y_test=data
    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    max_features=np.linspace(0.01,1.0)
    testing_scores=[]
    training_scores=[]
    for features in max_features:
        regr=ensemble.GradientBoostingRegressor(max_features=features)
        regr.fit(X_train,y_train)
        training_scores.append(regr.score(X_train,y_train))
        testing_scores.append(regr.score(X_test,y_test))
    ax.plot(max_features,training_scores,label="Training Score")
    ax.plot(max_features,testing_scores,label="Testing Score")
    ax.set_xlabel("max_features")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0,1.05)
    plt.suptitle("GradientBoostingRegressor")
    plt.show()
Project: Parallel-SGD    Author: angadgill    | project source | source file
def test_feature_importances():
    X = np.array(boston.data, dtype=np.float32)
    y = np.array(boston.target, dtype=np.float32)

    for presort in True, False:
        clf = GradientBoostingRegressor(n_estimators=100, max_depth=5,
                                        min_samples_split=2, random_state=1,
                                        presort=presort)
        clf.fit(X, y)
        assert_true(hasattr(clf, 'feature_importances_'))

        # XXX: Remove this test in 0.19 after transform support to estimators
        # is removed.
        X_new = assert_warns(
            DeprecationWarning, clf.transform, X, threshold="mean")
        assert_less(X_new.shape[1], X.shape[1])
        feature_mask = (
            clf.feature_importances_ > clf.feature_importances_.mean())
        assert_array_almost_equal(X_new, X[:, feature_mask])
Project: Parallel-SGD    Author: angadgill    | project source | source file
def test_staged_predict():
    # Test whether staged decision function eventually gives
    # the same prediction.
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=1, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test = X[200:]
    clf = GradientBoostingRegressor()
    # test raise ValueError if not fitted
    assert_raises(ValueError, lambda X: np.fromiter(
        clf.staged_predict(X), dtype=np.float64), X_test)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # test if prediction for last stage equals ``predict``
    for y in clf.staged_predict(X_test):
        assert_equal(y.shape, y_pred.shape)

    assert_array_equal(y_pred, y)
Project: Parallel-SGD    Author: angadgill    | project source | source file
def test_staged_functions_defensive():
    # test that staged_functions make defensive copies
    rng = np.random.RandomState(0)
    X = rng.uniform(size=(10, 3))
    y = (4 * X[:, 0]).astype(int) + 1  # don't predict zeros
    for estimator in [GradientBoostingRegressor(),
                      GradientBoostingClassifier()]:
        estimator.fit(X, y)
        for func in ['predict', 'decision_function', 'predict_proba']:
            staged_func = getattr(estimator, "staged_" + func, None)
            if staged_func is None:
                # regressor has no staged_predict_proba
                continue
            with warnings.catch_warnings(record=True):
                staged_result = list(staged_func(X))
            staged_result[1][:] = 0
            assert_true(np.all(staged_result[0] != 0))
Project: Parallel-SGD    Author: angadgill    | project source | source file
def test_warm_start_oob():
    # Test if warm start OOB equals fit.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]:
        est = Cls(n_estimators=200, max_depth=1, subsample=0.5,
                  random_state=1)
        est.fit(X, y)

        est_ws = Cls(n_estimators=100, max_depth=1, subsample=0.5,
                     random_state=1, warm_start=True)
        est_ws.fit(X, y)
        est_ws.set_params(n_estimators=200)
        est_ws.fit(X, y)

        assert_array_almost_equal(est_ws.oob_improvement_[:100],
                                  est.oob_improvement_[:100])
Project: Parallel-SGD    Author: angadgill    | project source | source file
def test_multi_target_sample_weights():
    # weighted regressor
    Xw = [[1,2,3], [4,5,6]]
    yw = [[3.141, 2.718], [2.718, 3.141]]
    w = [2., 1.]
    rgr_w = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1,2,3], [1,2,3], [4,5,6]]
    y = [[3.141, 2.718], [3.141, 2.718], [2.718, 3.141]]
    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr.fit(X, y)

    X_test = [[1.5,2.5,3.5], [3.5,4.5,5.5]]
    assert_almost_equal(rgr.predict(X_test), rgr_w.predict(X_test))
Project: AutoML5    Author: djajetic    | project source | source file
def __init__(self, info, verbose=True, debug_mode=False):
        self.label_num=info['label_num']
        self.target_num=info['target_num']
        self.task = info['task']
        self.metric = info['metric']
        self.postprocessor = None
        #self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=True) # To calibrate proba
        self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False) # To calibrate proba
        if debug_mode>=2:
            self.name = "RandomPredictor"
            self.model = RandomPredictor(self.target_num)
            self.predict_method = self.model.predict_proba 
            return
        if info['task']=='regression':
            if info['is_sparse']==True:
                self.name = "BaggingRidgeRegressor"
                self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...
            else:
                self.name = "GradientBoostingRegressor"
                self.model = GradientBoostingRegressor(n_estimators=1,  max_depth=4, min_samples_split=14, verbose=verbose, warm_start = True)
            self.predict_method = self.model.predict # Regression: predict values directly, not probabilities
        else:
            if info['has_categorical']: # Out of laziness, we do not convert categorical variables...
                self.name = "RandomForestClassifier"
                self.model = RandomForestClassifier(n_estimators=1, verbose=verbose) # unfortunately, no warm start...
            elif info['is_sparse']:                
                self.name = "BaggingNBClassifier"
                self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...                          
            else:
                self.name = "GradientBoostingClassifier"
                self.model = eval(self.name + "(n_estimators=1, verbose=" + str(verbose) + ", random_state=1, warm_start = True)")
            if info['task']=='multilabel.classification':
                self.model = MultiLabelEnsemble(self.model)
            self.predict_method = self.model.predict_proba
Project: hyperband    Author: zygmuntz    | project source | source file
def try_params( n_iterations, params ):

    n_estimators = int( round( n_iterations * trees_per_iteration ))
    print "n_estimators:", n_estimators
    pprint( params )

    clf = GB( n_estimators = n_estimators, verbose = 0, **params )

    return train_and_eval_sklearn_regressor( clf, data )
Project: simple_rl    Author: david-abel    | project source | source file
def add_new_weak_learner(self):
        '''
        Summary:
            Adds a new function, h, to self.weak_learners by solving for Eq. 1 using multiple additive regression trees:

            [Eq. 1] h = argmin_h (sum_i Q_A(s_i,a_i) + h(s_i, a_i) - (r_i + max_b Q_A(s'_i, b)))

        '''
        if len(self.most_recent_episode) == 0:
            # If this episode contains no data, don't do anything.
            return

        # Build up data sets of features and loss terms
        data = np.zeros((len(self.most_recent_episode), self.max_state_features + 1))
        total_loss = np.zeros(len(self.most_recent_episode))

        for i, experience in enumerate(self.most_recent_episode):
            # Grab the experience.
            s, a, r, s_prime = experience

            # Pad in case the state features are too short (as in Atari sometimes).
            features = self._pad_features_with_zeros(s, a)
            loss = (r + self.gamma * self.get_max_q_value(s_prime) - self.get_q_value(s, a))

            # Add to relevant lists.
            data[i] = features
            total_loss[i] = loss

        # Compute new regressor and add it to the weak learners.
        estimator = GradientBoostingRegressor(loss='ls', n_estimators=1, max_depth=self.max_depth)
        estimator.fit(data, total_loss)
        self.weak_learners.append(estimator)
Project: auto_ml    Author: doordash    | project source | source file
def score(self, estimator, X, y, took_log_of_y=False, advanced_scoring=False, verbose=2, name=None):
        X, y = utils.drop_missing_y_vals(X, y, output_column=None)

        if isinstance(estimator, GradientBoostingRegressor):
            X = X.toarray()

        predictions = estimator.predict(X)

        if took_log_of_y:
            for idx, val in enumerate(predictions):
                predictions[idx] = math.exp(val)

        try:
            score = self.scoring_func(y, predictions)
        except ValueError:

            bad_val_indices = []
            for idx, val in enumerate(y):
                if str(val) in bad_vals_as_strings:
                    bad_val_indices.append(idx)

            predictions = [val for idx, val in enumerate(predictions) if idx not in bad_val_indices]
            y = [val for idx, val in enumerate(y) if idx not in bad_val_indices]

            print('Found ' + str(len(bad_val_indices)) + ' null or infinity values in the y values. We will ignore these, and report the score on the rest of the dataset')
            score = self.scoring_func(y, predictions)

        if advanced_scoring == True:
            if hasattr(estimator, 'name'):
                print(estimator.name)
            advanced_scoring_regressors(predictions, y, verbose=verbose, name=name)
        return -1 * score
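
bad_vals_as_strings is a module-level constant in auto_ml that this snippet does not show; a plausible sketch (an assumption, not the library's exact definition):

# Assumed: string forms of target values that cannot be scored.
bad_vals_as_strings = {'nan', 'None', 'inf', '-inf'}
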
Project: Supply-demand-forecasting    Author: LevinJ    | project source | source file
def setClf(self):
        self.clf = GradientBoostingRegressor(n_estimators=100, verbose=100)
#         self.clf = GradientBoostingRegressor(loss = 'ls', verbose = 300, n_estimators=70,    learning_rate= 0.1,subsample=1.0, max_features = 1.0)
        return
Project: AutoML4    Author: djajetic    | project source | source file
def __init__(self, info, verbose=True, debug_mode=False):
        self.label_num=info['label_num']
        self.target_num=info['target_num']
        self.task = info['task']
        self.metric = info['metric']
        self.postprocessor = None
        #self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=True) # To calibrate proba
        self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False) # To calibrate proba
        if debug_mode>=2:
            self.name = "RandomPredictor"
            self.model = RandomPredictor(self.target_num)
            self.predict_method = self.model.predict_proba 
            return
        if info['task']=='regression':
            if info['is_sparse']==True:
                self.name = "BaggingRidgeRegressor"
                self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...
            else:
                self.name = "GradientBoostingRegressor"
                self.model = GradientBoostingRegressor(n_estimators=1,  max_depth=4, min_samples_split=14, verbose=verbose, warm_start = True)
            self.predict_method = self.model.predict # Regression: predict values directly, not probabilities
        else:
            if info['has_categorical']: # Out of laziness, we do not convert categorical variables...
                self.name = "RandomForestClassifier"
                self.model = RandomForestClassifier(n_estimators=1, verbose=verbose) # unfortunately, no warm start...
            elif info['is_sparse']:                
                self.name = "BaggingNBClassifier"
                self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose) # unfortunately, no warm start...                          
            else:
                self.name = "GradientBoostingClassifier"
                self.model = eval(self.name + "(n_estimators=1, verbose=" + str(verbose) + ", random_state=1, warm_start = True)")
            if info['task']=='multilabel.classification':
                self.model = MultiLabelEnsemble(self.model)
            self.predict_method = self.model.predict_proba
Project: fluentopt    Author: mehdidc    | project source | source file
def run_batch(batch):
        for num_iters, params in batch:
            max_depth = params['max_depth']
            learning_rate = params['learning_rate']
            num_iters = int(num_iters)
            reg = GradientBoostingRegressor(
                learning_rate=learning_rate, 
                max_depth=max_depth, 
                n_estimators=num_iters)
            reg.fit(X_train, y_train)
            mse = ((reg.predict(X_test) - y_test)**2).mean()
            yield mse
Project: datasciences    Author: BenChehade    | project source | source file
def greedy_elim(df):

    # do feature selection using boruta
    X = df[[x for x in df.columns if x!='SalePrice']]
    y = df['SalePrice']
    #model = RandomForestRegressor(n_estimators=50)
    model = GradientBoostingRegressor(n_estimators=50, learning_rate=0.05)
    # 150 features seems to be the best at the moment. Why this is is unclear.
    feat_selector = RFE(estimator=model, step=1, n_features_to_select=150)

    # find all relevant features
    feat_selector.fit_transform(X.values, y.values)

    # check selected features
    features_bool = np.array(feat_selector.support_)
    features = np.array(X.columns)
    result = features[features_bool]
    #print(result)

    # check ranking of features
    features_rank = feat_selector.ranking_
    #print(features_rank)
    rank = features_rank[features_bool]
    #print(rank)

    return result
Project: coremltools    Author: gsabran    | project source | source file
def convert(model, input_features, output_features):
    """Convert a boosted tree model to protobuf format.

    Parameters
    ----------
    model : GradientBoostingRegressor
        A trained scikit-learn tree model.

    input_features: [str]
        Names of the input columns.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not(_HAS_SKLEARN):
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model, _ensemble.GradientBoostingRegressor)
    def is_gbr_model(m):
        # Check the fitted attribute exists before touching it, so an
        # unfitted model fails the check instead of raising AttributeError.
        if not hasattr(m, 'estimators_') or m.estimators_ is None:
            return False
        if len(m.estimators_) == 0:
            return False
        for t in m.estimators_.flatten():
            if not hasattr(t, 'tree_') or t.tree_ is None:
                return False
        return True

    _sklearn_util.check_fitted(model, is_gbr_model)

    base_prediction = model.init_.mean

    return _MLModel(_convert_tree_ensemble(model, input_features, output_features,
            base_prediction = base_prediction))
Project: strategy    Author: kanghua309    | project source | source file
def model_fit_and_test(TrainX,TrainY,TestX,TestY):
    def build_model(model_name):
        model = model_name()
        return model
    #for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR,RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
    for model_name in [LinearRegression, ElasticNet]:
        model = build_model(model_name)
        model.fit(TrainX,TrainY)
        print(model_name)
        resid = model.predict(TestX) - TestY
        #print resid
        print("Residual sum of squares: %f"% np.mean(resid ** 2))
        #print model.predict(TestX)
        #print TestY
        # Explained variance score: 1 is perfect prediction
        plt.scatter(model.predict(TestX), resid);
        plt.axhline(0, color='red')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        #plt.xlim([1, 50])
        plt.show()

        print('Variance score: %.2f' % model.score(TestX, TestY))

        from statsmodels.stats.stattools import jarque_bera
        _, pvalue, _, _ = jarque_bera(resid)
        print ("Test Residuals Normal", pvalue)

        from statsmodels import regression, stats
        import statsmodels.api as sms
        import statsmodels.stats.diagnostic as smd
        # xs_with_constant = sms.add_constant(np.column_stack((X1,X2,X3,X4)))
        xs_with_constant = sms.add_constant(TestX)
        _, pvalue1, _, _ = stats.diagnostic.het_breushpagan(resid, xs_with_constant)
        print ("Test Heteroskedasticity", pvalue1)
        ljung_box = smd.acorr_ljungbox(resid, lags=10)

        #print "Lagrange Multiplier Statistics:", ljung_box[0]
        print "Test Autocorrelation P-values:", ljung_box[1]
        if any(ljung_box[1] < 0.05):
            print "The residuals are autocorrelated."
        else:
            print "The residuals are not autocorrelated."
Project: CryptoBot    Author: AdeelMufti    | project source | source file
def fit_boosting(X, y, window=100000, estimators=250, learning=.01,
                 samples_leaf=500, depth=20, validate=False):
    '''
    Fits Gradient Boosting
    '''
    model = GradientBoostingRegressor(n_estimators=estimators,
                                      learning_rate=learning,
                                      min_samples_leaf=samples_leaf,
                                      max_depth=depth,
                                      random_state=42)
    if validate:
        return cross_validate(X, y, model, window)
    return model.fit(X, y)
Project: kdd2017    Author: JinpengLI    | project source | source file
def __init__(self, **kwargs):
        #print("kwargs=", kwargs)
        self.is_boxcox = kwargs.get("is_boxcox", False)
        self.boxcox_lambda = kwargs.get("boxcox_lambda", 0.0)
        self.Model = kwargs.get("model", GradientBoostingRegressor)
        if "is_boxcox" in kwargs:
            kwargs.pop("is_boxcox")
        if "boxcox_lambda" in kwargs:
            kwargs.pop("boxcox_lambda")
        if "model" in kwargs:
            kwargs.pop("model")
        self.clf = self.Model(**kwargs)
Project: DSI-personal-reference-kit    Author: teb311    | project source | source file
def gradient_boost_grid_search():
    gradient_boost_grid = {
        'loss': ['ls', 'lad', 'huber', 'quantile'],
        'learning_rate': [.0001, .001, .01, .1, 1],
        'n_estimators': [50, 100, 1000, 10000],
        'max_depth': [1, 3],
        'min_samples_split': [2, 4, 10],
        'max_features': ['sqrt', 'log2'],
    }
    gb = GradientBoostingRegressor()

    return gradient_boost_grid, gb
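
A sketch of wiring the returned grid into sklearn's GridSearchCV (hypothetical usage; X and y are assumed to be prepared elsewhere, and the full grid is large, so expect a long search):

from sklearn.model_selection import GridSearchCV

grid, gb = gradient_boost_grid_search()
search = GridSearchCV(gb, param_grid=grid, cv=3, n_jobs=-1,
                      scoring='neg_mean_squared_error')
# search.fit(X, y) would evaluate every parameter combination.
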
Project: coremltools    Author: apple    | project source | source file
def convert(model, input_features, output_features):
    """Convert a boosted tree model to protobuf format.

    Parameters
    ----------
    model : GradientBoostingRegressor
        A trained scikit-learn tree model.

    input_features: [str]
        Names of the input columns.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not(_HAS_SKLEARN):
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model, _ensemble.GradientBoostingRegressor)
    def is_gbr_model(m):
        # Check the fitted attribute exists before touching it, so an
        # unfitted model fails the check instead of raising AttributeError.
        if not hasattr(m, 'estimators_') or m.estimators_ is None:
            return False
        if len(m.estimators_) == 0:
            return False
        for t in m.estimators_.flatten():
            if not hasattr(t, 'tree_') or t.tree_ is None:
                return False
        return True

    _sklearn_util.check_fitted(model, is_gbr_model)

    base_prediction = model.init_.mean

    return _MLModel(_convert_tree_ensemble(model, input_features, output_features,
            base_prediction = base_prediction))
Project: coremltools    Author: apple    | project source | source file
def setUpClass(cls):
        """
        Set up the unit test by loading the dataset and training a model.
        """
        if not HAS_SKLEARN:
            return

        scikit_data = load_boston()
        scikit_model = GradientBoostingRegressor(random_state = 1)
        scikit_model.fit(scikit_data['data'], scikit_data['target'])

        # Save the data and the model
        cls.scikit_data = scikit_data
        cls.scikit_model = scikit_model
Project: coremltools    Author: apple    | project source | source file
def test_conversion_bad_inputs(self):

        # Error on converting an untrained model
        with self.assertRaises(Exception):
            model = GradientBoostingRegressor()
            spec = skl_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion.
        with self.assertRaises(Exception):
            model = OneHotEncoder()
            spec = skl_converter.convert(model, 'data', 'out')
Project: coremltools    Author: apple    | project source | source file
def test_conversion_bad_inputs(self):

        # Error on converting an untrained model
        with self.assertRaises(TypeError):
            model = GradientBoostingRegressor()
            spec = xgb_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion
        with self.assertRaises(TypeError):
            model = OneHotEncoder()
            spec = xgb_converter.convert(model, 'data', 'out')
Project: tpai_comp    Author: luuuyi    | project source | source file
def test_model_select_by_param():
    iris = load_iris()    
    gbdt = GradientBoostingRegressor() 
    parameters = {'n_estimators': [1000, 5000], 'max_depth':[3,4]}
    grid_search = GridSearchCV(estimator=gbdt, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(iris.data[:150],iris.target[:150])
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters=grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
Project: auto_ml    Author: ClimbsRocks    | project source | source file
def score(self, estimator, X, y, took_log_of_y=False, advanced_scoring=False, verbose=2, name=None):
        X, y = utils.drop_missing_y_vals(X, y, output_column=None)

        if isinstance(estimator, GradientBoostingRegressor):
            X = X.toarray()

        predictions = estimator.predict(X)

        if took_log_of_y:
            for idx, val in enumerate(predictions):
                predictions[idx] = math.exp(val)

        try:
            score = self.scoring_func(y, predictions)
        except ValueError:

            bad_val_indices = []
            for idx, val in enumerate(y):
                if str(val) in bad_vals_as_strings or str(predictions[idx]) in bad_vals_as_strings:
                    bad_val_indices.append(idx)

            predictions = [val for idx, val in enumerate(predictions) if idx not in bad_val_indices]
            y = [val for idx, val in enumerate(y) if idx not in bad_val_indices]

            print('Found ' + str(len(bad_val_indices)) + ' null or infinity values in the predicted or y values. We will ignore these, and report the score on the rest of the dataset')
            score = self.scoring_func(y, predictions)

        if advanced_scoring == True:
            if hasattr(estimator, 'name'):
                print(estimator.name)
            advanced_scoring_regressors(predictions, y, verbose=verbose, name=name)
        return -1 * score
Project: mlprojects-py    Author: srinathperera    | project source | source file
def regression_with_GBR(X_train, y_train, X_test, y_test, parmsFromNormalization,
                        params={'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
                                'learning_rate': 0.01, 'loss': 'ls'}):
    # GradientBoostingRegressor
    gfr = GradientBoostingRegressor(**params)
    gfr.fit(X_train, y_train)
    y_pred_gbr = gfr.predict(X_test)
    print_regression_model_summary("GBR", y_test, y_pred_gbr, parmsFromNormalization)
    print_feature_importance(X_test, y_test,gfr.feature_importances_)

    # cross validation (not sure this makes sense for regression)
    #http://scikit-learn.org/stable/modules/cross_validation.html
    #gfr = GradientBoostingRegressor(**params)
    #scores = cross_validation.cross_val_score(gfr, X_train, y_train, cv=5)
    #print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    return y_pred_gbr
Project: House-Pricing    Author: playing-kaggle    | project source | source file
def GDBT_regression(X=train_split,Y=y):
    est = GradientBoostingRegressor(n_estimators=75,max_depth=3,learning_rate=0.1)
    X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=0)
    est.fit(X_train,Y_train)
    y_test_pred = est.predict(X_test)
    # Plot residuals on the held-out test split
    plt.scatter(y_test_pred, y_test_pred - Y_test, c='blue', marker='s', label='error on test data')

    plt.title("Regression with GBDT")
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()
    # Plot predictions against true values
    plt.scatter(Y_test, y_test_pred, c="blue", marker="s", label="Test data")

    plt.title("Regression with GBDT")
    plt.xlabel("Real values")
    plt.ylabel("Predicted values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()
    print('rmse value:', rmsle(Y_test, y_test_pred))

    return est


# linear_regression()
# ridge_regression()
# Lasso_regression()
#model = Elasticnet_regression()
# '''
#         predict final result
#  '''
#
#
# coefs,lasso = Lasso_regression()
# selected_features = coefs[coefs['value'] != 0].index.values
# train_new = train_split[selected_features]
Project: groot    Author: zhpmatrix    | project source | source file
def gbr(X,y):
    X_train,X_validation,y_train,y_validation = train_test_split(X,y,random_state=0)
    sklearn_boost = GradientBoostingRegressor(random_state=1)
    sklearn_boost.fit(X_train,y_train.ravel())
    print('training error:', 1.0 - sklearn_boost.score(X_train, y_train))
    print('validation error:', 1.0 - sklearn_boost.score(X_validation, y_validation))
    time_fit(sklearn_boost,X_train,y_train.ravel())