Python sklearn.linear_model 模块,SGDClassifier() 实例源码

我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用sklearn.linear_model.SGDClassifier()

项目:johnson-county-ddj-public    作者:dssg    | 项目源码 | 文件源码
def get_feature_importance(self, clf, model_name):
        """Return the fitted model's per-feature weights as a plain list.

        Args:
            clf: Fitted estimator whose attributes are read.
            model_name: Name of the estimator type. It must be one of the
                keys of the table below; any other name raises ``KeyError``.

        Returns:
            ``list(clf.feature_importances_)`` for the tree/ensemble models,
            ``clf.coef_.tolist()`` for the linear models, and ``None`` for
            models registered without an importance attribute.
        """
        by_importances = 'feature_importances'
        by_coef = 'coef'
        attribute_for = {
            'RandomForestClassifier': by_importances,
            'ExtraTreesClassifier': by_importances,
            'AdaBoostClassifier': by_importances,
            'LogisticRegression': by_coef,
            'svm.SVC': by_coef,
            'GradientBoostingClassifier': by_importances,
            'GaussianNB': None,
            'DecisionTreeClassifier': by_importances,
            'SGDClassifier': by_coef,
            'KNeighborsClassifier': None,
            'linear.SVC': by_coef,
        }

        attribute = attribute_for[model_name]
        if attribute == by_importances:
            return list(clf.feature_importances_)
        if attribute == by_coef:
            return list(clf.coef_.tolist())
        return None
项目:pybot    作者:spillai    | 项目源码 | 文件源码
def __init__(self, filename, target_map, classifier='svm'):
        """Store bookkeeping state and build the base classifier.

        Args:
            filename: Stored verbatim on ``self.filename_``.
            target_map: Mapping whose keys are the target ids; the unique
                keys are kept as an int32 array on ``self.target_ids_``.
            classifier: ``'svm'`` or ``'sgd'``.

        Raises:
            Exception: If ``classifier`` is neither ``'svm'`` nor ``'sgd'``.
        """
        self.seed_ = 0
        self.filename_ = filename
        self.target_map_ = target_map
        # Materialise the keys first: on Python 3 ``dict.keys()`` is a view,
        # which numpy would wrap as one opaque object instead of an id array.
        self.target_ids_ = (np.unique(list(target_map.keys()))).astype(np.int32)
        self.epoch_no_ = 0
        self.st_time_ = time.time()

        # Setup classifier
        print('-------------------------------')
        print('====> Building Classifier, setting class weights')
        if classifier == 'svm':
            self.clf_hyparams_ = {'C':[0.01, 0.1, 1.0, 10.0, 100.0], 'class_weight': ['balanced']}
            self.clf_base_ = LinearSVC(random_state=self.seed_)
        elif classifier == 'sgd':
            # NOTE(review): the svm branch uses 'balanced' while this one uses
            # 'auto', and this branch sets ``clf_`` rather than ``clf_base_``;
            # confirm both are intentional for the scikit-learn version in use.
            self.clf_hyparams_ = {'alpha':[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0], 'class_weight':['auto']} # 'loss':['hinge'],
            self.clf_ = SGDClassifier(loss='log', penalty='l2', shuffle=False, random_state=self.seed_,
                                      warm_start=True, n_jobs=-1, n_iter=1, verbose=4)
        else:
            # Only list the options this constructor actually handles.
            raise Exception('Unknown classifier type %s. Choose from [sgd, svm]'
                            % classifier)
项目:oss-github-analysis-project    作者:itu-oss-project-team    | 项目源码 | 文件源码
def __create_classifiers(self):
        """Build the list of classifier descriptors.

        Returns:
            list: Dicts with a ``"func"`` estimator instance and a ``"name"``
            label, in the order sgd, knn1, knn3, knn5, naive_bayes.
        """
        sgd_entry = {"func": linear_model.SGDClassifier(loss="log"), "name": "sgd"}
        knn_entries = [
            {"func": neighbors.KNeighborsClassifier(k, weights='distance'), "name": label}
            for k, label in ((1, "knn1"), (3, "knn3"), (5, "knn5"))
        ]
        bayes_entry = {"func": GaussianNB(), "name": "naive_bayes"}

        # Currently disabled candidates:
        # classifiers.append({"func": tree.DecisionTreeClassifier(), "name": "decision_tree"})
        # classifiers.append({"func": MLPClassifier(max_iter=10000), "name": "mlp"})
        # classifiers.append({"func": RandomForestClassifier(), "name": "random_forest"})
        return [sgd_entry] + knn_entries + [bayes_entry]
项目:johnson-county-ddj-public    作者:dssg    | 项目源码 | 文件源码
def define_model(self, model, parameters, n_cores = 0):
        """Instantiate the estimator registered under ``model``.

        Only the requested estimator is constructed; the table holds
        zero-argument factories so the other candidates are never built.

        Args:
            model: Name of the estimator; must be a key of the table below.
            parameters: Dict of keyword parameters applied with
                ``set_params`` on top of the defaults listed here.
            n_cores: Accepted for interface compatibility; not used by this
                method.

        Returns:
            The configured (unfitted) estimator.

        Raises:
            ConfigError: If ``model`` is not a supported name.
        """
        factories = {
            'RandomForestClassifier': lambda: RandomForestClassifier(n_estimators=50, n_jobs=7),
            'ExtraTreesClassifier': lambda: ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
            'AdaBoostClassifier': lambda: AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
            'LogisticRegression': lambda: LogisticRegression(penalty='l1', C=1e5),
            'svm.SVC': lambda: svm.SVC(kernel='linear', probability=True, random_state=0),
            'GradientBoostingClassifier': lambda: GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
            'GaussianNB': lambda: GaussianNB(),
            'DecisionTreeClassifier': lambda: DecisionTreeClassifier(),
            'SGDClassifier': lambda: SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
            'KNeighborsClassifier': lambda: KNeighborsClassifier(n_neighbors=3),
            'linear.SVC': lambda: svm.LinearSVC(),
        }

        if model not in factories:
            raise ConfigError("Unsupported model {}".format(model))

        clf = factories[model]()
        clf.set_params(**parameters)
        return clf
项目:hyperband    作者:zygmuntz    | 项目源码 | 文件源码
def try_params( n_iterations, params ):
    """Train and evaluate one configuration for ``n_iterations`` iterations.

    Args:
        n_iterations: Iteration budget; rounded to the nearest integer.
        params: Dict of classifier keyword arguments plus a ``'scaler'``
            entry (a class name to instantiate, or a falsy value for no
            scaling).

    Returns:
        Whatever ``train_and_eval_sklearn_classifier`` returns for the
        classifier and (optionally scaled) data.
    """

    n_iterations = int( round( n_iterations ))
    print "n_iterations:", n_iterations
    pprint( params )

    if params['scaler']:
        # NOTE(review): eval() instantiates whatever name is stored in
        # params['scaler']; this is only safe while the search space is
        # defined by trusted code.
        scaler = eval( "{}()".format( params['scaler'] ))
        # Fit the scaler on the training split only, then reuse it for test.
        x_train_ = scaler.fit_transform( data['x_train'].astype( float ))
        x_test_ = scaler.transform( data['x_test'].astype( float ))

        local_data = { 'x_train': x_train_, 'y_train': data['y_train'], 
          'x_test': x_test_, 'y_test': data['y_test'] }
    else:
        # No scaling requested: use the module-level data as is.
        local_data = data

    # we need a copy because at the next small round the best params will be re-used
    params_ = dict( params )
    params_.pop( 'scaler' )

    # SGD is defined outside this block — presumably an alias of SGDClassifier.
    clf = SGD( n_iter = n_iterations, **params_ )

    return train_and_eval_sklearn_classifier( clf, local_data )
项目:TrackToTrip    作者:ruipgil    | 项目源码 | 文件源码
def learn(self, features, labels):
        """ Fits the classifier

        If its state is empty, the classifier is fitted; otherwise it is
        partially fitted (when the classifier supports ``partial_fit``).
        See sklearn's SGDClassifier fit and partial_fit methods.

        Args:
            features (:obj:`list` of :obj:`list` of :obj:`float`)
            labels (:obj:`list` of :obj:`str`): Labels for each set of features.
                New labels are learnt.
        """
        flat_labels = np.ravel(labels)
        self.__learn_labels(flat_labels)
        if len(flat_labels) == 0:
            # Nothing to train on.
            return

        encoded = self.labels.transform(flat_labels)
        seen_features_before = self.feature_length > 0
        if seen_features_before and hasattr(self.clf, 'partial_fit'):
            # FIXME? check docs, may need to pass class=[...]
            self.clf = self.clf.partial_fit(features, encoded)
            return

        # First fit (or an estimator without partial_fit): full fit, and
        # remember the feature vector length.
        self.clf = self.clf.fit(features, encoded)
        self.feature_length = len(features[0])
项目:textar    作者:datosgobar    | 项目源码 | 文件源码
def make_classifier(self, name, ids, labels):
        """Train an SVM classifier on the loaded texts.

        Creates a classifier that is stored on the object under ``name``.

        Args:
            name (str): Name for the classifier.
            ids (list): A list of N ids of texts already stored in the
                TextClassifier.
            labels (list): A list of N labels, one for each text id present
                in ``ids``.

        Raises:
            ValueError: If any entry of ``ids`` is not found in ``self.ids``.

        Note:
            Uses the `Scikit-learn <http://scikit-learn.org/>`_ classifier.
        """
        if not np.isin(ids, self.ids).all():
            # Adjacent literals instead of a backslash continuation, so the
            # source indentation no longer ends up inside the message.
            raise ValueError("Hay ids de textos que no se encuentran "
                             "almacenados.")
        classifier = SGDClassifier()
        setattr(self, name, classifier)
        # NOTE(review): searchsorted only yields the right rows if self.ids
        # is sorted — confirm against the code that maintains self.ids.
        indices = np.searchsorted(self.ids, ids)
        classifier.fit(self.tfidf_mat[indices, :], labels)
项目:searchgrid    作者:jnothman    | 项目源码 | 文件源码
def test_build_param_grid_set_estimator():
    """build_param_grid expands a grid whose 'clf' step is itself searched.

    Candidates carrying their own grid get a dedicated entry; candidates
    without one (here the logistic regression and the SGD classifier) are
    expected to share a single entry.
    """
    linear_svc = SVC()
    logreg = LogisticRegression()
    poly_svc = SVC()
    sgd = SGDClassifier()

    selector = set_grid(SelectKBest(), k=[2, 3])
    pipeline = Pipeline([('sel', selector), ('clf', None)])
    candidates = [
        set_grid(linear_svc, kernel=['linear']),
        logreg,
        set_grid(poly_svc, kernel=['poly'], degree=[2, 3]),
        sgd,
    ]
    estimator = set_grid(pipeline, clf=candidates)

    expected = [
        {'clf': [linear_svc], 'clf__kernel': ['linear'], 'sel__k': [2, 3]},
        {'clf': [poly_svc], 'clf__kernel': ['poly'],
         'clf__degree': [2, 3], 'sel__k': [2, 3]},
        {'clf': [logreg, sgd], 'sel__k': [2, 3]},
    ]
    assert build_param_grid(estimator) == expected
项目:UrbanSearch    作者:urbansearchTUD    | 项目源码 | 文件源码
def get_sgdc(self):
        """Build the text-classification pipeline.

        Returns:
            Pipeline: TF-IDF vectoriser (Dutch stop words), a
            ``SelectPercentile`` keeping 10 percent of the features, and a
            logistic-loss ``SGDClassifier``.
        """
        vectorizer = TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)
        selector = SelectPercentile(percentile=10)
        sgd_options = dict(
            alpha=0.0001,
            average=False,
            class_weight=None,
            epsilon=0.1,
            eta0=0.0,
            fit_intercept=True,
            l1_ratio=0.15,
            learning_rate='optimal',
            loss='log',
            n_iter=10,
            n_jobs=1,
            penalty='l2',
            power_t=0.5,
            random_state=None,
            shuffle=True,
            verbose=0,
            warm_start=False,
        )
        classifier = SGDClassifier(**sgd_options)
        return Pipeline([
            ('tfidf', vectorizer),
            ('feat_select', selector),
            ('clf', classifier),
        ])
项目:johnson-county-ddj-public    作者:dssg    | 项目源码 | 文件源码
def run(self):
        """Train the configured model, score the test set and package results.

        Returns:
            tuple: ``(result_dictionary, clf)`` — the dictionary of
            predictions and metadata assembled below, and the fitted
            estimator.
        """
        training_x, training_y, training_ids = self.get_training_data()
        test_x, test_y, test_ids = self.get_test_data()

        clf = self.define_model(self.model_name, self.model_params)
        clf.fit(training_x, training_y)
        res_predict = clf.predict(test_x)

        # SGD with hinge/perceptron loss and the linear SVC are scored via
        # decision_function here; every other model via predict_proba.
        sgd_with_margin_loss = (self.model_name == "SGDClassifier"
                                and clf.loss in ("hinge", "perceptron"))
        if sgd_with_margin_loss or self.model_name == "linear.SVC":
            scores = clf.decision_function(test_x)
        else:
            scores = clf.predict_proba(test_x)[:,1]
        res = list(scores)
        #fp, fn, tp, tn = self.compute_confusion_matrix(res[:,0], test_y)

        feature_importance = self.get_feature_importance(clf, self.model_name)
        result_dictionary = {
            'training_ids': training_ids,
            'predictions_test_y': list(res_predict),
            'prob_prediction_test_y': res,
            'test_y': list(test_y),
            'test_ids': list(test_ids),
            'model_name': self.model_name,
            'model_params': self.model_params,
            'label': self.label,
            'feature_columns_used': self.cols_to_use,
            'config': self.config,
            'feature_importance': feature_importance,
            'columned_used_for_feat_importance': list(training_x.columns.values),
        }
        return result_dictionary, clf
项目:molearn    作者:jmread    | 项目源码 | 文件源码
def demo():
    """Fit an ensemble of RCC models on the XOR toy data and print the
    predictions next to the true labels."""
    import sys
    sys.path.append( '../core' )
    from tools import make_XOR_dataset

    X, Y = make_XOR_dataset()
    N, L = Y.shape

    from sklearn import linear_model
    base_learner = linear_model.SGDClassifier(n_iter=100)
    from CC import RCC
    chain = RCC(h=base_learner)
    ensemble = Ensemble(n_estimators=10, base_estimator=chain)
    ensemble.fit(X, Y)

    # Compare predictions on the training data with the targets.
    print(ensemble.predict(X))
    print("vs")
    print(Y)
项目:SentiCR    作者:senticr    | 项目源码 | 文件源码
def get_classifier(self):
        """Create a new, unfitted estimator for ``self.algo``.

        Returns:
            The estimator matching the algorithm code (GBT, RF, ADB, DT, NB,
            SGD, SVC or MLPC), or ``0`` when the code is not recognised.
        """
        def make_mlp():
            return MLPClassifier(activation='logistic',  batch_size='auto',
            early_stopping=True, hidden_layer_sizes=(100,), learning_rate='adaptive',
            learning_rate_init=0.1, max_iter=5000, random_state=1,
            solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
            warm_start=False)

        # Factories are wrapped in lambdas so nothing is built until the
        # matching code is found.
        factories = (
            ("GBT", lambda: GradientBoostingClassifier()),
            ("RF", lambda: RandomForestClassifier()),
            ("ADB", lambda: AdaBoostClassifier()),
            ("DT", lambda: DecisionTreeClassifier()),
            ("NB", lambda: BernoulliNB()),
            ("SGD", lambda: SGDClassifier()),
            ("SVC", lambda: LinearSVC()),
            ("MLPC", make_mlp),
        )

        algo = self.algo
        for code, factory in factories:
            if algo == code:
                return factory()
        return 0
项目:GraphSAGE    作者:williamleif    | 项目源码 | 文件源码
def run_regression(train_embeds, train_labels, test_embeds, test_labels):
    """Fit a multi-output logistic SGD model and print per-label F1 scores.

    A ``DummyClassifier`` baseline is fitted on the same data and its
    per-label scores are printed afterwards.

    Args:
        train_embeds: Training feature matrix.
        train_labels: Training label matrix, one column per output.
        test_embeds: Test feature matrix.
        test_labels: Test label matrix; indexed as ``test_labels[:, i]``.
    """
    np.random.seed(1)
    from sklearn.linear_model import SGDClassifier
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import f1_score
    from sklearn.multioutput import MultiOutputClassifier
    dummy = MultiOutputClassifier(DummyClassifier())
    dummy.fit(train_embeds, train_labels)
    log = MultiOutputClassifier(SGDClassifier(loss="log"), n_jobs=10)
    log.fit(train_embeds, train_labels)

    # Predict once for all outputs instead of once per label column.
    log_predictions = log.predict(test_embeds)
    n_outputs = test_labels.shape[1]
    for i in range(n_outputs):
        print("F1 score", f1_score(test_labels[:,i], log_predictions[:,i], average="micro"))
    for i in range(n_outputs):
        # NOTE(review): the baseline prediction is deliberately left inside
        # the loop — depending on its strategy DummyClassifier may draw from
        # the seeded random state, so hoisting could change printed values.
        print("Random baseline F1 score", f1_score(test_labels[:,i], dummy.predict(test_embeds)[:,i], average="micro"))
项目:banking-class    作者:eli-goodfriend    | 项目源码 | 文件源码
def test_cat():
    """Categorize the transactions that the lookup step left uncategorized.

    Reads ``test_lookup.csv``, trains a logistic-loss SGD model on the rows
    that already have a category, applies it to the rows that do not, and
    writes the recombined frame to ``test_cat.csv``.
    """
    print 'Testing categorization...'
    filein = 'test_lookup.csv'
    fileout = 'test_cat.csv'
    df = pd.read_csv(filein)

    model = linear_model.SGDClassifier(loss='log')

    # Split on whether the lookup step already assigned a category.
    catData = df[~df.category.isnull()]
    uncatData = df[df.category.isnull()]
    print str(float(len(catData))/float(len(df)) * 100.) + "% of transactions categorized with lookup."

    # `ts` and `embeddings` are defined outside this function.
    ts.train_model(catData,model,embeddings,model_type='logreg',new_run=True)
    ts.use_model(uncatData,model,embeddings,0.0,model_type='logreg')

    # Recombine both parts and restore the original row order.
    df = pd.concat([catData, uncatData])
    df.sort_index(inplace=True)

    df.to_csv(fileout,index=False)
项目:python-machine-learning-book    作者:jeremyn    | 项目源码 | 文件源码
def train_and_pickle_classifier():
    """Train a logistic SGD classifier out-of-core and pickle it.

    Streams mini-batches of 1000 documents (at most 45 of them) from
    ``datasets/movie_data.csv`` into ``partial_fit``, prints the accuracy on
    the next 5000 documents, folds those documents into the model as well,
    and finally writes the classifier to ``CLF_FILENAME``.
    """
    import numpy as np
    from sklearn.linear_model import SGDClassifier

    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

    csv_filename = os.path.join('datasets', 'movie_data.csv')
    doc_stream = stream_docs(path=csv_filename)

    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if X_train is None:
            # The stream ran out of documents.
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print("Test accuracy: %.3f" % clf.score(X_test, y_test))

    # Use the held-out documents for training too before persisting.
    clf = clf.partial_fit(X_test, y_test)

    # Context manager so the file is flushed and closed deterministically.
    with open(CLF_FILENAME, 'wb') as clf_file:
        pickle.dump(clf, clf_file, protocol=4)
项目:IBRel    作者:lasigeBioTM    | 项目源码 | 文件源码
def __init__(self, path, etype, **kwargs):
        """Initialise the ensemble model state and its classifier pipeline.

        Args:
            path: Forwarded to the parent constructor.
            etype: Forwarded to the parent constructor as ``etype``.
            **kwargs: Forwarded to the parent constructor; ``goldstd`` is
                additionally stored on the instance (``None`` if absent).
        """
        super(EnsembleModel, self).__init__(path, etype=etype, **kwargs)
        self.basedir = "models/ensemble/"
        self.goldstd = kwargs.get("goldstd")
        self.data = {}
        self.offsets = []

        # Alternative 'clf' steps that have been tried:
        #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
        #('clf', SGDClassifier())
        # ('clf', svm.NuSVC(nu=0.01 ))
        # ('clf', tree.DecisionTreeClassifier(criterion="entropy")),
        # ('clf', MultinomialNB())
        # ('clf', GaussianNB())
        #('clf', svm.SVC(kernel="rbf", degree=2, C=1)),
        #('clf', svm.SVC(kernel="linear", C=2))
        #('clf', DummyClassifier(strategy="constant", constant=True))
        forest = RandomForestClassifier(class_weight={False:1, True:1}, n_jobs=-1,
                                        criterion="entropy", warm_start=True)
        self.pipeline = Pipeline([('clf', forest)])
项目:IBRel    作者:lasigeBioTM    | 项目源码 | 文件源码
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
        """Prepare data holders, generate the data and build the pipeline.

        Args:
            corpus: Corpus object; stored and passed to ``generate_data``.
            relationtype: Relation type; also used as the pair type and as
                the prefix of the model name.
            modelname: Suffix of the model name.
        """
        super(ScikitRE, self).__init__()
        self.modelname = relationtype + "_" + modelname
        self.relationtype = relationtype
        self.pairtype = relationtype
        self.corpus = corpus
        self.pairs = []
        self.features = []
        self.labels = []
        self.pred = []
        self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
        self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
        self.generate_data(corpus, modelname, relationtype)

        # Alternative pipeline steps that have been tried:
        #('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
        #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
        #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
        #('clf', SGDClassifier())
        #('clf', svm.NuSVC(nu=0.01 ))
        #('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
        #('clf', DummyClassifier(strategy="constant", constant=True))
        vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(3,20), min_df=0.0, max_df=0.7)
        classifier = MultinomialNB(alpha=0.01, fit_prior=False)
        self.text_clf = Pipeline([('vect', vectorizer), ('clf', classifier)])
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_transform_linear_model():
    """Check the deprecated ``transform`` of L1-penalised linear models.

    For each estimator, threshold and input container (dense array and CSR
    matrix), the model is fitted with an L1 penalty, ``transform`` must warn
    with ``DeprecationWarning`` and must not add features, and a model
    refitted with L2 on the reduced data must exceed 0.7 training accuracy.
    """
    for clf in (LogisticRegression(C=0.1),
                LinearSVC(C=0.01, dual=False),
                SGDClassifier(alpha=0.001, n_iter=50, shuffle=True,
                              random_state=0)):
        for thresh in (None, ".09*mean", "1e-5 * median"):
            for func in (np.array, sp.csr_matrix):
                # `data` and `y` are module-level fixtures.
                X = func(data)
                clf.set_params(penalty="l1")
                clf.fit(X, y)
                X_new = assert_warns(
                    DeprecationWarning, clf.transform, X, thresh)
                if isinstance(clf, SGDClassifier):
                    # For SGD only a non-strict reduction is required.
                    assert_true(X_new.shape[1] <= X.shape[1])
                else:
                    assert_less(X_new.shape[1], X.shape[1])
                # The same estimator object is reused: switch to L2 and
                # refit on the selected features.
                clf.set_params(penalty="l2")
                clf.fit(X_new, y)
                pred = clf.predict(X_new)
                assert_greater(np.mean(pred == y), 0.7)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_prefit():
    """
    Test all possible combinations of the prefit parameter.

    `data` and `y` are module-level fixtures; the same `clf` object is
    shared by every SelectFromModel instance created below.
    """
    # Passing a prefit parameter with the selected model
    # and fitting an unfitted model with prefit=False should give same results.
    clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0)
    model = SelectFromModel(clf)
    model.fit(data, y)
    X_transform = model.transform(data)
    # Fit the estimator directly so it can be handed over as already fitted.
    clf.fit(data, y)
    model = SelectFromModel(clf, prefit=True)
    assert_array_equal(model.transform(data), X_transform)

    # Check that the model is rewritten if prefit=False and a fitted model is
    # passed
    model = SelectFromModel(clf, prefit=False)
    model.fit(data, y)
    assert_array_equal(model.transform(data), X_transform)

    # Check that prefit=True and calling fit raises a ValueError
    model = SelectFromModel(clf, prefit=True)
    assert_raises(ValueError, model.fit, data, y)
项目:sandbox-learn    作者:pavmav    | 项目源码 | 文件源码
def score_function(field):
    """Return the "Creature" entry of the field's stats, or 0 if absent."""
    creature_key = "Creature"
    stats = field.get_stats()
    return stats[creature_key] if creature_key in stats else 0

# res = modelling.run_simulation(universe, check_stop_function, score_function, verbose=True, times=30)
# print res
# print np.asarray(res).mean()

# random 1000 10 [193, 37, 97, 224, 349, 165, 251, 130, 184, 335]
# SGDClassifier 1000 10 [9, 106, 127, 11, 187, 38, 193, 114, 236, 27]

# random 500 20 [63, 24, 38, 14, 30, 65, 29, 60, 28, 25, 93, 44, 51, 26, 104, 56, 53, 38, 23, 42] mean 45.299999999999997
# SGDClassifier 500 20 [116, 52, 50, 82, 109, 49, 109, 37, 25, 115, 130, 180, 52, 52, 113, 46, 34, 135, 26, 33] mean 77.25

# random 500 20 [71, 24, 57, 56, 34, 14, 75, 66, 41, 56, 29, 69, 30, 72, 40, 57, 49, 24, 41, 48] mean 47.65
# SGDClassifier 500 20 [175, 40, 117, 96, 119, 116, 58, 134, 67, 87, 73, 147, 124, 125, 82, 139, 78, 110, 74, 100] mean 103.05

# random 500 30 [42, 32, 62, 34, 30, 44, 51, 35, 63, 59, 50, 40, 75, 59, 50, 33, 45, 95, 82, 41, 43, 89, 94, 66, 64, 46, 34, 82, 66, 76]
# 56.0666666667
# SGDClassifier 500 30 [62, 85, 72, 42, 17, 48, 74, 53, 42, 73, 57, 29, 82, 51, 80, 84, 86, 73, 51, 36, 85, 85, 46, 59, 68, 33, 44, 38, 62, 26]
# 58.1
项目:kaggle-quora-question-pairs    作者:stys    | 项目源码 | 文件源码
def compute_sgd(data):
    """Fill out-of-fold and test predictions with an elastic-net SGD model.

    Runs a shuffled 10-fold stratified split over the training data. Each
    fold's model writes the logit of its positive-class probability into
    ``data['y_train_pred']`` for the held-out rows and appends its test-set
    logits to ``data['y_test_pred']``, which is averaged over folds at the
    end.

    Args:
        data: Dict with ``X_train``, ``y_train``, ``X_test``,
            ``y_train_pred`` and ``y_test_pred`` entries; modified in place.

    Returns:
        The same ``data`` dict.
    """
    logging.info('Computing SGD')

    n_splits = 10
    folder = StratifiedKFold(n_splits=n_splits, shuffle=True)
    # Only the labels drive the stratification, so a zero placeholder
    # stands in for the feature matrix.
    placeholder = np.zeros(data['y_train'].shape[0])
    fold_indices = folder.split(placeholder, data['y_train'])
    for fit_rows, holdout_rows in tqdm_notebook(fold_indices, total=n_splits):
        # {'en__l1_ratio': 0.0001, 'en__alpha': 1e-05}
        model = SGDClassifier(
            loss='log',
            penalty='elasticnet',
            fit_intercept=True,
            n_iter=100,
            shuffle=True,
            n_jobs=-1,
            l1_ratio=0.0001,
            alpha=1e-05,
            class_weight=None)
        model = model.fit(data['X_train'][fit_rows, :], data['y_train'][fit_rows])

        holdout_proba = model.predict_proba(data['X_train'][holdout_rows, :])[:, 1]
        data['y_train_pred'][holdout_rows] = logit(holdout_proba)

        test_proba = model.predict_proba(data['X_test'])[:, 1]
        data['y_test_pred'].append(logit(test_proba))

    # Average the per-fold test logits.
    data['y_test_pred'] = np.array(data['y_test_pred']).T.mean(axis=1)

    return data
项目:code-uai16    作者:thanhan    | 项目源码 | 文件源码
def classify(n = 50):
    """Fit an L1-penalised logistic SGD classifier on the first ``n`` rows.

    Uses the module-level ``mat`` (features) and ``rel`` (labels).

    Returns:
        The fitted classifier.
    """
    # Alternatives that have been tried:
    #clf = MultinomialNB(fit_prior=False)
    #clf = SVC(gamma=2, C=1, class_weight = {0.0:0.063829777, 1.0:1.0})
    class_weights = {0.0:0.022, 1.0:1.0}
    clf = SGDClassifier(loss="log", penalty="l1", class_weight=class_weights)

    features, targets = mat[:n], rel[:n]
    clf.fit(features, targets)
    return clf
项目:berlin-devfest-2016-backend    作者:giansegato    | 项目源码 | 文件源码
def initialModeling(data):
    """Fit the initial model and publish it through module-level globals.

    Sets the globals ``n`` (number of feature columns) and ``model`` (a
    logistic-loss SGDClassifier fitted on the processed data).
    """
    X, y = processData(data)

    # `n` is stored globally, presumably for later update steps — confirm
    # against the rest of the module.
    global n
    n = X.shape[1]

    print "I'm training the model using ", X.shape[0], " samples and ", n, " features.\n"

    global model
    model = SGDClassifier(loss="log", alpha=100, verbose=1)
    model.fit(X, y)

# 6th: update model
项目:TrackToTrip    作者:ruipgil    | 项目源码 | 文件源码
def __init__(self, classifier=None):
        """Set up the classifier, the label encoder and the fit-state marker.

        Args:
            classifier: Optional pre-built estimator. When ``None`` a
                logistic-loss ``SGDClassifier`` is created.
        """
        # Compare against None explicitly: truth-testing an estimator calls
        # its __len__ when it defines one (e.g. pipelines and ensembles), so
        # `if classifier:` could raise or silently discard a valid estimator.
        if classifier is not None:
            self.clf = classifier
        else:
            self.clf = SGDClassifier(loss="log", penalty="l2", shuffle=True, n_iter=2500)
        self.labels = preprocessing.LabelEncoder()
        # Starts negative, i.e. no feature vector length recorded yet.
        self.feature_length = -1
项目:TrackToTrip    作者:ruipgil    | 项目源码 | 文件源码
def predict(self, features, verbose=False):
        """ Probability estimates of each feature

        See sklearn's SGDClassifier predict and predict_proba methods.

        Args:
            features (:obj:`list` of :obj:`list` of :obj:`float`)
            verbose: Boolean, optional. If true returns an array where each
                element is a dictionary, where keys are labels and values are
                the respective probabilities. Defaults to False.
        Returns:
            Array of array of numbers, or array of dictionaries if verbose is
            True
        """
        probs = self.clf.predict_proba(features)
        if not verbose:
            return probs

        # Pair every probability with the label at the same position.
        labels = self.labels.classes_
        return [
            {labels[i]: val for i, val in enumerate(prob)}
            for prob in probs
        ]
项目:EmotiW-2017-Audio-video-Emotion-Recognition    作者:xujinchang    | 项目源码 | 文件源码
def do_l2norm(X_data):
    """Return ``X_data`` normalised with the L2 norm via ``preprocessing.normalize``."""
    return preprocessing.normalize(X_data, norm='l2')

#svm = SGDClassifier(loss = 'hinge')
#https://ljalphabeta.gitbooks.io/python-/content/kernelsvm.html
项目:EmotiW-2017-Audio-video-Emotion-Recognition    作者:xujinchang    | 项目源码 | 文件源码
def use_SGD(X_data,y_data):
    """Fit a hinge-loss, L2-penalised SGDClassifier and return it."""
    sgd_options = {"loss": "hinge", "penalty": "l2"}
    model = SGDClassifier(**sgd_options)
    model.fit(X_data, y_data)
    return model

# def use_KNN(X_data,y_data):





# def use_RandomForest(X_data,y_data):
项目:dask-ml    作者:dask    | 项目源码 | 文件源码
def test_basic(self, single_chunk_classification):
        """Fitting PartialSGDClassifier should match SGDClassifier.partial_fit.

        Both estimators share the same hyper-parameters; the comparison
        excludes the ``loss_function_`` attribute.
        """
        X, y = single_chunk_classification
        shared = dict(random_state=0, max_iter=1000, tol=1e-3)

        wrapped = lm.PartialSGDClassifier(classes=[0, 1], **shared)
        reference = lm_.SGDClassifier(**shared)

        wrapped.fit(X, y)
        reference.partial_fit(X, y, classes=[0, 1])
        assert_estimator_equal(wrapped, reference, exclude='loss_function_')
项目:UrbanSearch    作者:urbansearchTUD    | 项目源码 | 文件源码
def test_init_no_file():
    """Without a file argument the manager wraps a Pipeline whose 'clf'
    step is an SGDClassifier."""
    manager = sgdc_modelmanager.SGDCModelManager()
    assert isinstance(manager, sgdc_modelmanager.SGDCModelManager)
    pipeline = manager.clf
    assert isinstance(pipeline, Pipeline)
    assert isinstance(pipeline.named_steps['clf'], SGDClassifier)
项目:UrbanSearch    作者:urbansearchTUD    | 项目源码 | 文件源码
def test_init():
    """A manager built from 'sgdcmodel.pickle' is a ModelManager wrapping a
    Pipeline whose 'clf' step is an SGDClassifier."""
    manager = sgdc_modelmanager.SGDCModelManager('sgdcmodel.pickle')
    assert isinstance(manager, modelmanager.ModelManager)
    pipeline = manager.clf
    assert isinstance(pipeline, Pipeline)
    assert isinstance(pipeline.named_steps['clf'], SGDClassifier)
项目:UrbanSearch    作者:urbansearchTUD    | 项目源码 | 文件源码
def test_init():
    """An MNBModelManager built from 'sgdcmodel.pickle' is a ModelManager
    wrapping a Pipeline whose 'clf' step is an SGDClassifier."""
    manager = mnb_modelmanager.MNBModelManager('sgdcmodel.pickle')
    assert isinstance(manager, modelmanager.ModelManager)
    pipeline = manager.clf
    assert isinstance(pipeline, Pipeline)
    assert isinstance(pipeline.named_steps['clf'], SGDClassifier)
项目:UrbanSearch    作者:urbansearchTUD    | 项目源码 | 文件源码
def test_init():
    """By default ClassifyText uses an SGDCModelManager whose pipeline ends
    in an SGDClassifier."""
    classifier = classifytext.ClassifyText()
    assert isinstance(classifier.mm, sgdc_modelmanager.SGDCModelManager)
    pipeline = classifier.mm.clf
    assert isinstance(pipeline, Pipeline)
    assert isinstance(pipeline.named_steps['clf'], SGDClassifier)
项目:UrbanSearch    作者:urbansearchTUD    | 项目源码 | 文件源码
def test_init_sgdc():
    """Requesting the SGDC type explicitly yields an SGDCModelManager whose
    pipeline ends in an SGDClassifier."""
    classifier = classifytext.ClassifyText(type=classifytext.SGDC)
    assert isinstance(classifier.mm, sgdc_modelmanager.SGDCModelManager)
    pipeline = classifier.mm.clf
    assert isinstance(pipeline, Pipeline)
    assert isinstance(pipeline.named_steps['clf'], SGDClassifier)
项目:fastxml    作者:Refefer    | 项目源码 | 文件源码
def train_clf(self, X, idxss, rs):
        """Train the linear classifier for one split.

        Args:
            X: Feature matrix passed to ``build_XY``.
            idxss: Iterable of index collections; their total length sets the
                number of epochs and they are passed to ``build_XY``.
            rs: Random state forwarded to ``build_XY`` and the estimator.

        Returns:
            tuple: ``(clf, CLF(clf.coef_, clf.intercept_))``.
        """
        total_rows = sum(map(len, idxss))
        n_epochs = self.compute_epochs(total_rows)

        penalty = 'l1' if self.optimization == 'fastxml' else 'l2'

        X_train, y_train = self.build_XY(X, idxss, rs)

        # In 'auto' mode liblinear is used once the training set exceeds
        # auto_weight * max_leaf_size rows.
        in_liblinear = X_train.shape[0] > (self.auto_weight * self.max_leaf_size)
        use_liblinear = (self.engine == 'liblinear'
                         or (self.engine == 'auto' and in_liblinear))

        if not use_liblinear:
            clf = SGDClassifier(loss=self.loss, penalty=penalty, n_iter=n_epochs,
                    alpha=self.alpha, fit_intercept=self.bias, class_weight='balanced',
                    random_state=rs)
        elif self.loss == 'log':
            clf = LogisticRegression(solver='liblinear', random_state=rs, tol=1,
                    C=self.C, penalty=penalty)
        else:
            clf = LinearSVC(C=self.C, fit_intercept=self.bias,
                    max_iter=n_epochs, class_weight='balanced',
                    penalty=penalty, random_state=rs)

        clf.fit(X_train, y_train)

        # Store a sparsified coefficient matrix and, when a bias is fitted,
        # a float32 intercept to cut memory use.
        clf.coef_ = sparsify(clf.coef_, self.eps)
        if self.bias:
            clf.intercept_ = clf.intercept_.astype('float32')

        return clf, CLF(clf.coef_, clf.intercept_)
项目:molearn    作者:jmread    | 项目源码 | 文件源码
def demo():
    """Fit a BR model with SGD base classifiers on the XOR toy data and
    print the predictions next to the true labels."""
    import sys
    sys.path.append( '../core' )
    from tools import make_XOR_dataset

    X, Y = make_XOR_dataset()
    N, L = Y.shape

    base_learner = linear_model.SGDClassifier(n_iter=100)
    model = BR(L, base_learner)
    model.fit(X, Y)

    # Compare predictions on the training data with the targets.
    print(model.predict(X))
    print("vs")
    print(Y)
项目:molearn    作者:jmread    | 项目源码 | 文件源码
def demo():
    """Run the ELM demos on the XOR toy data.

    Prints predictions against targets for a classification ELM, a
    regression ELM and an ELM_OI regressor.
    """
    import sys
    sys.path.append( '../core' )
    from tools import make_XOR_dataset
    from BR import BR
    set_printoptions(precision=3, suppress=True)

    X, Y = make_XOR_dataset()
    N, L = Y.shape

    print("CLASSIFICATION")
    base_classifier = linear_model.SGDClassifier(n_iter=100)
    classifier_net = ELM(8, f=tanh, h=BR(-1, base_classifier))
    classifier_net.fit(X, Y)
    # Compare predictions on the training data with the targets.
    print(classifier_net.predict(X))
    print("vs")
    print(Y)

    print("REGRESSION")
    regressor = ELM(100, h=linear_model.LinearRegression())
    regressor.fit(X, Y)
    print(Y)
    print(regressor.predict(X))

    print("REGRESSION OI")
    oi_regressor = ELM_OI(100, h=BR(-1, h=linear_model.SGDRegressor()))
    oi_regressor.fit(X, Y)
    print(Y)
    print(oi_regressor.predict(X))
项目:molearn    作者:jmread    | 项目源码 | 文件源码
def demo():
    """Fit RCC, MCC and PCC models on the XOR toy data and print their
    predictions after the true labels."""
    import sys
    from molearn.core.tools import make_XOR_dataset

    X, Y = make_XOR_dataset()
    N, L = Y.shape

    print(Y)
    print("vs")

    print("RCC")
    rcc_model = RCC(SGDClassifier(n_iter=100, loss='log'))
    rcc_model.fit(X, Y)
    print(rcc_model.predict(X))

    print("MCC")
    mcc_model = MCC(SGDClassifier(n_iter=100, loss='log'), M=1000)
    mcc_model.fit(X, Y)
    predictions = mcc_model.predict(X, M=50)
    print("with 50 iterations ...")
    print(predictions)
    predictions = mcc_model.predict(X, 'default')
    print("with default (%d) iterations ..." % 1000)
    print(predictions)

    print("PCC")
    pcc_model = PCC(SGDClassifier(n_iter=100, loss='log'))
    pcc_model.fit(X, Y)
    print(pcc_model.predict(X))
项目:text-classification    作者:cahya-wirawan    | 项目源码 | 文件源码
def fit(self, dataset, filename):
        """Train the bag-of-words/TF-IDF/SGD pipeline and dump it to disk.

        Args:
            dataset: Object whose ``get_dataset()`` returns a mapping with
                ``'data'`` and ``'target'`` entries.
            filename: Output path without extension; ``".pkl"`` is appended.
        """
        self.logger.debug("fit")
        steps = [
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
        ]
        self.clf = Pipeline(steps)

        documents = dataset.get_dataset()['data']
        targets = dataset.get_dataset()['target']
        self.clf.fit(documents, targets)

        joblib.dump(self.clf, filename + ".pkl", compress=9)
项目:ml_defense    作者:arjunbhagoji    | 项目源码 | 文件源码
def model_trainer(model_dict, X_train, y_train, adv=None, rd=None, rev=None):
    """Trains and returns SVM. Also save SVM to file.

    Args:
        model_dict: Dict with ``'svm_type'`` (``'linear'`` or a kernel name),
            ``'penconst'`` (the C constant) and ``'penalty'`` entries.
        X_train: Training features.
        y_train: Training labels.
        adv: Not used when building or training the model.
        rd: Forwarded to ``get_svm_model_name``.
        rev: Forwarded to ``get_svm_model_name``.

    Returns:
        The fitted classifier.
    """
    print('Training model...')
    start_time = time.time()
    abs_path_m = resolve_path_m(model_dict)
    svm_model = model_dict['svm_type']
    C = model_dict['penconst']
    penalty = model_dict['penalty']

    # Create model based on parameters
    if svm_model == 'linear':
        # The dual formulation is switched off only for the L1 penalty.
        use_dual = not (penalty == 'l1')
        clf = svm.LinearSVC(C=C, penalty=penalty, dual=use_dual)
        # clf = linear_model.SGDClassifier(alpha=C,l1_ratio=0)
    else:
        clf = svm.SVC(C=C, kernel=svm_model)

    # Train model
    clf.fit(X_train, y_train)
    elapsed = int(time.time() - start_time)
    print('Finish training in {:d}s'.format(elapsed))

    # Save model
    model_path = abs_path_m + get_svm_model_name(model_dict, rd, rev) + '.pkl'
    joblib.dump(clf, model_path)
    return clf
#------------------------------------------------------------------------------#
项目:gcForest    作者:kingfengji    | 项目源码 | 文件源码
def __init__(self, name, kwargs):
        """Register sklearn's SGDClassifier as the wrapped estimator class.

        Args:
            name: Forwarded to the parent constructor.
            kwargs: Forwarded to the parent constructor as a single argument.
        """
        from sklearn.linear_model import SGDClassifier as estimator_class
        super(GCSGDClassifier, self).__init__(name, estimator_class, kwargs)
项目:feedlark    作者:CPSSD    | 项目源码 | 文件源码
def get_model_score(training, validation):
    """Fit a logistic-loss SGD model on ``training`` and return its
    ``score`` on ``validation``."""
    model = linear_model.SGDClassifier(loss='log', n_iter=5)

    train_inputs = get_input_data(training)
    train_outputs = get_output_data(training)
    model.fit(train_inputs, train_outputs)

    validation_inputs = get_input_data(validation)
    validation_outputs = get_output_data(validation)
    return model.score(validation_inputs, validation_outputs)
项目:feedlark    作者:CPSSD    | 项目源码 | 文件源码
def __init__(self):
        """Create the underlying SGD model."""
        # loss="log" makes it use logistic regression
        sgd_settings = {"loss": "log", "n_iter": 5}
        self.model = linear_model.SGDClassifier(**sgd_settings)
项目:Movie-Success-Predictor    作者:Blueteak    | 项目源码 | 文件源码
def main():
    """Evaluate three classifiers on the before-release and after-release data.

    For each data set the features and labels are built, then an
    ``SGDClassifier`` (logistic loss), a ``GaussianNB`` and a
    ``RandomForestClassifier`` are each handed to ``test_classifier`` together
    with a tag naming the data set.
    """

    def evaluate_all(features, labels, tag):
        # Each estimator is created right before it is tested, in the same
        # order as the original hand-unrolled sequence.
        factories = (
            lambda: linear_model.SGDClassifier(loss='log'),
            GaussianNB,
            lambda: RandomForestClassifier(n_estimators=10, max_depth=10),
        )
        for make_clf in factories:
            test_classifier(make_clf(), features, labels, tag)

    # Before release
    movie_info_before_release = load_movie_info_before_release()
    # Single-argument parenthesised print emits the same text on Python 2
    # and is valid syntax on Python 3.
    print('***Before release***')

    X = create_input(movie_info_before_release)
    Y = create_output_before_release(movie_info_before_release)
    evaluate_all(X, Y, 'before_release')

    # After release
    movie_info = load_movie_info()
    print('***After release***')

    X = create_input(movie_info)
    Y = create_output(movie_info)
    evaluate_all(X, Y, 'after_release')
项目:GraphSAGE    作者:williamleif    | 项目源码 | 文件源码
def run_regression(train_embeds, train_labels, test_embeds, test_labels):
    """Fit a logistic-loss SGD classifier and a dummy baseline, then print F1.

    Both estimators are fitted on the training embeddings. Micro-averaged F1
    on the test embeddings is printed first for the SGD model and then for
    the ``DummyClassifier`` baseline. Nothing is returned.
    """
    np.random.seed(1)
    from sklearn.linear_model import SGDClassifier
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import f1_score

    # The fit/predict order is kept as-is: the estimators may draw from the
    # global NumPy random state seeded above.
    baseline = DummyClassifier()
    baseline.fit(train_embeds, train_labels)
    sgd_model = SGDClassifier(loss="log", n_jobs=10)
    sgd_model.fit(train_embeds, train_labels)

    model_predictions = sgd_model.predict(test_embeds)
    model_f1 = f1_score(test_labels, model_predictions, average="micro")
    print("F1 score:", model_f1)

    baseline_predictions = baseline.predict(test_embeds)
    baseline_f1 = f1_score(test_labels, baseline_predictions, average="micro")
    print("Random baseline f1 score:", baseline_f1)
项目:GraphSAGE    作者:williamleif    | 项目源码 | 文件源码
def run_regression(train_embeds, train_labels, test_embeds, test_labels):
    """Fit a logistic-loss SGD classifier and a dummy baseline, then report F1.

    Prints, in order, the micro-averaged F1 of the SGD model on the test set,
    its F1 on the training set, and the F1 of the ``DummyClassifier`` baseline
    on the test set, each preceded by a heading line. Nothing is returned.
    """
    np.random.seed(1)
    from sklearn.linear_model import SGDClassifier
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import f1_score

    # The fit/predict order is kept as-is: the estimators may draw from the
    # global NumPy random state seeded above.
    baseline = DummyClassifier()
    baseline.fit(train_embeds, train_labels)
    sgd_model = SGDClassifier(loss="log", n_jobs=55)
    sgd_model.fit(train_embeds, train_labels)

    print("Test scores")
    test_predictions = sgd_model.predict(test_embeds)
    print(f1_score(test_labels, test_predictions, average="micro"))

    print("Train scores")
    train_predictions = sgd_model.predict(train_embeds)
    print(f1_score(train_labels, train_predictions, average="micro"))

    print("Random baseline")
    baseline_predictions = baseline.predict(test_embeds)
    print(f1_score(test_labels, baseline_predictions, average="micro"))
项目:FLASH    作者:yuyuz    | 项目源码 | 文件源码
def get_data_preprocessor_balancing(params, y):
    """Fill in the class/sample weighting entries of ``params`` in place.

    ``params['balancing']`` selects the strategy. It may be the literal name
    (``'None'`` / ``'weighting'``) or the string form of the code stored under
    that name in ``params['layer_dict_list'][1]``.

    * ``None``: ``class_weight`` and ``sample_weight`` are both set to None.
    * ``weighting``: ``class_weight`` is set to ``'auto'`` and
      ``sample_weight`` to a per-sample array of inverse class frequencies,
      normalised so that the per-class weights have mean 1.

    Any other value leaves ``params`` untouched. The same ``params`` object is
    returned.
    """
    d_balancing = params['layer_dict_list'][1]
    choice = params['balancing']

    if choice in (str(d_balancing['None']), 'None'):
        # for fp: ['ExtraTreesClassifier', 'LinearSVC'] + clf: ['DecisionTreeClassifier', 'ExtraTreesClassifier', 'LinearSVC', 'SVC', 'RandomForestClassifier', 'SGDClassifier']
        params['class_weight'] = None
        # for clf: ['AdaBoostClassifier', 'GradientBoostingClassifier']
        params['sample_weight'] = None
    elif choice in (str(d_balancing['weighting']), 'weighting'):
        # for fp: ['ExtraTreesClassifier', 'LinearSVC'] + clf: ['DecisionTreeClassifier', 'ExtraTreesClassifier', 'LinearSVC', 'SVC', 'RandomForestClassifier', 'SGDClassifier']
        params['class_weight'] = 'auto'

        # for clf: ['AdaBoostClassifier', 'GradientBoostingClassifier']
        labels = y
        if len(y.shape) > 1:
            # Collapse each indicator row into one integer code by weighting
            # column i with 2 ** i.
            powers = [2 ** col for col in range(y.shape[1])]
            labels = np.sum(y * powers, axis=1)

        classes, frequencies = np.unique(labels, return_counts=True)
        class_weights = 1. / frequencies
        class_weights = class_weights / np.mean(class_weights)

        weights = np.ones(labels.shape)
        for cls, weight in zip(classes, class_weights):
            weights[labels == cls] *= weight
        params['sample_weight'] = weights

    return params
项目:banking-class    作者:eli-goodfriend    | 项目源码 | 文件源码
def run_cat(filename,modelname,fileout,embeddings,new_run=True,run_parse=True,
            model_type='logreg',C=10.0,
            alpha=1.0, cutoff=0.50, n_iter=1):
    """Categorise the rows of a CSV file and persist both results and model.

    The CSV at ``filename`` is read, passed through ``cat_df`` together with
    the model, and written to ``fileout``. The model is then pickled to
    ``modelname``.

    :param filename: path of the input CSV
    :param modelname: path of the pickled model (read when ``new_run`` is
        False, always written at the end)
    :param fileout: path of the output CSV
    :param embeddings: forwarded to ``cat_df``
    :param new_run: create a fresh model when True, otherwise load
        ``modelname``
    :param run_parse: forwarded to ``cat_df``
    :param model_type: ``'logreg'``, ``'passive-aggressive'`` or
        ``'naive-bayes'``
    :param C: ``C`` of the passive-aggressive classifier
    :param alpha: ``alpha`` of the SGD classifier
    :param cutoff: forwarded to ``cat_df``
    :param n_iter: ``n_iter`` of the SGD classifier
    :raises NameError: if ``new_run`` is True and ``model_type`` is unknown
    """
    # pull relevant data and run parsing and classification
    df = pd.read_csv(filename)
    if len(df.columns) == 2:  # make sure columns have the right names
        df.columns = ['raw', 'amount']

    if new_run:  # initialize the model
        if model_type == 'logreg':
            model = linear_model.SGDClassifier(loss='log', warm_start=True,
                                               n_iter=n_iter, alpha=alpha)
        elif model_type == 'passive-aggressive':
            model = linear_model.PassiveAggressiveClassifier(C=C, warm_start=True)
        elif model_type == 'naive-bayes':
            model = naive_bayes.GaussianNB()
        else:
            raise NameError('model_type must be logreg, passive-aggressive, or naive-bayes')
    else:  # load a saved, pre-trained model
        # NOTE: unpickling can execute arbitrary code; modelname must point
        # to a trusted file.
        with open(modelname, 'rb') as model_file_load:
            model = pickle.load(model_file_load)

    file_cities = dirs.data_dir + 'cities_by_state.pickle'
    us_cities = pd.read_pickle(file_cities)

    df = cat_df(df, model, us_cities, embeddings, new_run, run_parse, cutoff=cutoff,
                model_type=model_type)

    df.to_csv(fileout, index=False)

    # Save the (possibly updated) model; the context manager guarantees the
    # handle is flushed and closed even if pickling fails.
    with open(modelname, 'wb') as model_file_save:
        pickle.dump(model, model_file_save)


# ------ testing functions
项目:nba-games    作者:ixarchakos    | 项目源码 | 文件源码
def model_fitting(train_set, train_labels, classifier_name, n_jobs=cpu_count()):
    """
    The fitting process with sklearn algorithms.

    Only the requested estimator is instantiated; the others are kept as
    factories so that a constructor problem in one of them cannot affect the
    selection of another.

    :param train_set: numpy array, required
    :param train_labels: list, required
    :param classifier_name: string, required; one of the keys of the factory table below
    :param n_jobs: integer, optional; defaults to the CPU count evaluated when the function is defined
    :return: object
        - Fit classifier model according to the given training data
    :raises KeyError: if classifier_name is not a known key
    """
    classifier_factories = {
        "svm_linear": lambda: SVC(probability=True, kernel='linear', C=1.0),
        "svm_poly": lambda: SVC(probability=True, kernel='poly', C=1.0),
        "svm_rbf": lambda: SVC(probability=True, kernel='rbf', C=1.0, gamma=0.01),
        "linear_svc": lambda: LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.1, C=1.0, multi_class='ovr',
                                        fit_intercept=True, intercept_scaling=1, random_state=None, max_iter=3000),
        "knn": lambda: KNeighborsClassifier(n_neighbors=100, weights='distance', leaf_size=30, n_jobs=n_jobs),
        "random_forests": lambda: RandomForestClassifier(n_estimators=350, criterion='entropy', min_samples_split=2,
                                                         min_samples_leaf=1, max_leaf_nodes=600, n_jobs=n_jobs),
        "logistic_regression": lambda: LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=2.4, fit_intercept=True,
                                                          intercept_scaling=1, random_state=None, solver='liblinear',
                                                          max_iter=1000, multi_class='ovr', warm_start=False,
                                                          n_jobs=n_jobs),
        "decision_trees": lambda: DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None,
                                                         min_samples_split=2, min_samples_leaf=100,
                                                         min_weight_fraction_leaf=0.0, max_features=None,
                                                         random_state=None, max_leaf_nodes=None, presort=False),
        "sgd": lambda: SGDClassifier(alpha=.0001, n_iter=500, penalty="elasticnet", n_jobs=n_jobs),
        "neural_network": lambda: Classifier(layers=[Layer("Sigmoid", units=14), Layer("Sigmoid", units=13),
                                                     Layer("Sigmoid", units=12), Layer("Sigmoid", units=10),
                                                     Layer("Softmax")],
                                             learning_rate=0.01, n_iter=200, batch_size=10, regularize='L1',
                                             n_stable=50, dropout_rate=0, verbose=True),
        "GBC": lambda: GradientBoostingClassifier(max_depth=10, max_leaf_nodes=850, min_samples_leaf=15,
                                                  learning_rate=0.1),
        "XGB": lambda: XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0, max_depth=10, min_child_weight=2,
                                     missing=None, n_estimators=100, nthread=n_jobs, reg_alpha=0,
                                     objective='binary:logistic', reg_lambda=1, scale_pos_weight=1, seed=0,
                                     silent=True, subsample=1),
    }
    # Look the factory up first so an unknown name fails with KeyError before
    # any estimator is built.
    build_classifier = classifier_factories[classifier_name]
    return build_classifier().fit(train_set, train_labels)
项目:opentc    作者:cahya-wirawan    | 项目源码 | 文件源码
def fit(self, dataset, filename):
        """Build the text-classification pipeline, train it and dump it to disk.

        The pipeline chains a ``CountVectorizer``, a ``TfidfTransformer`` and
        an ``SGDClassifier``. It is fitted on the ``'data'`` and ``'target'``
        entries of ``dataset.get_dataset()``, then written with joblib to
        ``filename + ".pkl"``. The fitted pipeline is kept on ``self.clf``.
        """
        self.logger.debug("fit")

        pipeline_steps = [
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', SGDClassifier(loss='log', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
        ]
        self.clf = Pipeline(pipeline_steps)

        # get_dataset() is deliberately called once per field, as before.
        documents = dataset.get_dataset()['data']
        targets = dataset.get_dataset()['target']
        self.clf.fit(documents, targets)

        output_path = filename + ".pkl"
        joblib.dump(self.clf, output_path, compress=9)
项目:quantulum    作者:marcolagi    | 项目源码 | 文件源码
def train_classifier(download=True, parameters=None, ngram_range=(1, 1)):
    """Train the intent classifier.

    Loads ``train.json`` and ``wiki.json`` from ``l.TOPDIR``, fits a TF-IDF
    vectoriser and an ``SGDClassifier`` on the cleaned example texts, and
    pickles the vectoriser, classifier and target names to ``clf.pickle`` in
    the same directory.

    :param download: when true, ``download_wiki()`` is called first
    :param parameters: keyword arguments for ``SGDClassifier``; a default set
        is used when None
    :param ngram_range: n-gram range handed to ``TfidfVectorizer``
    """
    if download:
        download_wiki()

    # Context managers make sure the input files are closed after parsing.
    path = os.path.join(l.TOPDIR, 'train.json')
    with open(path) as train_file:
        training_set = json.load(train_file)
    path = os.path.join(l.TOPDIR, 'wiki.json')
    with open(path) as wiki_file:
        wiki_set = json.load(wiki_file)

    examples = training_set + wiki_set
    target_names = list(set(i['unit'] for i in examples))
    # Names are unique, so a dict gives the same indices as list.index
    # without rescanning the list for every example.
    target_index = {name: idx for idx, name in enumerate(target_names)}

    train_data, train_target = [], []
    for example in examples:
        train_data.append(clean_text(example['text']))
        train_target.append(target_index[example['unit']])

    tfidf_model = TfidfVectorizer(sublinear_tf=True,
                                  ngram_range=ngram_range,
                                  stop_words='english')

    matrix = tfidf_model.fit_transform(train_data)

    if parameters is None:
        parameters = {'loss': 'log', 'penalty': 'l2', 'n_iter': 50,
                      'alpha': 0.00001, 'fit_intercept': True}

    clf = SGDClassifier(**parameters).fit(matrix, train_target)
    obj = {'tfidf_model': tfidf_model,
           'clf': clf,
           'target_names': target_names}
    path = os.path.join(l.TOPDIR, 'clf.pickle')
    # Pickle data is binary: write in 'wb' mode and close the handle so the
    # file is complete on disk.
    with open(path, 'wb') as model_file:
        pickle.dump(obj, model_file)


###############################################################################