Python sklearn.linear_model 模块,LogisticRegression() 实例源码

我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用sklearn.linear_model.LogisticRegression()

项目:triage    作者:dssg    | 项目源码 | 文件源码
def trained_models():
    """Fit five reference classifiers on a breast-cancer training split.

    The scikit-learn breast cancer dataset is split 70/30 with
    ``random_state=12345``; only the training portion is used here.

    Returns:
        dict mapping 'RF', 'LR', 'SVC_w_linear_kernel', 'Dummy' and
        'SVC_wo_linear_kernel' to the corresponding fitted estimator.
    """
    cancer = datasets.load_breast_cancer()
    features_train, _, labels_train, _ = train_test_split(
        cancer.data, cancer.target, test_size=0.3, random_state=12345)

    forest = RandomForestClassifier()
    logistic = LogisticRegression()
    linear_svc = SVC(kernel='linear')
    default_svc = SVC()
    baseline = DummyClassifier()

    # The estimators are fitted one after another in this listed order.
    for estimator in (forest, logistic, linear_svc, default_svc, baseline):
        estimator.fit(features_train, labels_train)

    return {
        'RF': forest,
        'LR': logistic,
        'SVC_w_linear_kernel': linear_svc,
        'Dummy': baseline,
        'SVC_wo_linear_kernel': default_svc,
    }
项目:johnson-county-ddj-public    作者:dssg    | 项目源码 | 文件源码
def get_feature_importance(self,clf, model_name ):
        """Return the fitted importance values for a supported model type.

        Args:
            clf: Fitted estimator; it must expose ``feature_importances_``
                or ``coef_``, whichever ``model_name`` maps to.
            model_name: One of the model keys grouped below.

        Returns:
            ``list(clf.feature_importances_)`` for the tree/ensemble keys,
            the nested list built from ``clf.coef_.tolist()`` for the
            linear keys, and ``None`` for GaussianNB and
            KNeighborsClassifier.

        Raises:
            KeyError: If ``model_name`` is not one of the known keys.
        """
        importance_models = ('RandomForestClassifier',
                             'ExtraTreesClassifier',
                             'AdaBoostClassifier',
                             'GradientBoostingClassifier',
                             'DecisionTreeClassifier')
        coefficient_models = ('LogisticRegression',
                              'svm.SVC',
                              'SGDClassifier',
                              'linear.SVC')
        unscored_models = ('GaussianNB', 'KNeighborsClassifier')

        source_by_model = dict.fromkeys(importance_models, 'feature_importances')
        source_by_model.update(dict.fromkeys(coefficient_models, 'coef'))
        source_by_model.update(dict.fromkeys(unscored_models))

        # Plain indexing so that an unknown model name raises KeyError.
        source = source_by_model[model_name]
        if source is None:
            return None
        if source == 'coef':
            return list(clf.coef_.tolist())
        return list(clf.feature_importances_)
项目:AirTicketPredicting    作者:junlulocky    | 项目源码 | 文件源码
def parameterChoosing(self):
        """Grid-search penalty and C for logistic regression and print a report.

        Runs a 5-fold ``GridSearchCV`` (scoring ``precision_weighted``) over
        l1 and l2 penalties with ``np.logspace(-5, 5)`` values of C on
        ``self.X_train`` / ``self.y_train``, then prints the best
        parameters, the per-candidate scores and a classification report
        on ``self.X_test`` / ``self.y_test``.

        All output uses the parenthesised single-argument ``print`` form so
        the method parses on both Python 2 and Python 3.
        """
        # Set the parameters by cross-validation
        c_values = np.logspace(-5, 5)
        tuned_parameters = [{'penalty': ['l1'], 'C': c_values},
                            {'penalty': ['l2'], 'C': c_values}]

        clf = GridSearchCV(linear_model.LogisticRegression(tol=1e-6), tuned_parameters, cv=5, scoring='precision_weighted')
        clf.fit(self.X_train, self.y_train.ravel())

        print("Best parameters set found on development set:\n")
        print(clf.best_params_)

        print("Grid scores on development set:\n")
        # NOTE(review): `grid_scores_` only exists on older scikit-learn
        # releases (newer ones expose `cv_results_`) - confirm the pinned version.
        for params, mean_score, scores in clf.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params))

        print("Detailed classification report:\n")
        y_true, y_pred = self.y_test, clf.predict(self.X_test)
        print(classification_report(y_true, y_pred))
项目:johnson-county-ddj-public    作者:dssg    | 项目源码 | 文件源码
def define_model(self, model, parameters, n_cores = 0):
        """Build the estimator registered under ``model`` and apply ``parameters``.

        Only the requested estimator is instantiated. Previously all
        eleven were constructed on every call, so a constructor that the
        installed scikit-learn rejected would break every model name.

        Args:
            model: Key naming the estimator (see ``factories`` below).
            parameters: Keyword parameters forwarded to ``set_params``.
            n_cores: Accepted for interface compatibility; not used by this
                method (the parallel estimators are created with n_jobs=7).

        Returns:
            The newly created estimator with ``parameters`` applied.

        Raises:
            ConfigError: If ``model`` is not a supported key.
        """
        factories = {
            'RandomForestClassifier': lambda: RandomForestClassifier(n_estimators=50, n_jobs=7),
            'ExtraTreesClassifier': lambda: ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
            'AdaBoostClassifier': lambda: AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
            'LogisticRegression': lambda: LogisticRegression(penalty='l1', C=1e5),
            'svm.SVC': lambda: svm.SVC(kernel='linear', probability=True, random_state=0),
            'GradientBoostingClassifier': lambda: GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
            'GaussianNB': lambda: GaussianNB(),
            'DecisionTreeClassifier': lambda: DecisionTreeClassifier(),
            'SGDClassifier': lambda: SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
            'KNeighborsClassifier': lambda: KNeighborsClassifier(n_neighbors=3),
            'linear.SVC': lambda: svm.LinearSVC(),
        }

        if model not in factories:
            raise ConfigError("Unsupported model {}".format(model))

        clf = factories[model]()
        clf.set_params(**parameters)
        return clf
项目:visually-grounded-speech    作者:gchrupala    | 项目源码 | 文件源码
def test_homonym(H, sent, features, C=1.0):
    """Cross-validate a logistic regression that separates two homonyms.

    Sentences matching ``H[0]`` are labelled 0 and those matching ``H[1]``
    are labelled 1; the stacked feature rows are L2-normalised before a
    10-fold stratified cross-validation.

    Args:
        H: Pair whose first two items are passed to ``matching``.
        sent: Sentence collection passed to ``matching``.
        features: Callable turning matched sentences into feature rows.
        C: Value forwarded to ``LogisticRegression``.

    Returns:
        dict with 'word1_count', 'word2_count', 'majority' (share of the
        larger class) and 'kfold_acc' (array of per-fold accuracies).
    """
    feats_first = features(matching(sent, H[0]))
    feats_second = features(matching(sent, H[1]))
    labels_first = numpy.zeros(len(feats_first))
    labels_second = numpy.ones(len(feats_second))

    X = normalize(numpy.vstack([feats_first, feats_second]), norm='l2')
    y = numpy.hstack([labels_first, labels_second])

    classifier = LogisticRegression(C=C)
    correct_per_fold = []
    size_per_fold = []
    for train_idx, test_idx in StratifiedKFold(y, n_folds=10):
        held_out_labels = y[test_idx]
        classifier.fit(X[train_idx], y[train_idx])
        hits = classifier.predict(X[test_idx]) == held_out_labels
        correct_per_fold.append(sum(hits))
        size_per_fold.append(len(held_out_labels))

    correct_per_fold = numpy.array(correct_per_fold, dtype='float')
    size_per_fold = numpy.array(size_per_fold, dtype='float')
    larger_class = max(len(labels_first), len(labels_second))
    return {'word1_count': len(labels_first),
            'word2_count': len(labels_second),
            'majority': 1.0 * larger_class/len(y),
            'kfold_acc': correct_per_fold/size_per_fold}
项目:Brain_Tumor_Segmentation    作者:KarthikRevanuru    | 项目源码 | 文件源码
def train_xgboost():
    """Train an XGBoost regressor on per-modality mean scans plus age.

    For every entry of the module-level ``folder_names_train`` the four
    saved arrays (flair, t1, t1ce, t2) are loaded, averaged over axis 0
    and concatenated along axis 1. 'Survival' (target) and 'Age' (extra
    feature column) are looked up in ``survival_data.csv``.

    Entries whose label lookup fails are now really dropped from both
    features and targets. The earlier version printed "deleting entry"
    but left the target and age arrays misaligned with the feature rows,
    so the final concatenate raised.

    Returns:
        The fitted ``xgb.XGBRegressor``.
    """
    df = pd.read_csv('survival_data.csv', index_col=0, encoding = 'UTF-7')

    def modality_means(suffix):
        # Mean over axis 0 of each saved array for the given modality.
        return np.array([
            np.mean(np.load('training/%s_%s.nii.gz.npy' % (str(name), suffix)), axis=0)
            for name in folder_names_train])

    scans = np.concatenate(
        [modality_means(suffix) for suffix in ('flair', 't1', 't1ce', 't2')],
        axis=1)

    kept_rows = []
    survival = []
    ages = []
    missing = 0
    for row, name in enumerate(folder_names_train):
        key = str(name)
        try:
            # Look up both values first so a failure never leaves a half-added entry.
            survival_value = df.at[key, 'Survival']
            age_value = df.at[key, 'Age']
        except KeyError as e:
            missing += 1
            print (missing, str(e), "Label Not found, deleting entry")
            continue
        kept_rows.append(row)
        survival.append(survival_value)
        ages.append(age_value)

    y = np.array(survival, dtype=float)
    age_column = np.array(ages, dtype=float).reshape(-1, 1)
    x = np.concatenate((scans[np.array(kept_rows, dtype=int)], age_column), axis=1)

    clf = xgb.XGBRegressor()
    clf.fit(x, y)
    return clf
项目:easyML    作者:aarshayj    | 项目源码 | 文件源码
def __init__(
        self,data_block, predictors=None,cv_folds=10,
        scoring_metric='accuracy',additional_display_metrics=None):
        """Set up a logistic-regression model on top of ``base_classification``.

        Args:
            data_block: Data container forwarded to the base class.
            predictors: Predictor names; ``None`` (the default) means an
                empty list.
            cv_folds: Number of cross-validation folds forwarded to the base.
            scoring_metric: Scoring metric name forwarded to the base.
            additional_display_metrics: Extra metrics forwarded to the base;
                ``None`` (the default) means an empty list.

        The list defaults are created per call, not in the signature, so
        instances built with defaults do not share one list object.
        """
        if predictors is None:
            predictors = []
        if additional_display_metrics is None:
            additional_display_metrics = []

        base_classification.__init__(
            self, alg=LogisticRegression(), data_block=data_block, 
            predictors=predictors,cv_folds=cv_folds,
            scoring_metric=scoring_metric, 
            additional_display_metrics=additional_display_metrics
            )

        self.model_output=pd.Series(self.default_parameters)
        self.model_output['Coefficients'] = "-"

        #Set parameters to default values:
        self.set_parameters(set_default=True)
项目:OptML    作者:johannespetrat    | 项目源码 | 文件源码
def test_model_detection(self):
        """Optimizer.model_module should name the library behind each model."""
        sklearn_model = LogisticRegression()
        models = [
            sklearn_model,
            Pipeline([('log', sklearn_model)]),
            XGBClassifier(),
            NNModel(100,10),
        ]
        expected_modules = ['sklearn', 'pipeline', 'xgboost', 'keras']

        # Build every optimizer before asserting anything.
        optimizers = [Optimizer(candidate, [], lambda x: x) for candidate in models]

        for optimizer, module_name in zip(optimizers, expected_modules):
            self.assertEqual(optimizer.model_module, module_name)
项目:Twitter-and-IMDB-Sentimental-Analytics    作者:abhinandanramesh    | 项目源码 | 文件源码
def build_models_NLP(train_pos_vec, train_neg_vec):
    """
    Fit a BernoulliNB and a LogisticRegression model on the training vectors.

    Positive vectors are labelled "pos" and negative vectors "neg"; the
    positive rows come first in the combined training set.

    Returns the tuple (nb_model, lr_model).
    """
    Y = ["pos"] * len(train_pos_vec) + ["neg"] * len(train_neg_vec)
    train_vec = list(train_pos_vec) + list(train_neg_vec)

    # BernoulliNB uses alpha=1.0 and binarize=None; LogisticRegression keeps its defaults.
    nb_model = BernoulliNB(alpha=1.0, binarize=None, class_prior=None, fit_prior=True)
    nb_model.fit(train_vec, Y)

    lr_model = LogisticRegression()
    lr_model.fit(train_vec, Y)

    return nb_model, lr_model
项目:Twitter-and-IMDB-Sentimental-Analytics    作者:abhinandanramesh    | 项目源码 | 文件源码
def build_models_DOC(train_pos_vec, train_neg_vec):
    """
    Fit a GaussianNB and a LogisticRegression model on the training vectors.

    Positive vectors are labelled "pos" and negative vectors "neg"; the
    positive rows come first in the combined training set.

    Returns the tuple (nb_model, lr_model).
    """
    Y = ["pos"] * len(train_pos_vec) + ["neg"] * len(train_neg_vec)
    train_vec = list(train_pos_vec) + list(train_neg_vec)

    # Both estimators are created with their default parameters.
    nb_model = GaussianNB()
    nb_model.fit(train_vec, Y)

    lr_model = LogisticRegression()
    lr_model.fit(train_vec, Y)

    return nb_model, lr_model
项目:XTREE    作者:ai-se    | 项目源码 | 文件源码
def learns(tests,trains,indep=lambda x: x[:-1],
                    dep = lambda x: x[-1],
                    rf  = Abcd(),
                    lg  = Abcd(),
                    dt  = Abcd(),
                    nb  = Abcd()):
  """Train four classifiers and feed every prediction to its recorder.

  ``trainTest`` turns ``tests``/``trains`` into (x1, y1, x2, y2). A random
  forest, a logistic regression, a Gaussian naive Bayes and an entropy
  decision tree are fitted on (x1, y1) in that order, and each prediction
  on x2 is reported as ``recorder(predicted=..., actual=y2[n])``.

  NOTE: the default ``Abcd()`` recorders are created once, when the
  function is defined, so calls that omit them share the same objects.
  """
  x1,y1,x2,y2= trainTest(tests,trains,indep,dep)
  learners = [
    (RandomForestClassifier(n_estimators = 50), rf),
    (linear_model.LogisticRegression(C=1e5), lg),
    (GaussianNB(), nb),
    (DecisionTreeClassifier(criterion="entropy", random_state=1), dt),
  ]
  for model, record in learners:
    model.fit(x1, y1)
    for n, got in enumerate(model.predict(x2)):
      record(predicted = got, actual = y2[n])
项目:triage    作者:dssg    | 项目源码 | 文件源码
def test_dsapp_lr(data):
    """ScaledLogisticRegression must predict like the explicit three-step pipeline.

    The reference pipeline chains MinMaxScaler, CutOff and a default
    LogisticRegression; both models are fitted on the same training data
    and compared on ``data['X_test']``.
    """
    X_train, y_train, X_test = data['X_train'], data['y_train'], data['X_test']

    dsapp_lr = ScaledLogisticRegression()
    dsapp_lr.fit(X_train, y_train)

    reference = Pipeline([
        ('minmax_scaler', preprocessing.MinMaxScaler()),
        ('dsapp_cutoff', CutOff()),
        ('lr', linear_model.LogisticRegression()),
    ])
    reference.fit(X_train, y_train)

    assert np.all(dsapp_lr.predict(X_test) == reference.predict(X_test))
项目:SteinGAN    作者:DartML    | 项目源码 | 文件源码
def cv_reg_lr(trX, trY, vaX, vaY, Cs=[0.01, 0.05, 0.1, 0.5, 1., 5., 10., 50., 100.]):
    """Pick the regularisation strength C with the best validation accuracy.

    One ``LR(C=C)`` model is fitted on (trX, trY) per value in ``Cs``; its
    training and validation accuracies are printed. Printing uses the
    parenthesised single-argument form, which parses on Python 2 and 3.

    Args:
        trX, trY: Training features and labels.
        vaX, vaY: Validation features and labels.
        Cs: Candidate C values (only read, never modified).

    Returns:
        The fitted model with the highest validation accuracy; on ties
        ``np.argmax`` selects the first such candidate.
    """
    tr_accs = []
    va_accs = []
    models = []
    for C in Cs:
        model = LR(C=C)
        model.fit(trX, trY)
        tr_acc = metrics.accuracy_score(trY, model.predict(trX))
        va_acc = metrics.accuracy_score(vaY, model.predict(vaX))
        print('%.4f %.4f %.4f'%(C, tr_acc, va_acc))
        tr_accs.append(tr_acc)
        va_accs.append(va_acc)
        models.append(model)
    best = np.argmax(va_accs)
    print('best model C: %.4f tr_acc: %.4f va_acc: %.4f'%(Cs[best], tr_accs[best], va_accs[best]))
    return models[best]
项目:MultimodalAutoencoder    作者:natashamjaques    | 项目源码 | 文件源码
def train_and_predict(self, param_dict, predict_on='val'):
        """Train a logistic regression with the given settings and predict.

        Args:
            param_dict: Dictionary providing the 'penalty' and 'C' settings.
            predict_on: 'test' (lower case) selects the data loader's test
                set; any other value selects the validation set.

        Returns: The predictions returned by ``self.predict_on_data`` for
            the selected dataset.
        """
        predict_X = (self.data_loader.test_X if predict_on == 'test'
                     else self.data_loader.val_X)

        penalty = param_dict['penalty']
        inverse_reg = param_dict['C']
        self.model = linear_model.LogisticRegression(penalty=penalty, C=inverse_reg)
        self.model.fit(self.data_loader.train_X, self.data_loader.train_Y)

        return self.predict_on_data(predict_X)
项目:searchgrid    作者:jnothman    | 项目源码 | 文件源码
def test_build_param_grid_set_estimator():
    """build_param_grid should expand alternative estimators set on a pipeline step.

    Estimators carrying their own grid each get a separate grid entry,
    while the two estimators without a grid share one entry.
    """
    clf1 = SVC()
    clf2 = LogisticRegression()
    clf3 = SVC()
    clf4 = SGDClassifier()

    selector = set_grid(SelectKBest(), k=[2, 3])
    pipeline = Pipeline([('sel', selector), ('clf', None)])
    candidates = [set_grid(clf1, kernel=['linear']),
                  clf2,
                  set_grid(clf3, kernel=['poly'], degree=[2, 3]),
                  clf4]
    estimator = set_grid(pipeline, clf=candidates)

    expected = [
        {'clf': [clf1], 'clf__kernel': ['linear'], 'sel__k': [2, 3]},
        {'clf': [clf3], 'clf__kernel': ['poly'],
         'clf__degree': [2, 3], 'sel__k': [2, 3]},
        {'clf': [clf2, clf4], 'sel__k': [2, 3]},
    ]
    assert build_param_grid(estimator) == expected
项目:searchgrid    作者:jnothman    | 项目源码 | 文件源码
def test_make_grid_search():
    """make_grid_search should work for one estimator and for a list of them."""
    X, y = load_iris(return_X_y=True)
    lr = LogisticRegression()
    svc = set_grid(SVC(), kernel=['poly'], degree=[2, 3])

    searches = [
        make_grid_search(lr, cv=5),  # empty grid
        make_grid_search(svc, cv=5),
        make_grid_search([lr, svc], cv=5),
    ]
    for search, expected_count in zip(searches, (1, 2, 3)):
        search.fit(X, y)
        assert search.cv == 5
        assert len(search.cv_results_['params']) == expected_count

    # In the combined search, the SVC rows carry the two degree values.
    combined = searches[-1]
    is_svc = combined.cv_results_['param_root'] == svc
    assert is_svc.sum() == 2
    assert combined.cv_results_['param_root__degree'][is_svc].tolist() == [2, 3]
    assert combined.cv_results_['param_root'][~is_svc].tolist() == [lr]
项目:coremltools    作者:gsabran    | 项目源码 | 文件源码
def convert(model, feature_names, target):
    """Convert a fitted LogisticRegression model for Core ML.

    Parameters
    ----------
    model: LogisticRegression
        A trained LogisticRegression model (it must have ``coef_``).

    feature_names: [str]
        Name of the input columns.

    target: str
        Name of the output column.

    Returns
    -------
    The result of ``_convert(model, feature_names, target)`` wrapped in
    ``_MLModel``.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available.
    """
    if not _HAS_SKLEARN:
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    def _has_coefficients(candidate):
        return hasattr(candidate, 'coef_')

    _sklearn_util.check_expected_type(model, LogisticRegression)
    _sklearn_util.check_fitted(model, _has_coefficients)

    spec = _convert(model, feature_names, target)
    return _MLModel(spec)
项目:Deep-Learning-with-Theano    作者:PacktPublishing    | 项目源码 | 文件源码
def cv_reg_lr(trX, trY, vaX, vaY, Cs=[0.01, 0.05, 0.1, 0.5, 1., 5., 10., 50., 100.]):
    """Pick the regularisation strength C with the best validation accuracy.

    One ``LR(C=C)`` model is fitted on (trX, trY) per value in ``Cs``; its
    training and validation accuracies are printed. Printing uses the
    parenthesised single-argument form, which parses on Python 2 and 3.

    Args:
        trX, trY: Training features and labels.
        vaX, vaY: Validation features and labels.
        Cs: Candidate C values (only read, never modified).

    Returns:
        The fitted model with the highest validation accuracy; on ties
        ``np.argmax`` selects the first such candidate.
    """
    tr_accs = []
    va_accs = []
    models = []
    for C in Cs:
        model = LR(C=C)
        model.fit(trX, trY)
        tr_acc = metrics.accuracy_score(trY, model.predict(trX))
        va_acc = metrics.accuracy_score(vaY, model.predict(vaX))
        print('%.4f %.4f %.4f'%(C, tr_acc, va_acc))
        tr_accs.append(tr_acc)
        va_accs.append(va_acc)
        models.append(model)
    best = np.argmax(va_accs)
    print('best model C: %.4f tr_acc: %.4f va_acc: %.4f'%(Cs[best], tr_accs[best], va_accs[best]))
    return models[best]
项目:stacked_generalization    作者:fukatani    | 项目源码 | 文件源码
def test_stacked_classfier_extkfold(self):
        """StackedClassifier with an externally supplied fold object scores above 0.9 on iris."""
        data, target = self.iris.data, self.iris.target

        meta_learner = LogisticRegression(random_state=1)
        base_learners = [
            RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1),
            RidgeClassifier(random_state=1),
        ]
        external_folds = StratifiedKFold(target, 3)
        stacker = StackedClassifier(meta_learner,
                                    base_learners,
                                    n_folds=3,
                                    verbose=0,
                                    Kfold=external_folds,
                                    stack_by_proba=False,
                                    oob_score_flag=True,
                                    oob_metrics=log_loss)
        stacker.fit(data, target)

        score = stacker.score(data, target)
        self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
项目:stacked_generalization    作者:fukatani    | 项目源码 | 文件源码
def test_fwls_classfier(self):
        """FWLSClassifier with an all-ones feature function scores above 0.9 on iris."""
        data, target = self.iris.data, self.iris.target

        def feature_func(x):
            # Constant meta-features with the same shape as the input.
            return np.ones(x.shape)

        meta_learner = LogisticRegression(random_state=1)
        base_learners = [
            RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1),
            RidgeClassifier(random_state=1),
        ]
        stacker = FWLSClassifier(meta_learner,
                                 base_learners,
                                 feature_func=feature_func,
                                 n_folds=3,
                                 verbose=0,
                                 Kfold=StratifiedKFold(target, 3),
                                 stack_by_proba=False)
        stacker.fit(data, target)

        score = stacker.score(data, target)
        self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
项目:gcForest    作者:kingfengji    | 项目源码 | 文件源码
def prec_log(X_train, y_train, X_test, y_test):
    """Fit a SAG logistic regression and log its accuracy on the test data.

    Dense inputs are flattened to two dimensions (one row per sample);
    sparse inputs are left as they are. The earlier version repeated the
    reshape unconditionally after the guarded one, which defeated the
    ``issparse`` guard.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.

    Returns:
        Tuple ``(clf, y_pred)``: the fitted classifier and its predictions
        for ``X_test``.
    """
    from sklearn.linear_model import LogisticRegression
    if not issparse(X_train):
        X_train = X_train.reshape((X_train.shape[0], -1))
    if not issparse(X_test):
        X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = LogisticRegression(solver='sag', n_jobs=-1, verbose=1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Fraction of test samples predicted correctly.
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_log={:.6f}%'.format(prec*100.0))
    return clf, y_pred
项目:abcpy    作者:eth-cscs    | 项目源码 | 文件源码
def distance(self, d1, d2):
        """Classifier-based distance between two datasets.

        Summary statistics of ``d1`` are labelled 0 and those of ``d2``
        are labelled 1. An L1-penalised logistic regression (C=1e5) is
        fitted on the combined statistics, and its accuracy on that same
        data is rescaled as ``2 * (accuracy - 0.5)``.

        Returns:
            The rescaled training accuracy.
        """
        # Extract summary statistics from the dataset
        stats_first = self.statistics_calc.statistics(d1)
        stats_second = self.statistics_calc.statistics(d2)

        # Stack both sets of statistics and build the matching 0/1 labels
        features = np.concatenate((stats_first, stats_second), axis=0)
        labels = np.concatenate((np.zeros(len(stats_first)),
                                 np.ones(len(stats_second))), axis=0)

        classifier = linear_model.LogisticRegression(C=1e5, penalty='l1')
        classifier.fit(features, labels)
        accuracy = classifier.score(features, labels)

        return 2.0 * (accuracy - 0.5)
项目:assignments    作者:iit-cs579    | 项目源码 | 文件源码
def cross_validation_accuracy(clf, X, labels, k):
    """
    Average the testing accuracy of a classifier over k cross-validation
    folds. sklearn's KFold class may be used; neither a random seed nor
    shuffling is required.

    Params:
      clf......A LogisticRegression classifier.
      X........A csr_matrix of features.
      labels...The true labels for each instance in X
      k........The number of cross-validation folds.

    Returns:
      The average testing accuracy of the classifier
      over each fold of cross-validation.

    Not implemented yet: the function currently returns None.
    """
    ###TODO
    return None
项目:assignments    作者:iit-cs579    | 项目源码 | 文件源码
def fit_best_classifier(docs, labels, best_result):
    """
    Re-vectorize all the training data with the best setting found by
    eval_all_combinations, then fit a LogisticRegression classifier to
    all of it (no cross-validation is done here).

    Params:
      docs..........List of training document strings.
      labels........The true labels for each training document (0 or 1)
      best_result...Element of eval_all_combinations
                    with highest accuracy
    Returns:
      clf.....A LogisticRegression classifier fit to all
            training data.
      vocab...The dict from feature name to column index.

    Not implemented yet: the function currently returns None.
    """
    ###TODO
    return None
项目:assignments    作者:iit-cs579    | 项目源码 | 文件源码
def top_coefs(clf, label, n, vocab):
    """
    Report the n features whose coefficients are highest for the given
    label in this classifier (see the .coef_ attribute of
    LogisticRegression).

    Params:
      clf.....LogisticRegression classifier
      label...1 or 0; if 1, return the top coefficients
              for the positive class; else for negative.
      n.......The number of coefficients to return.
      vocab...Dict from feature name to column index.
    Returns:
      List of (feature_name, coefficient) tuples, SORTED
      in descending order of the coefficient for the
      given class label.

    Not implemented yet: the function currently returns None.
    """
    ###TODO
    return None
项目:assignments    作者:iit-cs579    | 项目源码 | 文件源码
def print_top_misclassified(test_docs, test_labels, X_test, clf, n):
    """
    Print the n testing documents that are misclassified by the largest
    margin.

    The .predict_proba function of LogisticRegression
    (https://goo.gl/4WXbYA) gives the predicted probability of each
    class for each instance. All incorrectly classified documents are
    identified first and then sorted in descending order of the
    predicted probability for the incorrect class. E.g., if document i
    is misclassified as positive, the probability of the positive class
    is the one considered when sorting.

    Params:
      test_docs.....List of strings, one per test document
      test_labels...Array of true testing labels
      X_test........csr_matrix for test data
      clf...........LogisticRegression classifier fit on all training
                    data.
      n.............The number of documents to print.

    Returns:
      Nothing; see Log.txt for example printed output.

    Not implemented yet: the function currently prints nothing.
    """
    ###TODO
    return None
项目:nlp100knock    作者:ff9900    | 项目源码 | 文件源码
def task73(features):
    """Fit a two-sample logistic regression on per-label word counts.

    Each row of ``features`` is read as ``[label, word, ...]``. Rows whose
    label is the string '-1' are counted into the negative vector and all
    other rows into the positive vector. The model is then fitted on
    ``[pos_vec, neg_vec]`` with targets ``[1, -1]``.

    Two fixes over the earlier version:
      * '-1' rows used to be added to ``pos_vec`` (trained as class 1), so
        the two classes were swapped.
      * the column of each word is found through a dict instead of a
        linear ``words.index`` scan for every row.

    Returns:
        Tuple ``(words, model)``: the vocabulary list (its order defines
        the feature columns) and the fitted model.
    """
    features = numpy.array(features)
    words = list(set(features[:, 1]))
    column_of = {word: column for column, word in enumerate(words)}

    pos_vec = numpy.zeros(len(words))
    neg_vec = numpy.zeros(len(words))

    for feature in features:
        index = column_of[feature[1]]
        if feature[0] == '-1':
            neg_vec[index] += 1
        else:
            pos_vec[index] += 1

    model = linear_model.LogisticRegression()
    model.fit([pos_vec, neg_vec], [1, -1])

    return (words, model)
项目:question-answering    作者:emorynlp    | 项目源码 | 文件源码
def test_lr_on_data(X_train, y_train, X_validate, y_validate, X_test, y_test):
    """Train an lbfgs logistic regression and evaluate it with tuned thresholds.

    ``y_train`` is flattened by one level before fitting. For the
    validation set and then the test set, the last-column probability of
    every sample is taken as its score, a threshold is searched with
    ``find_threshold_logistic`` on that same set, and precision/recall/F1
    at that threshold are logged.

    Returns:
        Tuple ``(precision, recall, f1)`` measured on the test set.
    """
    flat_train_labels = list(itertools.chain.from_iterable(y_train))

    # Train LR Model
    lr = LogisticRegression(solver='lbfgs')
    lr.fit(X_train, flat_train_labels)

    def last_class_scores(samples):
        # Probability in the last column of predict_proba for each sample.
        return array([row[-1] for row in lr.predict_proba(samples)])

    # Test model on validation set
    val_scores = last_class_scores(X_validate)
    val_threshold = find_threshold_logistic(y_validate, val_scores, val_scores)
    precision_val, recall_val, f1_val = evaluate_with_threshold(
        y_validate, val_scores, val_scores, val_threshold)
    globals.logger.info("Found threshold: %f. Precision/recall/f1 over validation set: %f/%f/%f" %
                        (val_threshold, precision_val, recall_val, f1_val))

    # Test model on test set
    test_scores = last_class_scores(X_test)
    test_threshold = find_threshold_logistic(y_test, test_scores, test_scores, verbose=True)
    precision, recall, f1 = evaluate_with_threshold(
        y_test, test_scores, test_scores, test_threshold)
    globals.logger.info("Found threshold: %f. Precision/recall/f1 over test set: %f/%f/%f" %
                        (test_threshold, precision, recall, f1))

    return precision, recall, f1
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def convert(model, feature_names, target):
    """Convert a fitted LogisticRegression model for Core ML.

    Parameters
    ----------
    model: LogisticRegression
        A trained LogisticRegression model (it must have ``coef_``).

    feature_names: [str]
        Name of the input columns.

    target: str
        Name of the output column.

    Returns
    -------
    The result of ``_convert(model, feature_names, target)`` wrapped in
    ``_MLModel``.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available.
    """
    if not _HAS_SKLEARN:
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    def _has_coefficients(candidate):
        return hasattr(candidate, 'coef_')

    _sklearn_util.check_expected_type(model, LogisticRegression)
    _sklearn_util.check_fitted(model, _has_coefficients)

    spec = _convert(model, feature_names, target)
    return _MLModel(spec)
项目:AutismVoicePrint    作者:opraveen    | 项目源码 | 文件源码
def train_using_logistic(feat1, feat2):
    """Fit a logistic regression that separates two feature sets.

    Rows of ``feat1`` receive label 1.0 and rows of ``feat2`` label 2.0.
    The shapes and the training-set score are printed.

    Returns:
        The fitted ``linear_model.LogisticRegression(C=1e5)``.
    """
    n_plus, n_minus = len(feat1), len(feat2)

    X = np.concatenate((feat1, feat2), axis=0)
    # 0/1 labels shifted by one, giving classes 1.0 and 2.0.
    y = np.concatenate((np.zeros(n_plus), np.ones(n_minus)), axis=0) + 1

    print(X.shape, y.shape, n_plus, n_minus, feat1.shape, feat2.shape)

    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(X, y)

    training_score = logreg.score(X, y)
    print("Score using logistic regression on training data is ", training_score)
    return logreg
项目:tpai_comp    作者:luuuyi    | 项目源码 | 文件源码
def generate_LR_model(file_name):
    """Train a SAG logistic regression on selected columns and save it.

    The frame read by ``read_from_file`` is reduced to the columns matching
    the regex below. The first selected column is used as the target and
    the remaining ones as features (assumes ``label`` is the first matching
    column - TODO confirm). The fitted model is dumped to 'LR.model'.

    Changes over the earlier version: output uses the parenthesised
    ``print`` form (valid on Python 2 and 3), and ``DataFrame.values``
    replaces ``as_matrix()``, which newer pandas no longer provides.

    Args:
        file_name: Path handed to ``read_from_file``.

    Returns:
        The fitted classifier.
    """
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|connectionType_.*|telecomsOperator_.*|sitesetID_.*|positionType_.*|gender_.*|haveBaby_.*|age_scaled')
    train_np = selected_train_df.values
    y = train_np[:,0]
    X = train_np[:,1:]
    print('Train Logistic Regression Model...')
    start_time  = datetime.datetime.now()
    clf = linear_model.LogisticRegression(penalty='l2',C=1.0,solver='sag',n_jobs=-1, tol=1e-6, max_iter=200)#, class_weight='balanced')
    clf.fit(X,y)
    end_time = datetime.datetime.now()
    print('Training Done..., Time Cost: ')
    # `.seconds` is the seconds component of the elapsed timedelta.
    print((end_time-start_time).seconds)

    print('Save Model...')
    joblib.dump(clf, 'LR.model')
    return clf
项目:magic    作者:pan-webis-de    | 项目源码 | 文件源码
def get_classifier(method='logistic_regression'):
    """Create an unfitted classifier for the given method name.

    Args:
        method: 'logistic_regression', 'random_forest' or
            'gradient_boosting'.

    Returns:
        The configured estimator, or None when ``method`` is none of the
        three supported names.
    """
    if method == 'logistic_regression':
        classifier = LogisticRegression(C=1e3,
                                        tol=0.01,
                                        multi_class='ovr',
                                        solver='liblinear',
                                        n_jobs=-1,
                                        random_state=123)
    elif method == 'random_forest':
        classifier = RandomForestClassifier(n_estimators=250,
                                            bootstrap=False,
                                            n_jobs=-1,
                                            random_state=123)
    elif method == 'gradient_boosting':
        classifier = xgb.XGBClassifier(max_depth=10,
                                       subsample=0.7,
                                       n_estimators=500,
                                       min_child_weight=0.05,
                                       colsample_bytree=0.3,
                                       learning_rate=0.1)
    else:
        classifier = None
    return classifier
项目:Using-machine-learning-to-detect-malicious-URLs    作者:faizann24    | 项目源码 | 文件源码
def TL():
    """Train a TF-IDF + logistic regression URL classifier.

    Reads './data/data.csv', shuffles the rows, vectorises the first
    column with a ``TfidfVectorizer`` using the ``getTokens`` tokenizer and
    treats the second column as the label. The model is trained on 80% of
    the data and its score on the remaining 20% is printed.

    The rows are shuffled with ``np.random.shuffle``. The earlier
    ``random.shuffle`` swaps rows of a 2-D NumPy array through views,
    which overwrites and duplicates rows instead of permuting them.

    Returns:
        Tuple ``(vectorizer, lgs)``: the fitted vectorizer and classifier.
    """
    allurls = './data/data.csv' #path to our all urls file
    allurlscsv = pd.read_csv(allurls,',',error_bad_lines=False) #reading file
    allurlsdata = pd.DataFrame(allurlscsv)  #converting to a dataframe

    allurlsdata = np.array(allurlsdata) #converting it into an array
    np.random.shuffle(allurlsdata) #shuffling rows in place (safe for 2-D arrays)

    y = [d[1] for d in allurlsdata] #all labels 
    corpus = [d[0] for d in allurlsdata]    #all urls corresponding to a label (either good or bad)
    vectorizer = TfidfVectorizer(tokenizer=getTokens)   #get a vector for each url but use our customized tokenizer
    X = vectorizer.fit_transform(corpus)    #get the X vector

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)   #split into training and testing set 80/20 ratio

    lgs = LogisticRegression()  #using logistic regression
    lgs.fit(X_train, y_train)
    print(lgs.score(X_test, y_test))    #print the score on the held-out 20%
    return vectorizer, lgs
项目:scikit-mdr    作者:EpistasisLab    | 项目源码 | 文件源码
def test_mdr_sklearn_pipeline():
    """Ensure that MDR can be used as a transformer in a scikit-learn pipeline"""
    rows = [[2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
            [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
            [0, 0], [0, 0], [0, 0], [1, 1], [1, 1]]
    features = np.array(rows)
    # Ten samples of class 1 followed by five of class 0.
    classes = np.array([1] * 10 + [0] * 5)

    pipeline = make_pipeline(MDR(), LogisticRegression())
    folds = StratifiedKFold(n_splits=5, shuffle=True)
    cv_scores = cross_val_score(pipeline, features, classes, cv=folds)
    assert np.mean(cv_scores) > 0.
项目:scikit-mdr    作者:EpistasisLab    | 项目源码 | 文件源码
def test_mdr_sklearn_pipeline_parallel():
    """Ensure that MDR can be used as a transformer in a parallelized scikit-learn pipeline"""
    rows = [[2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
            [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
            [0, 0], [0, 0], [0, 0], [1, 1], [1, 1]]
    features = np.array(rows)
    # Ten samples of class 1 followed by five of class 0.
    classes = np.array([1] * 10 + [0] * 5)

    pipeline = make_pipeline(MDR(), LogisticRegression())
    folds = StratifiedKFold(n_splits=5, shuffle=True)
    cv_scores = cross_val_score(pipeline, features, classes, cv=folds, n_jobs=-1)
    assert np.mean(cv_scores) > 0.
项目:OpinionSpam    作者:Coder-Yu    | 项目源码 | 文件源码
def fitAndPredict(self):
        """Fit an SVC on the training data and predict a label per test user.

        The classifier is stored on ``self.classifier``. Every user in
        ``self.testDict`` is predicted from the single feature value
        ``self.BDS[user]``.

        Returns:
            dict mapping each user to the array returned by ``predict``
            for that user.
        """
        self.classifier = SVC()
        self.classifier.fit(self.trainingSet, self.trainingLabel)

        # One single-sample, single-feature prediction per user.
        return {user: self.classifier.predict([[self.BDS[user]]])
                for user in self.testDict}
项目:OpinionSpam    作者:Coder-Yu    | 项目源码 | 文件源码
def fitAndPredict(self):
        """Train LogisticRegression and SVC on bag-of-words features and print reports.

        The training and test documents are concatenated, turned into a gensim
        dictionary / bag-of-words corpus and densified into one matrix. When
        the module-level flag ``PCA_Applied`` is truthy the matrix is reduced
        with ``PCA(n_components=PCA_nComponents)``. The first
        ``len(self.trainingSet)`` rows are used for fitting, the remaining
        rows for prediction, and a ``classification_report`` against
        ``self.testLabel`` is printed for each classifier.

        Returns:
            None. Results are only printed.
        """
        n_train = len(self.trainingSet)
        documents = self.trainingSet + self.testSet
        dictionary = corpora.Dictionary(documents)

        bow_corpus = [dictionary.doc2bow(text) for text in documents]
        # Transposed so that rows can be sliced per document below.
        text_matrix = gensim.matutils.corpus2dense(
            bow_corpus, num_terms=len(dictionary.token2id)).T

        if PCA_Applied:
            # NOTE(review): PCA is fitted on training and test rows together.
            pca = PCA(n_components=PCA_nComponents)
            text_matrix = pca.fit_transform(text_matrix)

        train_matrix = text_matrix[0:n_train]
        test_matrix = text_matrix[n_train:]

        # Single-argument print(...) behaves the same on Python 2 and 3.
        for title, make_classifier in (('Logistic:', LogisticRegression),
                                       ('SVM:', SVC)):
            classifier = make_classifier()
            classifier.fit(train_matrix, self.trainingLabel)
            pred_labels = classifier.predict(test_matrix)
            print(title)
            print(classification_report(self.testLabel, pred_labels))
项目:OpinionSpam    作者:Coder-Yu    | 项目源码 | 文件源码
def fitAndPredict(self):
        """Train LogisticRegression and SVC on TF-IDF features and print reports.

        The training and test documents are concatenated, converted to a
        gensim bag-of-words corpus, re-weighted with a ``TfidfModel`` built
        from that same corpus and densified into one matrix. When the
        module-level flag ``PCA_Applied`` is truthy the matrix is reduced with
        ``PCA(n_components=PCA_nComponents)``. The first
        ``len(self.trainingSet)`` rows are used for fitting, the remaining
        rows for prediction, and a ``classification_report`` against
        ``self.testLabel`` is printed for each classifier.

        Returns:
            None. Results are only printed.
        """
        n_train = len(self.trainingSet)
        documents = self.trainingSet + self.testSet
        dictionary = corpora.Dictionary(documents)
        bow_corpus = [dictionary.doc2bow(text) for text in documents]
        tfidf = models.TfidfModel(bow_corpus)
        tfidf_corpus = [text for text in tfidf[bow_corpus]]
        # Transposed so that rows can be sliced per document below.
        text_matrix = gensim.matutils.corpus2dense(
            tfidf_corpus, num_terms=len(dictionary.token2id)).T

        if PCA_Applied:
            # NOTE(review): PCA is fitted on training and test rows together.
            pca = PCA(n_components=PCA_nComponents)
            text_matrix = pca.fit_transform(text_matrix)

        train_matrix = text_matrix[0:n_train]
        test_matrix = text_matrix[n_train:]

        # Single-argument print(...) behaves the same on Python 2 and 3.
        for title, make_classifier in (('Logistic:', LogisticRegression),
                                       ('SVM:', SVC)):
            classifier = make_classifier()
            classifier.fit(train_matrix, self.trainingLabel)
            pred_labels = classifier.predict(test_matrix)
            print(title)
            print(classification_report(self.testLabel, pred_labels))
项目:OpinionSpam    作者:Coder-Yu    | 项目源码 | 文件源码
def fitAndPredict(self):
        """Fit an SVC on the training data and predict one label per test user.

        A default ``SVC`` is fitted with ``self.trainingSet`` /
        ``self.trainingLabel``. Every key of ``self.testDict`` is then
        classified from the one-row sample
        ``[self.MUD[user], self.RUD[user], self.QUD[user]]``.

        Returns:
            dict: maps each key of ``self.testDict`` to whatever ``predict``
            returns for that user's sample.
        """
        # NOTE: LogisticRegression and DecisionTreeClassifier(criterion='entropy')
        # variants used to live here as commented-out code; only the SVC path
        # is active.
        svm = SVC()
        svm.fit(self.trainingSet, self.trainingLabel)

        predictions = {}
        for user in self.testDict:
            sample = [self.MUD[user], self.RUD[user], self.QUD[user]]
            predictions[user] = svm.predict([sample])
        return predictions
项目:WEARING    作者:nlkim0817    | 项目源码 | 文件源码
def cv_reg_lr(trX, trY, vaX, vaY, Cs=[0.01, 0.05, 0.1, 0.5, 1., 5., 10., 50., 100.]):
    """Fit one ``LR`` model per value of ``C`` and return the best on validation.

    For every ``C`` in ``Cs`` a model ``LR(C=C)`` is fitted on
    ``(trX, trY)``, and its accuracy on the training and validation sets is
    printed. The model with the highest validation accuracy is returned;
    ``np.argmax`` picks the first one on ties.

    Args:
        trX, trY: training samples and labels.
        vaX, vaY: validation samples and labels.
        Cs: candidate values passed as ``C``. The default list is only read,
            never mutated.

    Returns:
        The fitted model with the best validation accuracy.
    """
    tr_accs = []
    va_accs = []
    models = []
    for C in Cs:
        model = LR(C=C)
        model.fit(trX, trY)
        tr_acc = metrics.accuracy_score(trY, model.predict(trX))
        va_acc = metrics.accuracy_score(vaY, model.predict(vaX))
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('%.4f %.4f %.4f' % (C, tr_acc, va_acc))
        tr_accs.append(tr_acc)
        va_accs.append(va_acc)
        models.append(model)
    best = np.argmax(va_accs)
    print('best model C: %.4f tr_acc: %.4f va_acc: %.4f' % (Cs[best], tr_accs[best], va_accs[best]))
    return models[best]
项目:healthcareai-py    作者:HealthCatalyst    | 项目源码 | 文件源码
def prepare_fit_model_for_factors(model_type, x_train, y_train):
    """
    Build and fit a linear model matching the requested model type.

    Args:
        model_type (str): 'classification' selects LogisticRegression,
            'regression' selects LinearRegression.
        x_train: training features passed to ``fit``.
        y_train: training targets passed to ``fit``.

    Returns:
        (sklearn.base.BaseEstimator): The fitted model, or None when
        ``model_type`` is neither of the two supported values.
    """
    if model_type == 'classification':
        estimator = LogisticRegression()
    elif model_type == 'regression':
        estimator = LinearRegression()
    else:
        # Unknown model type: nothing to fit.
        return None

    estimator.fit(x_train, y_train)
    return estimator
项目:AutoML5    作者:djajetic    | 项目源码 | 文件源码
def __init__(self, info, verbose=True, debug_mode=False):
        """Select and configure the underlying model from the dataset description.

        Args:
            info (dict): dataset description. ``label_num``, ``target_num``,
                ``task`` and ``metric`` are always read; ``is_sparse`` and
                ``has_categorical`` are read depending on the task.
            verbose: forwarded as ``verbose`` to the chosen estimator.
            debug_mode: when ``>= 2`` a ``RandomPredictor`` is used instead of
                a real model and no further configuration happens.

        Sets ``self.name``, ``self.model`` and ``self.predict_method`` (a bound
        ``predict`` for regression, ``predict_proba`` otherwise).
        """
        self.label_num = info['label_num']
        self.target_num = info['target_num']
        self.task = info['task']
        self.metric = info['metric']
        # To calibrate proba (balance=True was the previously tried alternative).
        self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False)
        if debug_mode >= 2:
            self.name = "RandomPredictor"
            self.model = RandomPredictor(self.target_num)
            self.predict_method = self.model.predict_proba
            return
        if info['task'] == 'regression':
            # Kept as an equality test so non-bool values behave as before.
            if info['is_sparse'] == True:
                self.name = "BaggingRidgeRegressor"
                self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
            else:
                self.name = "GradientBoostingRegressor"
                self.model = GradientBoostingRegressor(n_estimators=1, max_depth=4, min_samples_split=14, verbose=verbose, warm_start=True)
            # Regression branch uses plain predict, not predict_proba.
            self.predict_method = self.model.predict
        else:
            if info['has_categorical']:  # Out of laziness, we do not convert categorical variables...
                self.name = "RandomForestClassifier"
                self.model = RandomForestClassifier(n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
            elif info['is_sparse']:
                self.name = "BaggingNBClassifier"
                self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
            else:
                self.name = "GradientBoostingClassifier"
                # Direct construction instead of eval() on a string-built call:
                # same estimator, no evaluation of str(verbose) as code.
                self.model = GradientBoostingClassifier(n_estimators=1, verbose=verbose, random_state=1, warm_start=True)
            if info['task'] == 'multilabel.classification':
                self.model = MultiLabelEnsemble(self.model)
            self.predict_method = self.model.predict_proba
项目:human-rl    作者:gsastry    | 项目源码 | 文件源码
def run_predict_logistic_regression(X_train,Y_train,X_test,Y_test):
    """Fit a default LogisticRegression, print its train/test scores, return it.

    The scores come from ``zero_one_score`` (defined elsewhere), called on the
    true labels and the model's predictions for the training and test sets.

    Returns:
        The value returned by ``LogisticRegression().fit(X_train, Y_train)``.
    """
    model = LogisticRegression().fit(X_train, Y_train)
    test_pred = model.predict(X_test)
    train_score = zero_one_score(Y_train, model.predict(X_train))
    test_score = zero_one_score(Y_test, test_pred)
    print('Logistic 0-1 error. \n Training: ', train_score,
          '\n Test:', test_score)

    return model
项目:human-rl    作者:gsastry    | 项目源码 | 文件源码
def run_predict_logistic_regression(X_train,Y_train,X_test,Y_test):
    """Fit a default LogisticRegression, print its train/test scores, return it.

    The scores come from ``zero_one_score`` (defined elsewhere), called on the
    true labels and the model's predictions for the training and test sets.

    Returns:
        The value returned by ``LogisticRegression().fit(X_train, Y_train)``.
    """
    model = LogisticRegression().fit(X_train, Y_train)
    test_pred = model.predict(X_test)
    train_score = zero_one_score(Y_train, model.predict(X_train))
    test_score = zero_one_score(Y_test, test_pred)
    print('Logistic 0-1 error. \n Training: ', train_score,
          '\n Test:', test_score)

    return model
项目:human-rl    作者:gsastry    | 项目源码 | 文件源码
def run_predict_logistic_regression(X_train,Y_train,X_test,Y_test):
    """Fit a default LogisticRegression, print its train/test scores, return it.

    The scores come from ``zero_one_score`` (defined elsewhere), called on the
    true labels and the model's predictions for the training and test sets.

    Returns:
        The value returned by ``LogisticRegression().fit(X_train, Y_train)``.
    """
    model = LogisticRegression().fit(X_train, Y_train)
    test_pred = model.predict(X_test)
    train_score = zero_one_score(Y_train, model.predict(X_train))
    test_score = zero_one_score(Y_test, test_pred)
    print('Logistic 0-1 error. \n Training: ', train_score,
          '\n Test:', test_score)

    return model
项目:human-rl    作者:gsastry    | 项目源码 | 文件源码
def run_predict_logistic_regression(X_train,Y_train,X_test,Y_test):
    """Fit a default LogisticRegression, print its train/test scores, return it.

    The scores come from ``zero_one_score`` (defined elsewhere), called on the
    true labels and the model's predictions for the training and test sets.

    Returns:
        The value returned by ``LogisticRegression().fit(X_train, Y_train)``.
    """
    model = LogisticRegression().fit(X_train, Y_train)
    test_pred = model.predict(X_test)
    train_score = zero_one_score(Y_train, model.predict(X_train))
    test_score = zero_one_score(Y_test, test_pred)
    print('Logistic 0-1 error. \n Training: ', train_score,
          '\n Test:', test_score)

    return model
项目:human-rl    作者:gsastry    | 项目源码 | 文件源码
def run_predict_logistic_regression(X_train,Y_train,X_test,Y_test):
    """Fit a default LogisticRegression, print its train/test scores, return it.

    The scores come from ``zero_one_score`` (defined elsewhere), called on the
    true labels and the model's predictions for the training and test sets.

    Returns:
        The value returned by ``LogisticRegression().fit(X_train, Y_train)``.
    """
    model = LogisticRegression().fit(X_train, Y_train)
    test_pred = model.predict(X_test)
    train_score = zero_one_score(Y_train, model.predict(X_train))
    test_score = zero_one_score(Y_test, test_pred)
    print('Logistic 0-1 error. \n Training: ', train_score,
          '\n Test:', test_score)

    return model
项目:conec    作者:cod3licious    | 项目源码 | 文件源码
def train_clf(self, trainfiles):
        """Train the token classifier on all documents of the given files.

        Each ``(tokens, labels)`` pair yielded by ``yield_tokens_labels`` is
        processed separately (document by document, because of local context):
        its labels are collected and its tokens are turned into a feature
        block via ``self.make_featmat_rep``. The blocks are stacked row-wise
        and a ``logreg(class_weight='balanced', random_state=1)`` model is
        fitted on them and stored in ``self.clf``.

        Args:
            trainfiles: iterable of training files, each handed to
                ``yield_tokens_labels``.
        """
        all_labels = []
        feature_blocks = []
        for path in trainfiles:
            for tokens, labels in yield_tokens_labels(path):
                all_labels += labels
                feature_blocks.append(self.make_featmat_rep(tokens))
        feature_matrix = np.vstack(feature_blocks)

        print("training classifier")
        model = logreg(class_weight='balanced', random_state=1)
        model.fit(feature_matrix, all_labels)
        # Only publish the model once fitting has succeeded.
        self.clf = model
项目:ISM2017    作者:ybayle    | 项目源码 | 文件源码
def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
    """Fit the configured classifier(s) on the training set and predict the test set.

    Features and ground truths are taken from ``data`` when it is given
    (keys ``train_features``, ``train_groundtruths``, ``test_features``,
    ``test_groundtruths``); otherwise they are loaded from the ``train`` and
    ``test`` files with ``read_file``.

    Only a ``RandomForestClassifier(n_jobs=-1)`` is currently enabled. The
    returned predictions are those of the last classifier in the table.
    ``disp`` and ``outfilename`` are accepted but not used here.

    Returns:
        The result of ``predict(test_features)`` for the last classifier.
    """
    utils.print_success("Comparison of differents classifiers")
    if data is None:
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)
    else:
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]

    if not utils.create_dir(res_dir):
        res_dir = utils.abs_path_dir(res_dir)

    # Other candidates that were tried and are currently disabled:
    #   RandomForestClassifier(n_estimators=5), KNeighborsClassifier(3),
    #   GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    #   DecisionTreeClassifier(max_depth=5), MLPClassifier(),
    #   AdaBoostClassifier(), GaussianNB(), QuadraticDiscriminantAnalysis(),
    #   SVC(kernel="linear", C=0.025), GradientBoostingClassifier(),
    #   ExtraTreesClassifier(), LogisticRegression(),
    #   LinearDiscriminantAnalysis()
    classifiers = {
        "RandomForest": RandomForestClassifier(n_jobs=-1),
    }

    for name, clf in classifiers.items():
        utils.print_success(name)
        utils.print_info("\tFit")
        clf.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = clf.predict(test_features)
    return predictions
项目:PersonalizedMultitaskLearning    作者:mitmedialab    | 项目源码 | 文件源码
def reloadHelper():
    """Reload the ``helper`` module by calling ``reload(helper)``.

    NOTE(review): ``reload`` is a builtin only on Python 2; on Python 3 it
    must be imported (e.g. from ``importlib``) -- confirm against the file's
    imports.
    """
    reload(helper)

# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html