Python sklearn.metrics 模块,log_loss() 实例源码


项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def rf1(train2, y, test2, v, z):
    """Cross-validate a random-forest regressor and accumulate its predictions.

    A column named after this function is created in both ``v`` and ``z``.
    Out-of-fold predictions are added to ``v``; ``log1p`` of the test-set
    predictions is summed into ``z`` and divided by the number of folds.

    Args:
        train2: training features, indexed with the fold index arrays.
        y: training targets, indexed with the fold index arrays.
        test2: test features passed to ``clf.predict``.
        v: frame receiving out-of-fold predictions (modified in place).
        z: frame receiving the fold-averaged test predictions (modified in place).
    """
    # The output column is named after the function itself.
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 300
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    for n, (itrain, ival) in enumerate(skf.split(train2, y)):
        print('step %d of %d'%(n+1, skf.n_splits), now())
        clf = ensemble.RandomForestRegressor(n_estimators=1000,
                                             random_state=13)
        clf.fit(train2[itrain], y[itrain])

        p = clf.predict(train2[ival])
        v.loc[ival, cname] += p
        score = metrics.log_loss(y[ival], p)
        # Keep every fold score so the summary statistics below can be computed.
        scores.append(score)
        z[cname]  += np.log1p(clf.predict(test2))
        print(cname, 'step %d: score'%(n+1), score, now())

    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits
项目:stacked_generalization    作者:fukatani    | 项目源码 | 文件源码
# Test that a StackedClassifier given an explicit StratifiedKFold reaches a
# score above 0.9.
# NOTE(review): this snippet is truncated as extracted -- the `clfs` list is
# never closed and the StratifiedKFold(...) / sl.score(...) calls have lost
# their first arguments. Restore it from the upstream project before running.
def test_stacked_classfier_extkfold(self):
        # Blending classifier handed to StackedClassifier as its first argument.
        bclf = LogisticRegression(random_state=1)
        # Base classifiers (list is incomplete in this extract).
        clfs = [RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1),
        sl = StackedClassifier(bclf,
                               Kfold=StratifiedKFold(, 3),
        score = sl.score(,
        # The stacked model is expected to score better than 0.9.
        self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
项目:kaggle_airbnb    作者:svegapons    | 项目源码 | 文件源码
def opt_2_obj_func(w, X, y, n_class):
    """Function to be minimized in the EN_OPT_2 ensembler.

    In this case there is only one weight for each classification result to
    be combined.

    Parameters
    ----------
    w: ndarray size=(n_preds)
       Candidate solution to the optimization problem (vector of weights).
    X: ndarray size=(n_samples, n_preds * n_class)
       Solutions to be combined horizontally concatenated.
    y: ndarray size=(n_samples,)
       Class labels
    n_class: int
       Number of classes in the problem, i.e. = 12

    Returns
    -------
    The value of ``log_loss(y, combined)`` where ``combined`` is the weighted
    sum of the per-prediction blocks of ``X``.
    """
    # The sign of a weight is ignored: only its magnitude is used.
    w = np.abs(w)
    sol = np.zeros((X.shape[0], n_class))
    for i, weight in enumerate(w):
        # Block i of X holds the n_class probability columns of prediction i.
        sol += X[:, i*n_class:(i+1)*n_class] * weight
    # Minimizing the logloss
    return log_loss(y, sol)
项目:KaggleExeter    作者:detomo    | 项目源码 | 文件源码
# Train and predict with xgboost on each train/validation pair returned by
# split_train_validation(train); returns the (scores, preds) lists.
# NOTE(review): this snippet is truncated as extracted -- the `parameters`
# dict is never closed, and nothing visible appends to `scores` or `preds`,
# so both lists are returned empty. Restore the missing lines from the
# upstream project before use. `xrange` also makes this Python 2 only.
def cross_validate(train):
    # Separate training and validation sets (one entry per fold).
    X_train,X_valid= split_train_validation(train)
    scores = []; preds = []
    for i in xrange(len(X_train)):
        # Convert the feature columns and the 'group' label of fold i into xgboost matrices.
        dtrain = xgb.DMatrix(X_train[i][['phone_brand','device_model','timestamp']], label = X_train[i]['group'],missing=np.nan) 
        dvalid = xgb.DMatrix(X_valid[i][['phone_brand','device_model','timestamp']], label = X_valid[i]['group'],missing=np.nan)

        # Train xgboost and predict on the validation matrix.
        parameters = {'max_depth':4,'eta':0.1,'silent':1, 'subsample':0.8,'colsample_bytree':0.8,
        plst = parameters.items()
        bst = xgb.train(plst, dtrain)
        pred = bst.predict(dvalid)

        # Label the prediction columns with the encoder's class names.
        pred = pd.DataFrame(pred, index = X_valid[i].index, columns=target_encoder.classes_)
    return scores, preds
项目:tencent_social_algo    作者:Folieshell    | 项目源码 | 文件源码
def check_log_loss(max_depth, n_splits, test_size):
    """Print the mean train/validation log loss of a random forest.

    The forest is refitted on each stratified shuffle split of the
    module-level ``feature_train`` / ``trainY`` data, and the log loss on the
    training part and on the held-out part is recorded for every split.

    Args:
        max_depth: ``max_depth`` passed to ``RandomForestClassifier``.
        n_splits: number of shuffle splits to evaluate.
        test_size: ``test_size`` passed to ``StratifiedShuffleSplit``.
    """
    model = RandomForestClassifier(max_depth=max_depth, n_jobs=-1, random_state=777)
    trn_scores = []
    vld_scores = []
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=777)
    for i, (t_ind, v_ind) in enumerate(sss.split(feature_train, trainY)):
        print('# Iter {} / {}'.format(i + 1, n_splits))
        x_trn = feature_train.values[t_ind]
        y_trn = trainY[t_ind]
        x_vld = feature_train.values[v_ind]
        y_vld = trainY[v_ind]

        model.fit(x_trn, y_trn)

        # Record both losses; the means are reported after the loop.
        trn_scores.append(log_loss(y_trn, model.predict_proba(x_trn)))
        vld_scores.append(log_loss(y_vld, model.predict_proba(x_vld)))

    print("max_depth: %d   n_splits: %d    test_size: %f" % (max_depth, n_splits, test_size))
    print('# TRN logloss: {}'.format(np.mean(trn_scores)))
    print('# VLD logloss: {}'.format(np.mean(vld_scores)))
项目:bnp    作者:mpearmain    | 项目源码 | 文件源码
def runET(train_X, train_y, test_X, test_y=None, validation=1, n_est_val=50, depth_val=None, split_val=2, leaf_val=1, feat_val='auto', jobs_val=4, random_state_val=0):
        """Fit an ExtraTreesClassifier and return class-1 probabilities for ``test_X``.

        Args:
            train_X, train_y: training features and labels.
            test_X: features to predict.
            test_y: labels for ``test_X``; used only when ``validation`` is truthy.
            validation: when truthy, also compute and print train/test log loss.
            n_est_val, depth_val, split_val, leaf_val, feat_val, jobs_val,
            random_state_val: forwarded to ``ExtraTreesClassifier`` as
                ``n_estimators``, ``max_depth``, ``min_samples_split``,
                ``min_samples_leaf``, ``max_features``, ``n_jobs`` and
                ``random_state``.

        Returns:
            ``(pred_test_y, loss)`` when ``validation`` is truthy, otherwise
            ``pred_test_y`` alone.
        """
        clf = ensemble.ExtraTreesClassifier(
                n_estimators = n_est_val,
                max_depth = depth_val,
                min_samples_split = split_val,
                min_samples_leaf = leaf_val,
                max_features = feat_val,
                n_jobs = jobs_val,
                random_state = random_state_val)
        clf.fit(train_X, train_y)
        pred_test_y = clf.predict_proba(test_X)[:,1]

        if validation:
                # Training predictions are only needed for the reported train loss.
                pred_train_y = clf.predict_proba(train_X)[:,1]
                train_loss = log_loss(train_y, pred_train_y)
                loss = log_loss(test_y, pred_test_y)
                # Single pre-formatted argument: same output under Python 2 and 3.
                print("Train, Test loss :  %s %s" % (train_loss, loss))
                return pred_test_y, loss
        return pred_test_y
项目:bnp    作者:mpearmain    | 项目源码 | 文件源码
def runET(train_X, train_y, test_X, test_y=None, validation=1, n_est_val=50, depth_val=None, split_val=2, leaf_val=1, feat_val='auto', jobs_val=4, random_state_val=0):
        """Fit an ExtraTreesClassifier and return class-1 probabilities for ``test_X``.

        Args:
            train_X, train_y: training features and labels.
            test_X: features to predict.
            test_y: labels for ``test_X``; used only when ``validation`` is truthy.
            validation: when truthy, also compute and print train/test log loss.
            n_est_val, depth_val, split_val, leaf_val, feat_val, jobs_val,
            random_state_val: forwarded to ``ExtraTreesClassifier`` as
                ``n_estimators``, ``max_depth``, ``min_samples_split``,
                ``min_samples_leaf``, ``max_features``, ``n_jobs`` and
                ``random_state``.

        Returns:
            ``(pred_test_y, loss)`` when ``validation`` is truthy, otherwise
            ``pred_test_y`` alone.
        """
        clf = ensemble.ExtraTreesClassifier(
                n_estimators = n_est_val,
                max_depth = depth_val,
                min_samples_split = split_val,
                min_samples_leaf = leaf_val,
                max_features = feat_val,
                n_jobs = jobs_val,
                random_state = random_state_val)
        clf.fit(train_X, train_y)
        pred_test_y = clf.predict_proba(test_X)[:,1]

        if validation:
                # Training predictions are only needed for the reported train loss.
                pred_train_y = clf.predict_proba(train_X)[:,1]
                train_loss = log_loss(train_y, pred_train_y)
                loss = log_loss(test_y, pred_test_y)
                # Single pre-formatted argument: same output under Python 2 and 3.
                print("Train, Test loss :  %s %s" % (train_loss, loss))
                return pred_test_y, loss
        return pred_test_y
项目:bnp    作者:mpearmain    | 项目源码 | 文件源码
def runET(train_X, train_y, test_X, test_y=None, validation=1, n_est_val=50, depth_val=None, split_val=2, leaf_val=1, feat_val='auto', jobs_val=4, random_state_val=0):
        """Fit an ExtraTreesClassifier and return class-1 probabilities for ``test_X``.

        Args:
            train_X, train_y: training features and labels.
            test_X: features to predict.
            test_y: labels for ``test_X``; used only when ``validation`` is truthy.
            validation: when truthy, also compute and print train/test log loss.
            n_est_val, depth_val, split_val, leaf_val, feat_val, jobs_val,
            random_state_val: forwarded to ``ExtraTreesClassifier`` as
                ``n_estimators``, ``max_depth``, ``min_samples_split``,
                ``min_samples_leaf``, ``max_features``, ``n_jobs`` and
                ``random_state``.

        Returns:
            ``(pred_test_y, loss)`` when ``validation`` is truthy, otherwise
            ``pred_test_y`` alone.
        """
        clf = ensemble.ExtraTreesClassifier(
                n_estimators = n_est_val,
                max_depth = depth_val,
                min_samples_split = split_val,
                min_samples_leaf = leaf_val,
                max_features = feat_val,
                n_jobs = jobs_val,
                random_state = random_state_val)
        clf.fit(train_X, train_y)
        pred_test_y = clf.predict_proba(test_X)[:,1]

        if validation:
                # Training predictions are only needed for the reported train loss.
                pred_train_y = clf.predict_proba(train_X)[:,1]
                train_loss = log_loss(train_y, pred_train_y)
                loss = log_loss(test_y, pred_test_y)
                # Single pre-formatted argument: same output under Python 2 and 3.
                print("Train, Test loss :  %s %s" % (train_loss, loss))
                return pred_test_y, loss
        return pred_test_y
项目:bnp    作者:mpearmain    | 项目源码 | 文件源码
# Objective returning the negated log loss of an ExtraTreesClassifier's class-1
# probabilities on (x1, y1), so that a maximiser minimises the log loss.
# NOTE(review): this snippet is truncated as extracted -- the parameter list
# stops after `n_estimators` and the classifier construction runs straight
# into what looks like the tail of a fit call (`, y0)`). The body also reads
# max_features, max_depth, min_weight_fraction_leaf, x1, y1 and y0, none of
# which are defined in the visible code. Restore from the upstream project.
def extratreescv(n_estimators,

    # Integer hyper-parameters are cast with int() before being passed on.
    clf = ExtraTreesClassifier(n_estimators=int(n_estimators),
                               max_features= int(max_features),
                               max_depth = int(max_depth),
                               min_weight_fraction_leaf = min_weight_fraction_leaf,
                               verbose=1), y0)
    # Negated so that a larger return value means a smaller log loss.
    ll = -log_loss(y1, clf.predict_proba(x1)[:,1])
    return ll
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    """Cross-validate an xgboost model over several seeds and accumulate predictions.

    Out-of-fold predictions (after ``pconvert``) are accumulated in
    ``v[cname]`` and averaged over the seeds; test predictions are accumulated
    in ``z[cname]`` and averaged over all folds and seeds.

    Args:
        train2: training features (DataFrame); rows are selected positionally.
        y: training targets, indexed with the fold index arrays.
        test2: test features.
        v: frame receiving out-of-fold predictions (modified in place).
        z: frame receiving averaged test predictions (modified in place).
        xgb_params: xgboost parameter dict; its ``'seed'`` entry is overwritten
            on every seed iteration.
        N_splits: number of stratified folds.
        N_seeds: number of seeds to average over.
        cname: name of the column written in ``v`` and ``z``.
        base_seed: first seed; seed ``s`` uses ``base_seed + s``.
    """
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            # skf.split yields positional indices, hence .iloc (the old .ix
            # indexer no longer exists in current pandas).
            dtrain = xgb.DMatrix(train2.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            # Keep every fold score so the summary statistics below can be computed.
            scores.append(score)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
项目:AutoML5    作者:djajetic    | 项目源码 | 文件源码
def pac_metric (solution, prediction, task='binary.classification'):
    ''' Probabilistic Accuracy based on log_loss metric.
    We assume the solution is in {0, 1} and prediction in [0, 1].
    Otherwise, run normalize_array.

    solution: 2-D array (samples x labels) of true labels.
    prediction: array of predicted probabilities, same shape as solution.
    task: task name; forced to 'binary.classification' when there is a
        single label column.
    Returns the score normalized so that the prior-based baseline maps to 0
    and a perfect prediction maps to 1.'''
    [sample_num, label_num] = solution.shape
    if label_num==1: task='binary.classification'
    eps = 1e-15
    the_log_loss = log_loss(solution, prediction, task)
    # Compute the base log loss (using the prior probabilities)
    pos_num = 1.* np.sum(solution, axis=0) # float conversion!
    frac_pos = pos_num / sample_num # prior proba of positive class
    the_base_log_loss = prior_log_loss(frac_pos, task)
    # Alternative computation of the same thing (slower)
    # Should always return the same thing except in the multi-label case
    # For which the analytic solution makes more sense
    if debug_flag:
        base_prediction = np.empty(prediction.shape)
        base_prediction[:] = frac_pos  # every row predicts the class priors
        base_log_loss = log_loss(solution, base_prediction, task)
        diff = np.array(abs(the_base_log_loss-base_log_loss))
        if len(diff.shape)>0: diff=max(diff)
        # NOTE(review): tolerance restored to match the reference AutoML
        # scoring code; confirm against upstream.
        if diff > 1e-10:
            print('Arrggh {} != {}'.format(the_base_log_loss,base_log_loss))
    # Exponentiate to turn into an accuracy-like score.
    # In the multi-label case, we need to average AFTER taking the exp
    # because it is an NL operation
    pac = mvmean(np.exp(-the_log_loss))
    base_pac = mvmean(np.exp(-the_base_log_loss))
    # Normalize: 0 for random, 1 for perfect
    # np.maximum replaces the scipy top-level alias, which newer SciPy dropped.
    score = (pac - base_pac) / np.maximum(eps, (1 - base_pac))
    return score
项目:AutoML5    作者:djajetic    | 项目源码 | 文件源码
def log_loss(solution, prediction, task = 'binary.classification'):
    ''' Log loss for binary and multiclass.

    solution: 2-D array (samples x labels) of true labels.
    prediction: array of predicted probabilities, same shape as solution.
    task: 'multiclass.classification' selects the multiclass formula (when
        there is more than one label column); any other value treats each
        column as an independent binary problem.
    Returns one loss per column in the binary/multi-label case, or a single
    summed loss in the multiclass case.'''
    [sample_num, label_num] = solution.shape
    eps = 1e-15
    multiclass = (task == 'multiclass.classification') and (label_num>1)

    # Work on copies so the caller's arrays are left untouched.
    pred = np.copy(prediction)
    sol = np.copy(solution)
    if multiclass:
        # Make sure the lines add up to one for multi-class classification
        norma = np.sum(prediction, axis=1)
        pred /= np.maximum(norma, eps)[:, np.newaxis]
        # Make sure there is a single label active per line for multi-class classification
        sol = binarize_predictions(solution, task='multiclass.classification')
        # For the base prediction, this solution is ridiculous in the multi-label case

    # Bounding of predictions to avoid log(0),1/0,...
    pred = np.minimum (1-eps, np.maximum (eps, pred))
    # Compute the log loss
    pos_class_log_loss = - mvmean(sol*np.log(pred), axis=0)
    if not multiclass:
        # The multi-label case is a bunch of binary problems.
        # The second class is the negative class for each column.
        neg_class_log_loss = - mvmean((1-sol)*np.log(1-pred), axis=0)
        # Each column is an independent problem. In the multilabel case the
        # right thing is to AVERAGE, not sum, so all per-column scores are
        # returned and the caller normalizes them later on.
        loss = pos_class_log_loss + neg_class_log_loss
    else:
        # For the multiclass case the probabilities in one line add up to one,
        # so we sum the contributions of the columns.
        loss = np.sum(pos_class_log_loss)
    return loss
项目:AutoML5    作者:djajetic    | 项目源码 | 文件源码
def log_loss_(solution, prediction):
    """Return ``metrics.log_loss`` evaluated on ``solution`` and ``prediction``."""
    loss_value = metrics.log_loss(solution, prediction)
    return loss_value
项目:hyperband    作者:zygmuntz    | 项目源码 | 文件源码
def _positive_class_proba( clf, x ):
    """Return column 1 of ``clf.predict_proba(x)``, or the raw output if it has no such column."""
    p = clf.predict_proba( x )
    try:
        return p[:,1]   # sklearn convention
    except IndexError:
        return p


def _score_split( clf, x, y ):
    """Return ``(log loss, AUC, accuracy)`` of ``clf`` on the split ``(x, y)``."""
    p = _positive_class_proba( clf, x )
    return log_loss( y, p ), AUC( y, p ), accuracy( y, np.round( p ))


def train_and_eval_sklearn_classifier( clf, data ):
    """Fit ``clf`` on the training split and report metrics on both splits.

    Args:
        clf: estimator providing ``fit`` and ``predict_proba``.
        data: mapping with keys ``'x_train'``, ``'y_train'``, ``'x_test'``
            and ``'y_test'``.

    Returns:
        dict with keys ``'loss'`` and ``'log_loss'`` (both the test log loss)
        and ``'auc'`` (the test AUC).
    """
    x_train = data['x_train']
    y_train = data['y_train']

    x_test = data['x_test']
    y_test = data['y_test']

    clf.fit( x_train, y_train )

    ll, auc, acc = _score_split( clf, x_train, y_train )
    print("\n# training | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format( ll, auc, acc ))

    ll, auc, acc = _score_split( clf, x_test, y_test )
    print("# testing  | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format( ll, auc, acc ))

    # 'loss' is the test log loss (1 - auc is a possible alternative objective).
    return { 'loss': ll, 'log_loss': ll, 'auc': auc }


# "clf", even though it's a regressor
项目:human-rl    作者:gsastry    | 项目源码 | 文件源码
def predict_proba_with_loss(self, X, y):
        """Return ``(probabilities, log loss)`` for ``X`` scored against ``y``."""
        probabilities = self.predict_proba(X)
        return probabilities, log_loss(y, probabilities)

    # smallest prob given to an actual catastrophe
项目:human-rl    作者:gsastry    | 项目源码 | 文件源码
def predict_proba_with_loss(self, X, y):
        """Return ``(probabilities, log loss)`` for ``X`` scored against ``y``."""
        probabilities = self.predict_proba(X)
        return probabilities, log_loss(y, probabilities)

    # smallest prob given to an actual catastrophe
项目:human-rl    作者:gsastry    | 项目源码 | 文件源码
def predict_proba_with_loss(self, X, y):
        """Return ``(probabilities, log loss)`` for ``X`` scored against ``y``."""
        probabilities = self.predict_proba(X)
        return probabilities, log_loss(y, probabilities)

    # smallest prob given to an actual catastrophe
项目:human-rl    作者:gsastry    | 项目源码 | 文件源码
def predict_proba_with_loss(self, X, y):
        """Return ``(probabilities, log loss)`` for ``X`` scored against ``y``."""
        probabilities = self.predict_proba(X)
        return probabilities, log_loss(y, probabilities)

    # smallest prob given to an actual catastrophe
项目:human-rl    作者:gsastry    | 项目源码 | 文件源码
def predict_proba_with_loss(self, X, y):
        """Return ``(probabilities, log loss)`` for ``X`` scored against ``y``."""
        probabilities = self.predict_proba(X)
        return probabilities, log_loss(y, probabilities)

    # smallest prob given to an actual catastrophe
项目:AutoML4    作者:djajetic    | 项目源码 | 文件源码
def pac_metric (solution, prediction, task='binary.classification'):
    ''' Probabilistic Accuracy based on log_loss metric.
    We assume the solution is in {0, 1} and prediction in [0, 1].
    Otherwise, run normalize_array.

    solution: 2-D array (samples x labels) of true labels.
    prediction: array of predicted probabilities, same shape as solution.
    task: task name; forced to 'binary.classification' when there is a
        single label column.
    Returns the score normalized so that the prior-based baseline maps to 0
    and a perfect prediction maps to 1.'''
    [sample_num, label_num] = solution.shape
    if label_num==1: task='binary.classification'
    eps = 1e-15
    the_log_loss = log_loss(solution, prediction, task)
    # Compute the base log loss (using the prior probabilities)
    pos_num = 1.* np.sum(solution, axis=0) # float conversion!
    frac_pos = pos_num / sample_num # prior proba of positive class
    the_base_log_loss = prior_log_loss(frac_pos, task)
    # Alternative computation of the same thing (slower)
    # Should always return the same thing except in the multi-label case
    # For which the analytic solution makes more sense
    if debug_flag:
        base_prediction = np.empty(prediction.shape)
        base_prediction[:] = frac_pos  # every row predicts the class priors
        base_log_loss = log_loss(solution, base_prediction, task)
        diff = np.array(abs(the_base_log_loss-base_log_loss))
        if len(diff.shape)>0: diff=max(diff)
        # NOTE(review): tolerance restored to match the reference AutoML
        # scoring code; confirm against upstream.
        if diff > 1e-10:
            print('Arrggh {} != {}'.format(the_base_log_loss,base_log_loss))
    # Exponentiate to turn into an accuracy-like score.
    # In the multi-label case, we need to average AFTER taking the exp
    # because it is an NL operation
    pac = mvmean(np.exp(-the_log_loss))
    base_pac = mvmean(np.exp(-the_base_log_loss))
    # Normalize: 0 for random, 1 for perfect
    # np.maximum replaces the scipy top-level alias, which newer SciPy dropped.
    score = (pac - base_pac) / np.maximum(eps, (1 - base_pac))
    return score
项目:AutoML4    作者:djajetic    | 项目源码 | 文件源码
def log_loss(solution, prediction, task = 'binary.classification'):
    ''' Log loss for binary and multiclass.

    solution: 2-D array (samples x labels) of true labels.
    prediction: array of predicted probabilities, same shape as solution.
    task: 'multiclass.classification' selects the multiclass formula (when
        there is more than one label column); any other value treats each
        column as an independent binary problem.
    Returns one loss per column in the binary/multi-label case, or a single
    summed loss in the multiclass case.'''
    [sample_num, label_num] = solution.shape
    eps = 1e-15
    multiclass = (task == 'multiclass.classification') and (label_num>1)

    # Work on copies so the caller's arrays are left untouched.
    pred = np.copy(prediction)
    sol = np.copy(solution)
    if multiclass:
        # Make sure the lines add up to one for multi-class classification
        norma = np.sum(prediction, axis=1)
        pred /= np.maximum(norma, eps)[:, np.newaxis]
        # Make sure there is a single label active per line for multi-class classification
        sol = binarize_predictions(solution, task='multiclass.classification')
        # For the base prediction, this solution is ridiculous in the multi-label case

    # Bounding of predictions to avoid log(0),1/0,...
    pred = np.minimum (1-eps, np.maximum (eps, pred))
    # Compute the log loss
    pos_class_log_loss = - mvmean(sol*np.log(pred), axis=0)
    if not multiclass:
        # The multi-label case is a bunch of binary problems.
        # The second class is the negative class for each column.
        neg_class_log_loss = - mvmean((1-sol)*np.log(1-pred), axis=0)
        # Each column is an independent problem. In the multilabel case the
        # right thing is to AVERAGE, not sum, so all per-column scores are
        # returned and the caller normalizes them later on.
        loss = pos_class_log_loss + neg_class_log_loss
    else:
        # For the multiclass case the probabilities in one line add up to one,
        # so we sum the contributions of the columns.
        loss = np.sum(pos_class_log_loss)
    return loss
项目:AutoML4    作者:djajetic    | 项目源码 | 文件源码
def log_loss_(solution, prediction):
    """Return ``metrics.log_loss`` evaluated on ``solution`` and ``prediction``."""
    loss_value = metrics.log_loss(solution, prediction)
    return loss_value
项目:automl_gpu    作者:abhishekkrthakur    | 项目源码 | 文件源码
def pac_metric (solution, prediction, task='binary.classification'):
    ''' Probabilistic Accuracy based on log_loss metric.
    We assume the solution is in {0, 1} and prediction in [0, 1].
    Otherwise, run normalize_array.

    solution: 2-D array (samples x labels) of true labels.
    prediction: array of predicted probabilities, same shape as solution.
    task: task name; forced to 'binary.classification' when there is a
        single label column.
    Returns the score normalized so that the prior-based baseline maps to 0
    and a perfect prediction maps to 1.'''
    [sample_num, label_num] = solution.shape
    if label_num==1: task='binary.classification'
    eps = 1e-15
    the_log_loss = log_loss(solution, prediction, task)
    # Compute the base log loss (using the prior probabilities)
    pos_num = 1.* np.sum(solution, axis=0) # float conversion!
    frac_pos = pos_num / sample_num # prior proba of positive class
    the_base_log_loss = prior_log_loss(frac_pos, task)
    # Alternative computation of the same thing (slower)
    # Should always return the same thing except in the multi-label case
    # For which the analytic solution makes more sense
    if debug_flag:
        base_prediction = np.empty(prediction.shape)
        base_prediction[:] = frac_pos  # every row predicts the class priors
        base_log_loss = log_loss(solution, base_prediction, task)
        diff = np.array(abs(the_base_log_loss-base_log_loss))
        if len(diff.shape)>0: diff=max(diff)
        # NOTE(review): tolerance restored to match the reference AutoML
        # scoring code; confirm against upstream.
        if diff > 1e-10:
            print('Arrggh {} != {}'.format(the_base_log_loss,base_log_loss))
    # Exponentiate to turn into an accuracy-like score.
    # In the multi-label case, we need to average AFTER taking the exp
    # because it is an NL operation
    pac = mvmean(np.exp(-the_log_loss))
    base_pac = mvmean(np.exp(-the_base_log_loss))
    # Normalize: 0 for random, 1 for perfect
    # np.maximum replaces the scipy top-level alias, which newer SciPy dropped.
    score = (pac - base_pac) / np.maximum(eps, (1 - base_pac))
    return score
项目:automl_gpu    作者:abhishekkrthakur    | 项目源码 | 文件源码
def log_loss(solution, prediction, task = 'binary.classification'):
    ''' Log loss for binary and multiclass.

    solution: 2-D array (samples x labels) of true labels.
    prediction: array of predicted probabilities, same shape as solution.
    task: 'multiclass.classification' selects the multiclass formula (when
        there is more than one label column); any other value treats each
        column as an independent binary problem.
    Returns one loss per column in the binary/multi-label case, or a single
    summed loss in the multiclass case.'''
    [sample_num, label_num] = solution.shape
    eps = 1e-15
    multiclass = (task == 'multiclass.classification') and (label_num>1)

    # Work on copies so the caller's arrays are left untouched.
    pred = np.copy(prediction)
    sol = np.copy(solution)
    if multiclass:
        # Make sure the lines add up to one for multi-class classification
        norma = np.sum(prediction, axis=1)
        pred /= np.maximum(norma, eps)[:, np.newaxis]
        # Make sure there is a single label active per line for multi-class classification
        sol = binarize_predictions(solution, task='multiclass.classification')
        # For the base prediction, this solution is ridiculous in the multi-label case

    # Bounding of predictions to avoid log(0),1/0,...
    pred = np.minimum (1-eps, np.maximum (eps, pred))
    # Compute the log loss
    pos_class_log_loss = - mvmean(sol*np.log(pred), axis=0)
    if not multiclass:
        # The multi-label case is a bunch of binary problems.
        # The second class is the negative class for each column.
        neg_class_log_loss = - mvmean((1-sol)*np.log(1-pred), axis=0)
        # Each column is an independent problem. In the multilabel case the
        # right thing is to AVERAGE, not sum, so all per-column scores are
        # returned and the caller normalizes them later on.
        loss = pos_class_log_loss + neg_class_log_loss
    else:
        # For the multiclass case the probabilities in one line add up to one,
        # so we sum the contributions of the columns.
        loss = np.sum(pos_class_log_loss)
    return loss
项目:automl_gpu    作者:abhishekkrthakur    | 项目源码 | 文件源码
def log_loss_(solution, prediction):
    """Return ``metrics.log_loss`` evaluated on ``solution`` and ``prediction``."""
    loss_value = metrics.log_loss(solution, prediction)
    return loss_value
项目:DeepFM    作者:dwt0317    | 项目源码 | 文件源码
# Run the model over every cached batch of `test_file`, optionally write
# "label,prediction" lines to a text file under `path`, and return
# (auc, logloss) computed from gt_scores / pred_scores.
# NOTE(review): this snippet is truncated as extracted -- `day =` has no
# value, the `predictios =` call has lost its callee and closing brackets,
# nothing visible fills gt_scores / pred_scores, and the final
# `if output_prediction:` has no body (presumably it closed `wt` -- confirm).
# Restore from the upstream project before use.
def predict_test_file(preds, sess, test_file, feature_cnt, _indices, _values, _values2, _cont_values, _text_values, _shape,
                      _cont_shape, _text_shape, _y, _ind, epoch, batch_size, tag, path, output_prediction=True):
    day =
    if output_prediction:
        # Output file name combines day, tag and epoch.
        wt = open(path + '/'+str(day)+'_deepFM_pred_' + tag + str(epoch) + '.txt', 'w')

    gt_scores = []
    pred_scores = []

    for test_input_in_sp in load_data_cache(test_file):
        # Feed every placeholder from the corresponding entry of the cached batch.
        predictios =, feed_dict={
            _indices: test_input_in_sp['indices'], _values: test_input_in_sp['values'],
            _shape: test_input_in_sp['shape'], _cont_shape: test_input_in_sp['cont_shape'],
            _text_values: test_input_in_sp['text_values'], _text_shape: test_input_in_sp['text_shape'],

            _y: test_input_in_sp['labels'], _values2: test_input_in_sp['values2'],
            _cont_values: test_input_in_sp['cont_values'], _ind: test_input_in_sp['feature_indices']

        if output_prediction:
            # One "label,prediction" line per example of the batch.
            for (gt, preded) in zip(test_input_in_sp['labels'].reshape(-1).tolist(), predictios):
                wt.write('{0:d},{1:f}\n'.format(int(gt), preded))
                # pred_scores.append(1.0 if preded >= 0.5 else 0.0)
    auc = metrics.roc_auc_score(np.asarray(gt_scores), np.asarray(pred_scores))
    logloss = metrics.log_loss(np.asarray(gt_scores), np.asarray(pred_scores))
    # print('auc is ', auc, ', at epoch  ', epoch)
    if output_prediction:
    return auc, logloss
项目:aboleth    作者:data61    | 项目源码 | 文件源码
def print_k_result(ys, Ep, ll, acc, name):
    """Append accuracy and log-loss of predictions ``Ep`` to ``acc``/``ll`` and print both.

    ``Ep`` is scored against ``ys``: accuracy uses the arg-max over axis 1,
    log-loss uses ``Ep`` directly. ``name`` prefixes the printed line.
    """
    predicted_labels = Ep.argmax(axis=1)
    acc.append(accuracy_score(ys, predicted_labels))
    ll.append(log_loss(ys, Ep))
    latest_accuracy, latest_logloss = acc[-1], ll[-1]
    template = "{}: accuracy = {:.4g}, log-loss = {:.4g}"
    print(template.format(name, latest_accuracy, latest_logloss))
项目:Quantrade    作者:quant-trade    | 项目源码 | 文件源码
# Train an RNN on the SData features, predict the test set and, when
# validating, print the MSE and log loss and write a results CSV.
# NOTE(review): this snippet is truncated as extracted -- the `rnn = ...` line
# ends with the tail of what looks like a fit call (`, ytrain)`), and
# `results` and `t_id` are not defined anywhere in the visible code.
# Restore from the upstream project before use.
def main():
    validate = True
    n = SData(validate=validate)

    # NOTE(review): DataFrame.as_matrix() no longer exists in current pandas.
    Xtrain = n.train_features.as_matrix()
    ytrain = n.train_targets
    Xtest = n.test_features.as_matrix()
    ytest = n.test_targets

    # Add a trailing axis of length 1 to both feature arrays.
    Xtrain = np.reshape(Xtrain, (Xtrain.shape[0], Xtrain.shape[1], 1))
    Xtest  = np.reshape(Xtest, (Xtest.shape[0],  Xtest.shape[1], 1))

    rnn = RNN([1, 100, 100, 1]), ytrain)
    # Both names hold the output of the same predict() call.
    p = rnn.predict(Xtest)
    p_prob = rnn.predict(Xtest)

    if validate:
        mse = mean_squared_error(ytest, p)
        print("MSE: {}".format(mse))
        loss = log_loss(ytest, p_prob)
        print("Log loss: {}".format(loss))
        # Results are written to results/dl.csv next to this file.
        base_path = dirname(__file__)
        results_df = DataFrame(data={'probability':results})
        joined = DataFrame(t_id).join(results_df)
        joined.to_csv(join(base_path, 'results', 'dl.csv'), index=False)
项目:molearn    作者:jmread    | 项目源码 | 文件源码
def Log_loss(Ytest,Ydist):
    """Return ``log_loss(Ytest, Ydist)`` with ``eps=1e-15`` and ``normalize=True``."""
    options = {'eps': 1e-15, 'normalize': True}
    return log_loss(Ytest, Ydist, **options)
#    N_test,L = Ytest.shape
#    return sum((Ytest == Ypred) * 1.) / N_test / L
项目:nfm    作者:faychu    | 项目源码 | 文件源码
def parse_args():
    """Parse the command-line options of the FM runner and return the namespace."""
    # (flag, add_argument keyword arguments), registered in this order.
    option_table = (
        ('--path', dict(nargs='?', default='./data/',
                        help='Input data path.')),
        ('--dataset', dict(nargs='?', default='frappe',
                           help='Choose a dataset.')),
        ('--epoch', dict(type=int, default=100,
                         help='Number of epochs.')),
        ('--pretrain', dict(type=int, default=-1,
                            help='flag for pretrain. 1: initialize from pretrain; 0: randomly initialize; -1: save the model to pretrain file')),
        ('--batch_size', dict(type=int, default=128,
                              help='Batch size.')),
        ('--hidden_factor', dict(type=int, default=64,
                                 help='Number of hidden factors.')),
        ('--lamda', dict(type=float, default=0,
                         help='Regularizer for bilinear part.')),
        ('--keep_prob', dict(type=float, default=0.5,
                             help='Keep probility (1-dropout_ratio) for the Bi-Interaction layer. 1: no dropout')),
        ('--lr', dict(type=float, default=0.05,
                      help='Learning rate.')),
        ('--loss_type', dict(nargs='?', default='square_loss',
                             help='Specify a loss type (square_loss or log_loss).')),
        ('--optimizer', dict(nargs='?', default='AdagradOptimizer',
                             help='Specify an optimizer type (AdamOptimizer, AdagradOptimizer, GradientDescentOptimizer, MomentumOptimizer).')),
        ('--verbose', dict(type=int, default=1,
                           help='Show the results per X epochs (0, 1 ... any positive integer)')),
        ('--batch_norm', dict(type=int, default=0,
                              help='Whether to perform batch normaization (0 or 1)')),
    )

    cli = argparse.ArgumentParser(description="Run FM.")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
项目:nfm    作者:faychu    | 项目源码 | 文件源码
def evaluate(self, data):  # evaluate the results for an input set
        # Returns the RMSE of the clipped predictions when self.loss_type is
        # 'square_loss', or the log loss when it is 'log_loss'; data['X'] holds
        # the inputs and data['Y'] the targets.
        # NOTE(review): the right-hand side of `predictions =` below is
        # truncated as extracted (the callee is missing); restore it from the
        # upstream project before use.
        num_example = len(data['Y'])
        # Dropout keep-probability is set to 1.0 and the train-phase flag to False.
        feed_dict = {self.train_features: data['X'], self.train_labels: [[y] for y in data['Y']], self.dropout_keep: 1.0, self.train_phase: False}
        predictions =, feed_dict=feed_dict)
        # Flatten predictions and targets to 1-D vectors of length num_example.
        y_pred = np.reshape(predictions, (num_example,))
        y_true = np.reshape(data['Y'], (num_example,))
        if self.loss_type == 'square_loss':    
            # Clip predictions into the [min(y_true), max(y_true)] range before scoring.
            predictions_bounded = np.maximum(y_pred, np.ones(num_example) * min(y_true))  # bound the lower values
            predictions_bounded = np.minimum(predictions_bounded, np.ones(num_example) * max(y_true))  # bound the higher values
            RMSE = math.sqrt(mean_squared_error(y_true, predictions_bounded))
            return RMSE
        elif self.loss_type == 'log_loss':
            logloss = log_loss(y_true, y_pred) # I haven't checked the log_loss
            return logloss
        # Any other loss_type falls through and returns None.
项目:nfm    作者:faychu    | 项目源码 | 文件源码
def parse_args():
    """Parse the command-line options of the Neural FM runner and return the namespace."""
    # (flag, add_argument keyword arguments), registered in this order.
    option_table = (
        ('--path', dict(nargs='?', default='../data/',
                        help='Input data path.')),
        ('--dataset', dict(nargs='?', default='frappe',
                           help='Choose a dataset.')),
        ('--epoch', dict(type=int, default=200,
                         help='Number of epochs.')),
        ('--pretrain', dict(type=int, default=0,
                            help='Pre-train flag. 0: train from scratch; 1: load from pretrain file')),
        ('--batch_size', dict(type=int, default=128,
                              help='Batch size.')),
        ('--hidden_factor', dict(type=int, default=64,
                                 help='Number of hidden factors.')),
        ('--layers', dict(nargs='?', default='[64]',
                          help="Size of each layer.")),
        ('--keep_prob', dict(nargs='?', default='[0.8,0.5]',
                             help='Keep probability (i.e., 1-dropout_ratio) for each deep layer and the Bi-Interaction layer. 1: no dropout. Note that the last index is for the Bi-Interaction layer.')),
        ('--lamda', dict(type=float, default=0,
                         help='Regularizer for bilinear part.')),
        ('--lr', dict(type=float, default=0.05,
                      help='Learning rate.')),
        ('--loss_type', dict(nargs='?', default='square_loss',
                             help='Specify a loss type (square_loss or log_loss).')),
        ('--optimizer', dict(nargs='?', default='AdagradOptimizer',
                             help='Specify an optimizer type (AdamOptimizer, AdagradOptimizer, GradientDescentOptimizer, MomentumOptimizer).')),
        ('--verbose', dict(type=int, default=1,
                           help='Show the results per X epochs (0, 1 ... any positive integer)')),
        ('--batch_norm', dict(type=int, default=1,
                              help='Whether to perform batch normaization (0 or 1)')),
        ('--activation', dict(nargs='?', default='relu',
                              help='Which activation function to use for deep layers: relu, sigmoid, tanh, identity')),
        ('--early_stop', dict(type=int, default=1,
                              help='Whether to perform early stop (0 or 1)')),
    )

    cli = argparse.ArgumentParser(description="Run Neural FM.")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
项目:nfm    作者:faychu    | 项目源码 | 文件源码
def evaluate(self, data):  # evaluate the results for an input set
        # Returns the RMSE of the clipped predictions when self.loss_type is
        # 'square_loss', or the log loss when it is 'log_loss'; data['X'] holds
        # the inputs and data['Y'] the targets.
        # NOTE(review): the right-hand side of `predictions =` below is
        # truncated as extracted (the callee is missing); restore it from the
        # upstream project before use.
        num_example = len(data['Y'])
        # Dropout keep-probability is set to self.no_dropout and the train-phase flag to False.
        feed_dict = {self.train_features: data['X'], self.train_labels: [[y] for y in data['Y']], self.dropout_keep: self.no_dropout, self.train_phase: False}
        predictions =, feed_dict=feed_dict)
        # Flatten predictions and targets to 1-D vectors of length num_example.
        y_pred = np.reshape(predictions, (num_example,))
        y_true = np.reshape(data['Y'], (num_example,))
        if self.loss_type == 'square_loss':    
            # Clip predictions into the [min(y_true), max(y_true)] range before scoring.
            predictions_bounded = np.maximum(y_pred, np.ones(num_example) * min(y_true))  # bound the lower values
            predictions_bounded = np.minimum(predictions_bounded, np.ones(num_example) * max(y_true))  # bound the higher values
            RMSE = math.sqrt(mean_squared_error(y_true, predictions_bounded))
            return RMSE
        elif self.loss_type == 'log_loss':
            logloss = log_loss(y_true, y_pred) # I haven't checked the log_loss
            return logloss
        # Any other loss_type falls through and returns None.
项目:KAGGLE_CERVICAL_CANCER_2017    作者:ZFTurbo    | 项目源码 | 文件源码
def check_score(subm_file):
    """Return the log loss of a submission file against the stage-1 answers.

    Args:
        subm_file: path to a CSV with columns ``image_name``, ``Type_1``,
            ``Type_2`` and ``Type_3``.

    Returns:
        The value of ``log_loss`` for the submitted probabilities, scored
        against the class index derived from the answers file.
    """
    real_answ = "../modified_data/answers_stage1.csv"
    real = pd.read_csv(real_answ)
    pred = pd.read_csv(subm_file)
    # Class index: 0 by default (covers Type_1), then 1 / 2 for Type_2 / Type_3.
    real['s'] = 0
    real.loc[real['Type_2'] > 0, 's'] = 1
    real.loc[real['Type_3'] > 0, 's'] = 2
    # Join on the image name only; combining `on` with `left_index=True`
    # is rejected by current pandas.
    pred = pd.merge(pred, real[['image_name', 's']], on=['image_name'])
    probabilities = pred[['Type_1', 'Type_2', 'Type_3']].values
    # Explicit labels keep the column/class mapping fixed even when a class
    # does not occur among the answers.
    score = log_loss(pred['s'], probabilities, labels=[0, 1, 2])
    return score
项目:Telstra    作者:minjay    | 项目源码 | 文件源码
def predict(self, clf, X, y, X_test, stage):
    """Cross-validate ``clf`` and return meta-features or averaged test predictions.

    Args:
        clf: estimator exposing ``fit`` and ``predict_proba``.
        X, y: training data and labels, indexable by integer index arrays.
        X_test: test data passed to ``clf.predict_proba``.
        stage: ``'base'`` to build meta-features; any other value to return
            test predictions averaged over the folds.

    Returns:
        For ``stage == 'base'``: an array with ``n_train + n_test`` rows and
        ``self.num_class`` columns. The first ``n_train`` rows hold the
        out-of-fold predictions; the remaining rows hold the predictions of
        ``clf`` refitted on all of ``X``.
        Otherwise: the sum of the per-fold test predictions divided by
        ``self.n_fold``.

    NOTE(review): the source had lost both ``fit`` calls and the ``else``
    branches (there was unreachable code after ``return meta_feat``). They
    are reconstructed here; confirm against the original project.
    """
    n_train = X.shape[0]
    # NOTE(review): this call shape looks like the legacy
    # ``sklearn.cross_validation.KFold`` signature - confirm against the
    # file's imports.
    kf = KFold(n_train, n_folds=self.n_fold, shuffle=True)
    is_base = stage == 'base'
    fold_scores = []
    y_pred_sum = np.zeros((X_test.shape[0], self.num_class))
    if is_base:
        meta_feat = np.zeros((n_train + X_test.shape[0], self.num_class))

    for train, val in kf:
        X_train, X_val, y_train, y_val = X[train], X[val], y[train], y[val]
        # Fit on the training part of the fold and score the held-out part.
        clf.fit(X_train, y_train)
        curr_pred = clf.predict_proba(X_val)
        fold_scores.append(log_loss(y_val, curr_pred))
        if is_base:
            meta_feat[val, :] = curr_pred
        else:
            y_pred_sum = y_pred_sum + clf.predict_proba(X_test)
    print(np.mean(fold_scores), np.std(fold_scores))

    if is_base:
        # Refit on the full training set to fill the test rows.
        clf.fit(X, y)
        meta_feat[n_train:, :] = clf.predict_proba(X_test)
        return meta_feat
    return y_pred_sum / self.n_fold
项目:Kaggle_the_Nature_Conservancy_Fisheries_Monitoring    作者:Sapphirine    | 项目源码 | 文件源码
def print_clf(clf, trainx, testx, trainy, testy):
    """Fit ``clf`` on the training data and print its test-set diagnostics.

    Prints the log loss of ``predict_proba(testx)`` against ``testy``,
    followed by the confusion matrix of ``predict(testx)``.

    Args:
        clf: estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        trainx, trainy: training data and labels.
        testx, testy: evaluation data and labels.
    """
    # NOTE(review): the call target was missing in the source
    # ("model =, trainy)"); reconstructed as ``clf.fit(trainx, trainy)``.
    model = clf.fit(trainx, trainy)
    pred = model.predict(testx)
    # Single pre-formatted strings print identically on Python 2 and 3.
    print("log_loss:  {}".format(log_loss(testy, model.predict_proba(testx))))
    print(confusion_matrix(np.array(testy), pred))
项目:RIDDLE    作者:jisungk    | 项目源码 | 文件源码
def loss_scorer(estimator, x, y):
    """Return the negated log loss of ``estimator`` on ``(x, y)``.

    A lower loss is better, but the score returned here is meant to be
    maximized, so the loss is returned with its sign flipped.
    """
    probabilities = estimator.predict_proba(x)
    cross_entropy = log_loss(y, probabilities)
    assert cross_entropy >= 0
    return -cross_entropy
项目:Kaggler    作者:qqgeogor    | 项目源码 | 文件源码
def logloss(y, p):
    """Bounded log loss error.

    Predictions are clipped to ``[1e-15, 1 - 1e-15]`` before scoring. The
    clipping works on a copy, so the caller's ``p`` is left untouched.

    Args:
        y (numpy.array): target
        p (numpy.array): prediction

    Returns:
        bounded log loss error
    """
    import numpy as np

    bounded = np.clip(p, 1e-15, 1 - 1e-15)
    return log_loss(y, bounded)
项目:LearnGraphDiscovery    作者:eugenium    | 项目源码 | 文件源码
def evalData(z, test_set_y):
    """Print evaluation metrics for predictions and return the k=37 precision total.

    Args:
        z: predictions; indexed row by row and flattened with ``ravel()``.
        test_set_y: ground truth laid out like ``z``.

    Returns:
        The ``ranking_precision_score`` at ``k=37`` summed over all rows.

    NOTE(review): in the source the ``Pk*`` accumulators were never
    initialised, and ``Q``, ``diff`` and ``cross`` were undefined. They are
    reconstructed below from the printed labels ('MSE', 'Cross-entropy') and
    the row-wise loop; confirm against the original project. The precision
    values are sums over rows, as in the visible code - confirm whether they
    should be averaged.
    """
    y_flat = test_set_y.ravel()
    z_flat = z.ravel()
    fpr, tpr, _ = metrics.roc_curve(y_flat, z_flat, pos_label=1)
    auc = metrics.auc(fpr, tpr)

    # Reconstructed quantities (see the NOTE in the docstring).
    diff = z - test_set_y
    cross = metrics.log_loss(y_flat, z_flat)

    cutoffs = (10, 20, 30, 37, 50)
    totals = dict.fromkeys(cutoffs, 0)
    for truth_row, pred_row in zip(test_set_y, z):
        for k in cutoffs:
            # The source computed the k=50 figure with k=30; each cutoff now
            # uses its own k.
            totals[k] += ranking_precision_score(truth_row, pred_row, k=k)

    # Single pre-formatted strings print identically on Python 2 and 3.
    print('\n')
    print('AUC {} MSE {} Cross-entropy: {}'.format(
        auc, np.mean(diff ** 2), cross))
    print('Precision at k=10:  {}  k=20:  {}  k=30:  {}  k=50:  {}  k=37:  {}'.format(
        totals[10], totals[20], totals[30], totals[50], totals[37]))
    return totals[37]
项目:kaggle_airbnb    作者:svegapons    | 项目源码 | 文件源码
def opt_1_obj_func(w, X, y, n_class):
    """
    Function to be minimized in the EN_OPT_1 ensembler.

    Parameters:
    ----------
    w: ndarray size=(n_preds * n_class)
       Candidate solution to the optimization problem (vector of weights).
       NOTE: the weights are normalized in place, so the caller's array is
       modified.
    X: ndarray size=(n_samples, n_preds * n_class)
       Solutions to be combined horizontally concatenated.
    y: ndarray size=(n_samples,)
       Class labels
    n_class: int
       Number of classes in the problem, i.e. = 12

    Returns:
    -------
    The log_loss of the weighted combination of the solutions against y.
    """
    # Constrain the weights of each class to sum to 1.
    # This constraint could be passed to scipy.minimize instead, but applying
    # it here leaves scipy.minimize more freedom (e.g. more solvers are
    # allowed).
    w_range = np.arange(len(w)) % n_class
    for i in range(n_class):
        class_mask = w_range == i
        w[class_mask] = w[class_mask] / np.sum(w[class_mask])

    # Column i of X contributes to class (i % n_class), scaled by w[i].
    sol = np.zeros((X.shape[0], n_class))
    for i, class_idx in enumerate(w_range):
        sol[:, class_idx] += X[:, i] * w[i]

    # The quantity to minimize is the log_loss.
    sc_ll = log_loss(y, sol)
    return sc_ll
项目:qml    作者:quantum13    | 项目源码 | 文件源码
def _features_sel_cv(self, X, Y, splits, model_id, data_id, log, early_stop_cv = None):
        """Score a model over reordered cross-validation folds using log loss.

        Args:
            X: feature table; rows are selected with ``X.loc[indexes]``.
            Y: target table; rows are selected with ``.loc`` and the
                ``QML_RES_COL`` column is used as the target.
            splits: iterable of ``(train_indexes, test_indexes)`` pairs. The
                fixed reordering below reads positions 0-4, so at least five
                pairs are required.
            model_id, data_id: forwarded to ``self.qm.qpredict``.
            log: callable that receives one formatted line per fold.
            early_stop_cv: optional callable invoked with each fold's score.

        Returns:
            ``sum(scores) / len(scores)``.

        NOTE(review): as shown here, ``scores`` is only assigned when
        ``early_stop_cv`` returns a truthy value, so the final division fails
        on an empty list otherwise. The code that appends each fold's score
        (and probably a ``break``) seems to be missing - confirm against the
        original project.
        """

        # Workaround: put the (presumably) worst fold first so that early
        # stopping can trigger as soon as possible.
        splits_new_order_temp = []
        for train_indexes, test_indexes in splits:
            splits_new_order_temp += [[train_indexes, test_indexes]]

        # Hard-coded fold order 2, 1, 3, 0, 4.
        splists_new_order = [splits_new_order_temp[2], splits_new_order_temp[1], splits_new_order_temp[3], splits_new_order_temp[0], splits_new_order_temp[4]]

        scores = []
        i = 0
        for train_indexes, test_indexes in splists_new_order:
            i += 1
            X_train = X.loc[train_indexes]
            Y_train = Y.loc[train_indexes][QML_RES_COL]
            X_test = X.loc[test_indexes]
            Y_test = Y.loc[test_indexes][QML_RES_COL]
            # NOTE(review): this call is truncated in the source - the
            # remaining arguments and the closing parenthesis are missing.
            res = self.qm.qpredict(model_id, data_id, data=(X_train, Y_train, X_test), Y_test=Y_test, force=True,

            score = log_loss(Y_test, res.astype(np.float64), eps=1e-14)
            log('   {} {}'.format(i, score))

            if early_stop_cv is not None:
                if early_stop_cv(score):
                    # On early stop, only this fold's score is kept.
                    scores = [score]
        total_score = sum(scores) / len(scores)

        return total_score
项目:kaggle_bnp-paribas    作者:ArdalanM    | 项目源码 | 文件源码
def eval_func(ytrue, ypredproba):
    """Return the log loss of the predicted probabilities against the true labels."""
    score = metrics.log_loss(ytrue, ypredproba)
    return score
项目:kaggle_bnp-paribas    作者:ArdalanM    | 项目源码 | 文件源码
def xgb_accuracy(ypred, dtrain):
    """Evaluation callback returning ``('logloss', value)``.

    Labels come from ``dtrain.get_label()`` cast to int. Predictions at or
    below 0 are replaced by 1e-5 and predictions at or above 1 by 1 - 1e-5;
    all other values are left as they are.
    """
    labels = dtrain.get_label().astype(int)

    lower_fill = 1e-5
    upper_fill = 1.-1e-5
    probs = np.where(ypred <= 0., lower_fill, ypred)
    probs = np.where(probs >= 1., upper_fill, probs)

    loss = metrics.log_loss(labels, probs)
    return 'logloss', loss
项目:kaggle_bnp-paribas    作者:ArdalanM    | 项目源码 | 文件源码
def eval_func(ytrue, ypredproba):
    """Score predicted probabilities against the true labels with log loss."""
    loss_value = metrics.log_loss(ytrue, ypredproba)
    return loss_value
项目:kaggle_bnp-paribas    作者:ArdalanM    | 项目源码 | 文件源码
def xgb_accuracy(ypred, dtrain):
    """Return the pair ``('logloss', value)`` for the given predictions.

    The true labels are read from ``dtrain.get_label()`` as integers. Any
    prediction <= 0 becomes 1e-5 and any prediction >= 1 becomes 1 - 1e-5
    before the log loss is computed; values in between are untouched.
    """
    y_int = dtrain.get_label().astype(int)

    too_low = ypred <= 0.
    adjusted = np.where(too_low, 1e-5, ypred)
    too_high = adjusted >= 1.
    adjusted = np.where(too_high, 1.-1e-5, adjusted)

    return 'logloss', metrics.log_loss(y_int, adjusted)
项目:kaggle-yelp-restaurant-photo-classification    作者:u1234x1234    | 项目源码 | 文件源码
def logloss(self, label, pred_prob):
    """Return ``metrics.log_loss`` of ``pred_prob`` against ``label``."""
    loss = metrics.log_loss(label, pred_prob)
    return loss
项目:kaggle-yelp-restaurant-photo-classification    作者:u1234x1234    | 项目源码 | 文件源码
def logloss(self, label, pred_prob):
    """Compute the log loss between the labels and the predicted probabilities."""
    value = metrics.log_loss(label, pred_prob)
    return value
项目:audit-log-detection    作者:twosixlabs    | 项目源码 | 文件源码
def on_train_begin(self, model):
    """Reset ``self.validation`` to a dict of empty per-metric lists.

    The ``model`` argument is accepted but not used.
    """
    tracked = ('epoch', 'auc', 'time', 'log_loss', 'roc')
    # Each key gets its own fresh list.
    self.validation = {name: [] for name in tracked}
项目:kaggle-quora-solution-8th    作者:qqgeogor    | 项目源码 | 文件源码
def make_mf_classification(X ,y, clf, X_test, n_folds=5,seed=1024,nb_epoch=50,max_features=0.75,name='xgb',path=''):
    n = X.shape[0]
    Fit metafeature by @clf and get prediction for test. Assumed that @clf -- classifier
    print clf
    feature_index = np.arange(X.shape[1])
    for epoch in range(nb_epoch):
        print "Start epoch:",epoch
        mf_tr = np.zeros((X.shape[0],len(np.unique(y))))
        mf_te = np.zeros((X_test.shape[0],len(np.unique(y))))
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X,y)

        new_index = feature_index[:int(max_features*len(feature_index))]

        for ind_tr, ind_te in skf:
            if ssp.issparse(X):
                X_tr = X[ind_tr].tocsc()[:,new_index]
                X_te = X[ind_te].tocsc()[:,new_index]
                X_tr = X[ind_tr][:,new_index]
                X_te = X[ind_te][:,new_index]

            y_tr = y[ind_tr]
            y_te = y[ind_te]

  , y_tr)
            mf_tr[ind_te] += clf.predict_proba(X_te)
            mf_te += clf.predict_proba(X_test[:,new_index])
            score = log_loss(y_te, mf_tr[ind_te])
            print '\tpred[{}] score:{}'.format(epoch, score)