Python pandas module: crosstab() example source code

We extracted the following code examples from open-source Python projects to illustrate how to use pandas.crosstab().
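
Before the project snippets, a minimal sketch of the basic call on toy data (all values invented for illustration): pd.crosstab counts how often values of two array-likes co-occur.

import pandas as pd

city = pd.Series(['Beijing', 'Beijing', 'Shanghai', 'Shanghai', 'Shanghai'], name='city')
level = pd.Series(['junior', 'senior', 'junior', 'junior', 'senior'], name='level')

# Rows are city values, columns are level values; margins=True would add 'All' totals.
print(pd.crosstab(city, level))
# level     junior  senior
# city
# Beijing        1       1
# Shanghai       2       1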

Project: crawllagou    Author: ScarecrowFu
def get_python_guangzhou():
    frame2 = frame[(frame.kd == 'Python') & (frame.city == u'广州')]
    cframe = [v for k, v in frame2.to_dict(orient='index').items()]
    pattern = r'\d{4}-\d{2}-\d{2}'
    for c in cframe:
        if re.match(pattern, c['published']):
            pass
        else:
            c['published'] = datetime.datetime.utcnow().strftime("%Y-%m-%d")
    df = DataFrame(cframe)
    df['published'] = pd.to_datetime(df['published'])
    mask = (df['published'] > '2016-04-01') & (df['published'] <= '2016-05-02')
    dataframe = df.loc[mask]
    jobframe = pd.crosstab(dataframe.experience, dataframe.salary, margins=True).sort_values(by='All', ascending=False)
    jobframe = jobframe.drop('All', axis=0).drop('All', axis=1)
    pie_chart = pygal.StackedBar()
    pie_chart.title = u'Python salary distribution in Guangzhou'
    pie_chart.x_labels = jobframe.index
    for cit, num in jobframe.iteritems():
        pie_chart.add("%s" % (cit), num)
    pie_chart.render_to_file(os.path.dirname(__file__) + '/chart/guangzhou_salary.svg')
Project: Flavor-Network    Author: lingcheng99
def flavor_profile(df,ingr,comp,ingr_comp):
    sorted_ingredients = df.columns
    underscore_ingredients=[]
    for item in sorted_ingredients:
        underscore_ingredients.append(item.replace(' ','_'))

    print len(underscore_ingredients), len(sorted_ingredients)

    ingr_total = ingr_comp.join(ingr,how='right',on='# ingredient id')
    ingr_total = ingr_total.join(comp,how='right',on='compound id')

    ingr_pivot = pd.crosstab(ingr_total['ingredient name'],ingr_total['compound id'])
    ingr_flavor = ingr_pivot[ingr_pivot.index.isin(underscore_ingredients)]

    df_flavor = df.values.dot(ingr_flavor.values)
    print df.shape, df_flavor.shape

    return df_flavor

#normalize flavor matrix with tfidf method
Project: ModelFlow    Author: yuezPrincetechs
def get_crosstab(self,X,y):
        '''
        Cross-tabulate each feature in feature_names against the target.
        X: DataFrame of features, or a Series holding a single feature.
        y: Series aligned with X's index, holding 0-1 target labels.
        Returns a single crosstab DataFrame when X is a Series; when X is a
        DataFrame or 2-D array, returns a dict mapping each feature name to
        its crosstab DataFrame.
        '''
        if len(X.shape)==1:
            result=pd.crosstab(X,y)
        else:
            result={}
            if self.feature_names is None:
                if isinstance(X,pd.DataFrame):
                    feature_names=list(X.columns)
                else:
                    feature_names=[i for i in range(X.shape[1])]
            else:
                feature_names=self.feature_names
            if isinstance(X,pd.DataFrame):
                for feature in feature_names:
                    result[feature]=pd.crosstab(X[feature],y)
            else:
                for feature in feature_names:
                    result[feature]=pd.crosstab(X[:,feature],y)
        return result
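
A hypothetical call showing the shape this helper returns (toy DataFrame, invented names): each value is a crosstab whose index holds the feature's categories and whose columns are the 0/1 target values.

import pandas as pd

X = pd.DataFrame({'grade': ['A', 'B', 'A', 'B', 'B'],
                  'region': ['n', 'n', 's', 's', 'n']})
y = pd.Series([1, 0, 1, 0, 1], name='y')

# The same per-feature loop as in get_crosstab, written as a dict comprehension.
tabs = {col: pd.crosstab(X[col], y) for col in X.columns}
print(tabs['grade'])
# y      0  1
# grade
# A      0  2
# B      2  1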
Project: Instacart    Author: KazukiOnodera
def make(T):
    log_tr = log[log.order_number_rev>T]

    # dow
    dow  = pd.crosstab(log_tr.user_id, log_tr.order_dow).add_prefix('user_dow_freq_')
    dow_ = pd.crosstab(log_tr.user_id, log_tr.order_dow, normalize='index').add_prefix('user_dow_norm_')

    # timezone
    timezone  = pd.crosstab(log_tr.user_id, log_tr.timezone).add_prefix('user_timezone_freq_')
    timezone_ = pd.crosstab(log_tr.user_id, log_tr.timezone, normalize='index').add_prefix('user_timezone_norm_')

    # dow * timezone
    dow_tz  = pd.crosstab(log_tr.user_id, log_tr.dow_tz).add_prefix('user_dow-tz_freq_')
    dow_tz_ = pd.crosstab(log_tr.user_id, log_tr.dow_tz, normalize='index').add_prefix('user_dow-tz_norm_')

    tab = pd.concat([dow, dow_, timezone, timezone_, dow_tz, dow_tz_], axis=1)

    tab.reset_index().to_pickle('../feature/trainT-{}/f103_user.p'.format(T))
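
The freq/norm pairs above differ only in crosstab's normalize argument (available in pandas >= 0.18.1). A toy sketch with invented orders:

import pandas as pd

log = pd.DataFrame({'user_id': [1, 1, 1, 2, 2],
                    'order_dow': [0, 0, 6, 5, 6]})

# Raw counts per user and day of week, then row-wise shares of the same table.
freq = pd.crosstab(log.user_id, log.order_dow).add_prefix('user_dow_freq_')
norm = pd.crosstab(log.user_id, log.order_dow, normalize='index').add_prefix('user_dow_norm_')
print(pd.concat([freq, norm], axis=1))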
Project: pretrial-release    Author: natethedrummer
def ptr_stats(df):

    df = df[['CASE DISPOSED STATUS','HCJ Booked','MADE Y / N','PRETRIAL STATUS AT DISPOSITION','bail type made simple']] 

    crosstab = pd.crosstab([df['CASE DISPOSED STATUS'],df['HCJ Booked'],df['MADE Y / N'],df['PRETRIAL STATUS AT DISPOSITION']], df['bail type made simple'],  margins=True)

    print(crosstab)

    crosstab.to_csv('ptr_stats.csv')
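
Passing a list of Series as the first argument, as above, stacks them into a row MultiIndex; a small sketch with made-up column names:

import pandas as pd

df = pd.DataFrame({'disposed': ['yes', 'yes', 'no', 'no'],
                   'booked': ['y', 'n', 'y', 'n'],
                   'bail': ['cash', 'cash', 'surety', 'cash']})

# Rows are (disposed, booked) pairs; margins=True adds 'All' totals.
print(pd.crosstab([df['disposed'], df['booked']], df['bail'], margins=True))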
Project: app-skeleton    Author: rragundez
def train_model(split=.25):
    """Tran model based on the iris dataset.

    This will split the iris dataset into train and test set, will
    train a Random Forest CLassifier and fit the trained model to
    the test dataset.
    In addition the confusion matrix and features importance will be
    calculated.

    Args:
        split (float): Fraction of observations in the test dataset.

    Returns:
        RandomForestClassifier: Trained model.
        pandas.DataFrame: Confusion matrix.
        dictionary: Features importance
    """
    iris = load_iris()
    all_data = pd.DataFrame(iris.data, columns=iris.feature_names)
    features = all_data.columns.str.replace('\s+', '_').str.replace('\W+', '')
    all_data['species'] = pd.Categorical.from_codes(iris.target,
                                                    iris.target_names)
    train, test = train_test_split(all_data, test_size=split)
    clf = RandomForestClassifier(n_jobs=1)
    clf.fit(train.drop('species', axis=1), train.species)
    preds = clf.predict(test.drop('species', axis=1))
    conf_matrix = pd.crosstab(test['species'], preds,
                              rownames=['Actual Species'],
                              colnames=['Predicted Species'])
    f_importances = list(zip(train.drop('species', axis=1).columns,
                             clf.feature_importances_))
    return clf, conf_matrix, f_importances, features
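
The rownames/colnames arguments label the two axes of the resulting confusion matrix; a minimal sketch with invented labels:

import pandas as pd

actual = pd.Series(['setosa', 'setosa', 'virginica', 'versicolor'])
predicted = ['setosa', 'virginica', 'virginica', 'versicolor']

# Each cell counts how often an actual label received a given prediction.
print(pd.crosstab(actual, predicted,
                  rownames=['Actual Species'],
                  colnames=['Predicted Species']))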
Project: tflearn    Author: tflearn
def output_confusion_matrix(self, y, y_pred):
        assert y.size == y_pred.size
        print("Actual IDV")
        print(y.value_counts())
        print("Predicted IDV")
        print(y_pred.value_counts())
        print()
        print("Confusion matrix:")
        cmat = pd.crosstab(y_pred, y, rownames=['predictions'], colnames=['actual'])
        print(cmat)
        sys.stdout.flush()
        return cmat

#-----------------------------------------------------------------------------
Project: coquery    Author: gkunter
def plot_facet(self, data, color, **kwargs):
        x = kwargs.get("x")
        y = kwargs.get("y")
        levels_x = kwargs.get("levels_x")
        levels_y = kwargs.get("levels_y")

        #num = []
        #date = []
        #time = data[self._time_column]
        #num = data[self._time_column].apply(self.convert_to_datetime)
        #date = data[self._time_column].apply(self.convert_to_timeseries)
        #if pd.isnull(num).sum() <= pd.isnull(date).sum():
            #data[self._time_column] = num
        #else:
            #data[self._time_column] = date

        #data.dropna(inplace=True)
        #if len(self._groupby) == 2:
            #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
            #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
            #ct = ct[pd.notnull(ct.index)]
        #else:
            #ct = pd.crosstab(
                #data[self._time_column],
                #pd.Series([""] * len(self._table[self._time_column]), name=""))

        ## Line plot:
        #self.vmax = max(self.vmax, ct.values.max())
        #ct.plot(ax=plt.gca(), color=self.get_palette())
Project: coquery    Author: gkunter
def plot_facet(self, data, color, **kwargs):
        x = kwargs.get("x")
        y = kwargs.get("y")
        levels_x = kwargs.get("levels_x")
        levels_y = kwargs.get("levels_y")

        #num = []
        #date = []
        #time = data[self._time_column]
        #num = data[self._time_column].apply(self.convert_to_datetime)
        #date = data[self._time_column].apply(self.convert_to_timeseries)
        #if pd.isnull(num).sum() <= pd.isnull(date).sum():
            #data[self._time_column] = num
        #else:
            #data[self._time_column] = date

        #data.dropna(inplace=True)
        #if len(self._groupby) == 2:
            #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
            #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
            #ct = ct[pd.notnull(ct.index)]
        #else:
            #ct = pd.crosstab(
                #data[self._time_column],
                #pd.Series([""] * len(self._table[self._time_column]), name=""))

        ## Stacked area plot:
        #if len(self._groupby) == 2:
            #self.vmax = max(self.vmax, ct.apply(sum, axis=1).max())
        #ct.plot(ax=plt.gca(), kind="area", stacked=True, color=self.get_palette(), **kwargs)
Project: coquery    Author: gkunter
def plot_facet(self, data, color, **kwargs):
        x = kwargs.get("x")
        y = kwargs.get("y")
        levels_x = kwargs.get("levels_x")
        levels_y = kwargs.get("levels_y")

        #num = []
        #date = []
        #time = data[self._time_column]
        #num = data[self._time_column].apply(self.convert_to_datetime)
        #date = data[self._time_column].apply(self.convert_to_timeseries)
        #if pd.isnull(num).sum() <= pd.isnull(date).sum():
            #data[self._time_column] = num
        #else:
            #data[self._time_column] = date

        #data.dropna(inplace=True)
        #if len(self._groupby) == 2:
            #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
            #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
            #ct = ct[pd.notnull(ct.index)]
        #else:
            #ct = pd.crosstab(
                #data[self._time_column],
                #pd.Series([""] * len(self._table[self._time_column]), name=""))

        ## percentage area plot:
        ## if there is only one grouping variable (the time column),
        ## the cross table produces a Series, not a data frame. It
        ## isn't really very informative to plot it, but we provide
        ## for this special case anyway.
        #if type(ct) == pd.Series:
            #ct = ct.apply(lambda x: 100)
        #else:
            #ct = ct.apply(lambda x: (100 * x) / sum(x), axis=1)
        #ct.plot(kind="area", ax=plt.gca(), stacked=True, color=self.get_palette(), **kwargs)
Project: ModelFlow    Author: yuezPrincetechs
def cal_prob(crosstab):
        '''
        Smoothed positive rate per category: for each category c the value is
        (N(x=c,y=1)+p)/(N(x=c)+1), where p is the overall positive rate.
        crosstab: DataFrame whose index holds the feature categories and whose
        columns are the 0/1 values of y.
        Returns a dict mapping each category to its smoothed positive rate.
        '''
        total=crosstab.sum(axis=0)
        p=total.loc[1]/total.sum()
        N=crosstab.sum(axis=1)+1
        N1=crosstab[1]+p
        N.name=''
        N.index.name=''
        N1.name=''
        N1.index.name=''
        return dict(N1/N)
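
A worked toy check of the formula (counts invented): with 3 positives out of 8 rows, p = 3/8 = 0.375, so a category seen 4 times with 2 positives gets (2 + 0.375) / (4 + 1) = 0.475.

import pandas as pd

x = pd.Series(['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'])
y = pd.Series([1, 1, 0, 0, 1, 0, 0, 0])

ct = pd.crosstab(x, y)                       # columns 0 and 1 hold the class counts
p = ct.sum(axis=0).loc[1] / ct.values.sum()  # overall positive rate: 3/8
print(dict((ct[1] + p) / (ct.sum(axis=1) + 1)))
# {'a': 0.475, 'b': 0.275}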
Project: ModelFlow    Author: yuezPrincetechs
def cal_woe(crosstab):
        '''
        WOE per category: for each category c the value is
        log(r(x=c,y=1)/r(x=c,y=0)), where r(x=c,y=1)=N(x=c,y=1)/N(y=1) is the
        positive rate and r(x=c,y=0)=N(x=c,y=0)/N(y=0) is the negative rate.
        crosstab: DataFrame whose index holds the feature categories and whose
        columns are the 0/1 values of y.
        Returns a dict mapping each category to its WOE value.
        '''
        tmp=crosstab.copy()
        # Replace zero counts with 1 so the rates and the log stay well defined
        tmp[tmp==0]=1
        r=tmp/tmp.sum(axis=0)
        result=np.log(r[1]/r[0])
        return dict(result)
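
Continuing the same toy counts: with N(y=1)=3 and N(y=0)=5, category 'a' gets WOE = log((2/3)/(2/5)), about 0.51. The zero-count guard is skipped here because no cell is empty.

import numpy as np
import pandas as pd

x = pd.Series(['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'])
y = pd.Series([1, 1, 0, 0, 1, 0, 0, 0])

ct = pd.crosstab(x, y)
r = ct / ct.sum(axis=0)          # each class column divided by its total
print(dict(np.log(r[1] / r[0])))
# {'a': 0.5108..., 'b': -0.5878...}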
Project: ModelFlow    Author: yuezPrincetechs
def cal_ks(y,y_prob,pos_label=1,return_split=False,decimals=0):
    '''
    Compute the KS statistic, or the score split point at which it is reached.
    y: array-like or Series of true labels, in {0,1} or {-1,1}.
    y_prob: DataFrame of predicted probabilities whose second column holds the
            positive-class probability, or a Series of positive-class scores.
    pos_label: int, the label treated as positive.
    return_split: whether to return the split point instead of the KS value.
    decimals: number of decimals the scores are rounded to when locating the split.
    The KS value is computed with scipy's ks_2samp rather than via sklearn.
    '''
    y=pd.Series(pd.Series(y).values)
    if len(y_prob.shape)==1:
        y_pred=pd.Series(pd.Series(y_prob).values)
    else:
        y_pred=pd.Series(pd.DataFrame(y_prob).iloc[:,1].values)
    Bad=y_pred[y==pos_label]
    Good=y_pred[y!=pos_label]
    ks, pvalue = stats.ks_2samp(Bad.values, Good.values)
    if not return_split:
        return ks
    crossfreq=pd.crosstab(y_pred.round(decimals),y)
    crossdens = crossfreq.cumsum(axis=0) / crossfreq.sum()
    crossdens['gap'] = abs(crossdens[0] - crossdens[1])
    score_split = crossdens[crossdens['gap'] == crossdens['gap'].max()].index[0]
    return score_split
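
A sketch of the split-point logic in isolation (invented scores): round the scores, cross-tabulate against the labels, take the cumulative density of each class, and pick the score where the gap between the two curves peaks.

import pandas as pd

y = pd.Series([1, 1, 0, 0, 0, 1, 0, 0])
score = pd.Series([0.9, 0.8, 0.3, 0.2, 0.4, 0.7, 0.1, 0.6])

crossfreq = pd.crosstab(score.round(1), y)
crossdens = crossfreq.cumsum(axis=0) / crossfreq.sum()
gap = (crossdens[0] - crossdens[1]).abs()
print(gap.max(), gap.idxmax())   # KS gap on rounded scores, and the split score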
Project: crawllagou    Author: ScarecrowFu
def get_city_experience():
    city_experience = pd.crosstab(frame.city, frame.experience, margins=True).sort_values(by='All', ascending=False)[:11]
    city_experience = city_experience.drop('All', axis=0).drop('All', axis=1)
    ce_chart = pygal.Bar()
    ce_chart.title = u'Experience requirements by city'
    ce_chart.x_labels = city_experience.index
    for i in range(len(list(city_experience.T.index))):
        ce_chart.add(city_experience.T.index[i], city_experience.T.values[i])
    ce_chart.render_to_file(os.path.dirname(__file__) + '/chart/city_experience.svg')
Project: crawllagou    Author: ScarecrowFu
def get_city_phase():
    city_phase = pd.crosstab(frame.city, frame.phase, margins=True).sort_values(by='All', ascending=False)[:11]
    city_phase = city_phase.drop('All', axis=0).drop('All', axis=1)
    funnel_chart = pygal.StackedBar()
    funnel_chart.title = u'Company funding phase by city'
    funnel_chart.x_labels = city_phase.index
    for i in range(len(list(city_phase.T.index))):
        funnel_chart.add(city_phase.T.index[i], city_phase.T.values[i])
    funnel_chart.render_to_file(os.path.dirname(__file__)+'/chart/phase.svg')
Project: crawllagou    Author: ScarecrowFu
def get_city_education():
    city_education = pd.crosstab(frame.city,frame.education,margins=True).sort_values(by='All',ascending=False)[:11]
    city_education = city_education.drop('All',axis=0).drop('All',axis=1)
    ce_chart = pygal.Bar()
    ce_chart.title = u'Education requirements by city'
    ce_chart.x_labels = city_education.index
    for i in range(len(list(city_education.T.index))):
        ce_chart.add(city_education.T.index[i], city_education.T.values[i])
    ce_chart.render_to_file(os.path.dirname(__file__) + '/chart/city_edu.svg')
Project: Instacart    Author: KazukiOnodera
def multi(uid):
    tmp = log[log.user_id==uid]
    ct = pd.crosstab(tmp.order_number, tmp.product_id).reset_index().set_index('order_number')
    li = []
    for pid in ct.columns:
        streak = 0
        sw_odr = False
        for onb,odr in enumerate(ct[pid].values):
            onb+=1
            if sw_odr == False and odr == 1:
                sw_odr = True
                streak = 1
                li.append([uid, pid, onb, streak])
                continue
            if sw_odr == True:
                if odr == 1 and streak>0:
                    streak += 1
                    li.append([uid, pid, onb, streak])
                elif odr == 1 and streak<=0:
                    streak = 1
                    li.append([uid, pid, onb, streak])
                elif odr == 0 and streak>0:
                    streak = 0
                    li.append([uid, pid, onb, streak])
                elif odr == 0 and streak<=0:
                    streak -= 1
                    li.append([uid, pid, onb, streak])
    return pd.DataFrame(li, columns=['user_id', 'product_id', 'order_number', 'streak'])
Project: Human-Activity-Recognition    Author: servomac
def confusion_matrix(Y_true, Y_pred):
    Y_true = pd.Series([ACTIVITIES[y] for y in np.argmax(Y_true, axis=1)])
    Y_pred = pd.Series([ACTIVITIES[y] for y in np.argmax(Y_pred, axis=1)])

    return pd.crosstab(Y_true, Y_pred, rownames=['True'], colnames=['Pred'])
Project: xam    Author: MaxHalford
def feature_importance_classification(features, target, n_neighbors=3, random_state=None):

    cont = features.select_dtypes(include=[np.floating])
    disc = features.select_dtypes(include=[np.integer, np.bool])

    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:

        # F-test
        f_test = feature_selection.f_classif(cont, target)
        cont_imp['f_statistic'] = f_test[0]
        cont_imp['f_p_value'] = f_test[1]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(cont, target, discrete_features=False,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:

        # Chi²-test
        chi2_tests = defaultdict(dict)

        for feature in disc.columns:
            cont = pd.crosstab(disc[feature], target)
            statistic, p_value, _, _ = stats.chi2_contingency(cont)
            chi2_tests[feature]['chi2_statistic'] = statistic
            chi2_tests[feature]['chi2_p_value'] = p_value

        chi2_tests_df = pd.DataFrame.from_dict(chi2_tests, orient='index')
        disc_imp['chi2_statistic'] = chi2_tests_df['chi2_statistic']
        disc_imp['chi2_p_value'] = chi2_tests_df['chi2_p_value']

        # Cramér's V (corrected)
        disc_imp['cramers_v'] = [
            cramers_v_corrected_stat(pd.crosstab(feature, target).values)
            for _, feature in disc.iteritems()
        ]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(disc, target, discrete_features=True,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        disc_imp['mutual_information'] = mut_inf

    return cont_imp, disc_imp
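
The chi-squared loop above hands each feature/target contingency table to scipy; the same call in isolation (toy data):

import pandas as pd
from scipy import stats

feature = pd.Series([0, 0, 1, 1, 1, 0, 1, 0])
target = pd.Series(['a', 'b', 'a', 'a', 'a', 'b', 'b', 'a'])

# chi2_contingency accepts the crosstab directly and also returns the
# degrees of freedom and the expected frequencies.
statistic, p_value, dof, expected = stats.chi2_contingency(pd.crosstab(feature, target))
print(statistic, p_value)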
Project: CKME136    Author: asterix135
def run_knn(trainx, trainy, testx, testy):
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(trainx, trainy)
    pred_y = knn.predict(testx)
    print(pd.crosstab(testy, pred_y, rownames=['Actual'],
                      colnames=['Predicted']))
    print('\nAccuracy: ' + str(accuracy_score(testy, pred_y)))
Project: tflearn_wide_and_deep    Author: ichuang
def output_confusion_matrix(self, y, y_pred):
        assert y.size == y_pred.size
        print("Actual IDV")
        print(y.value_counts())
        print("Predicted IDV")
        print(y_pred.value_counts())
        print()
        print("Confusion matrix:")
        cmat = pd.crosstab(y_pred, y, rownames=['predictions'], colnames=['actual'])
        print(cmat)
        sys.stdout.flush()
        return cmat

#-----------------------------------------------------------------------------
Project: easyML    Author: aarshayj
def calc_model_characteristics(self, performCV=True):
        # Determine key metrics to analyze the classification model. These
        # are stored in the classification_output series object belonging to
        # this class.
        for metric in [self.scoring_metric]+self.additional_display_metrics:
            #Determine for both test and train, except predict:
            for key,data in self.dp.items():
                if key!='predict':  
                    name = '%s_%s'%(metric,key)
                    #Case where probabilities to be passed as arguments
                    if base_classification.metrics_map[metric][2]:
                        self.classification_output[name] = \
                            base_classification.metrics_map[metric][0](
                                data[self.datablock.target],
                                self.predictions_probabilities[key])
                    #case where class predictions to be passed  as arguments
                    else:                                                   
                        self.classification_output[name] = \
                            base_classification.metrics_map[metric][0](
                                data[self.datablock.target],
                                self.predictions_class[key])

                #Determine confusion matrix:
                name = 'ConfusionMatrix_%s'%key
                self.classification_output[name] = pd.crosstab(
                        data[self.datablock.target], 
                        self.predictions_class[key]
                    ).to_string()

        if performCV:
            cv_score = self.KFold_CrossValidation(
                        scoring_metric=self.scoring_metric)
        else:
            cv_score = {
                'mean_error': 0.0, 
                'std_error': 0.0
            }

        self.classification_output['CVMethod'] = \
                                        'KFold - ' + str(self.cv_folds)
        self.classification_output['CVScore_mean'] = cv_score['mean_error']
        self.classification_output['CVScore_std'] = cv_score['std_error']
        self.classification_output['Predictors'] = str(self.predictors)
Project: easyML    Author: aarshayj
def printReport(self, printConfusionMatrix, printModelParameters):
        # Print the metric determined in the previous function.

        print("\nModel Report")
        #Output the parameters used for modeling
        if printModelParameters:
            print('\nModel being built with the following parameters:')
            print(self.alg.get_params())

        if printConfusionMatrix:
            for key,data in self.dp.items():
                if key!='predict':
                    print("\nConfusion Matrix for %s data:"%key)
                    print(pd.crosstab(
                            data[self.datablock.target], 
                            self.predictions_class[key])
                    )
            print('Note: rows - actual; col - predicted')

        print("\nScoring Metric:")
        for key,data in self.dp.items():
            if key!='predict':
                name = '%s_%s'%(self.scoring_metric,key)
                print("\t%s (%s): %s" % 
                    (
                    self.scoring_metric,
                    key,
                    "{0:.3%}".format(self.classification_output[name])
                    )
                )

        print("\nCV Score for Scoring Metric (%s):"%self.scoring_metric)
        print("\tMean - %f | Std - %f" % (
            self.classification_output['CVScore_mean'],
            self.classification_output['CVScore_std'])
        )

        if self.additional_display_metrics:
            print("\nAdditional Scoring Metrics:")
            for metric in self.additional_display_metrics:
                for key,data in self.dp.items():
                    if key!='predict':
                        name = '%s_%s'%(metric,key)
                        print("\t%s (%s): %s" % (
                            metric,
                            key,
                            "{0:.3%}".format(
                                    self.classification_output[name])
                            )
                        )
Project: auto_ml    Author: doordash
def advanced_scoring_classifiers(probas, actuals, name=None):
    # pandas Series don't play nice here. Make sure our actuals list is indeed a list
    actuals = list(actuals)
    predictions = list(probas)

    print('Here is our brier-score-loss, which is the default value we optimized for while training, and is the value returned from .score() unless you requested a custom scoring metric')
    print('It is a measure of how close the PROBABILITY predictions are.')
    if name != None:
        print(name)

    # Sometimes we will be given "flattened" probabilities (only the probability of our positive label), while other times we might be given "nested" probabilities (probabilities of both positive and negative, in a list, for each item).
    try:
        probas = [proba[1] for proba in probas]
    except:
        pass

    print(format(brier_score_loss(actuals, probas), '.4f'))


    print('\nHere is the trained estimator\'s overall accuracy (when it predicts a label, how frequently is that the correct label?)')
    predicted_labels = []
    for pred in probas:
        if pred >= 0.5:
            predicted_labels.append(1)
        else:
            predicted_labels.append(0)
    print(format(accuracy_score(y_true=actuals, y_pred=predicted_labels) * 100, '.1f') + '%')


    print('\nHere is a confusion matrix showing predictions and actuals by label')
    #it would make sense to use sklearn's confusion_matrix here but it apparently has no labels
    #took this idea instead from: http://stats.stackexchange.com/a/109015
    conf = pd.crosstab(pd.Series(actuals), pd.Series(predicted_labels), rownames=['v Actual v'], colnames=['Predicted >'], margins=True)
    print(conf)


    print('Here is the accuracy of our trained estimator at each level of predicted probabilities')

    # create summary dict
    summary_dict = OrderedDict()
    for num in range(0, 110, 10):
        summary_dict[num] = []

    for idx, proba in enumerate(probas):
        proba = math.floor(int(proba * 100) / 10) * 10
        summary_dict[proba].append(actuals[idx])

    for k, v in summary_dict.items():
        if len(v) > 0:
            print('Predicted probability: ' + str(k) + '%')
            actual = sum(v) * 1.0 / len(v)

            # Format into a prettier number
            actual = round(actual * 100, 0)
            print('Actual: ' + str(actual) + '%')
            print('# preds: ' + str(len(v)) + '\n')

    print('\n\n')
Project: image-classifier    Author: gustavkkk
def test_alex(self):

    class_index = 0
    image_index = 0
    total_count = 0.0
    accept_sum = 0
    actual = []
    predict = []

    for filename in filenames:
        #query-feature
        X=self.read_imagelist(filelist_path + filename + extension)
        test_num=np.shape(X)[0]
        out = self.forward_all(data=X)
        predicts=out[self.outputs[0]]
        predicts=np.reshape(predicts,(test_num,10))
        confusion_array = np.zeros((class_size), dtype = np.int)
        for i in range(test_num):
            actual.append(class_index)
            for j in range(class_size):
                if np.max(predicts[i]) == predicts[i][j]:
                    confusion_array[j] += 1
                    predict.append(j)
            image_index += 1
        #print(confusion_array)
        total_count += test_num
        accept_sum += confusion_array[class_index]
        class_index += 1

    print 'total:%d' % (round(total_count))
    print 'accept:%d' % (accept_sum)
    print 'reject:%d' % (round(total_count) - accept_sum)
    print 'accuracy:%.4f' % (accept_sum / total_count)

    #conf_mat = confusion_matrix(actual,predict)
    #print(conf_mat)
    #actual = np.array(actual)
    #predict = np.array(predict)
    #y_actual = pd.Series(actual, name='Actual')
    #y_predict = pd.Series(predict, name='Predicted')
    #df_confusion = pd.crosstab(y_actual,y_predict, rownames=['Actual'], colnames=['Predicted'], margins=True)
    #print(df_confusion)
    #plot_confusion_matrix(df_confusion)
    return (accept_sum / total_count)

    #process a text file
Project: image-classifier    Author: gustavkkk
def evaluate(self,metric='cosine'):
    #sample-feature
    X=self.read_imagelist(filelist_sample)
    sample_num=np.shape(X)[0]
    out = self.forward_all(data=X)
    feature1=np.float64(out['deepid'])
    feature1=np.reshape(feature1,(sample_num,feature_size))
    #np.savetxt('feature1.txt', feature1, delimiter=',')

    class_index = 0
    image_index = 0
    total_count = 0.0
    accept_sum = 0
    actual = []
    predict = []

    for filename in filenames:
        #query-feature
        X=self.read_imagelist(filelist_path + filename + extension)
        test_num=np.shape(X)[0]
        out = self.forward_all(data=X)
        feature2=np.float64(out['deepid'])
        feature2=np.reshape(feature2,(test_num,feature_size))
        #np.savetxt('feature2.txt', feature2, delimiter=',')
        #mt=pw.pairwise_distances(feature2, feature1, metric=metric)
        mt=pw.cosine_similarity(feature2, feature1)
        confusion_array = np.zeros((sample_num), dtype=np.int)  # per-file hit counts
        for i in range(test_num):
            actual.append(class_index)
            for j in range(sample_num):
                if np.max(mt[i]) == mt[i][j]:
                    confusion_array[j] += 1
                    predict.append(j)
            image_index += 1

        total_count += test_num
        accept_sum += confusion_array[class_index]
        class_index += 1

    print 'total:%d' % (round(total_count))
    print 'accept:%d' % (accept_sum)
    print 'reject:%d' % (round(total_count) - accept_sum)
    print 'accuracy:%.4f' % (accept_sum / total_count)

    #conf_mat = confusion_matrix(actual,predict)
    #print(conf_mat)
    actual = np.array(actual)
    predict = np.array(predict)
    y_actual = pd.Series(actual, name='Actual')
    y_predict = pd.Series(predict, name='Predicted')
    df_confusion = pd.crosstab(y_actual,y_predict, rownames=['Actual'], colnames=['Predicted'], margins=True)
    print(df_confusion)
    plot_confusion_matrix(df_confusion)
    return (accept_sum / total_count)

    #process a text file
Project: image-classifier    Author: gustavkkk
def evaluate2(self,metric='cosine'):
    feature1=np.fromfile('./features/' + model_name +'-features.dat',dtype=np.float64)
    feature1=np.reshape(feature1,(class_size,feature_size))
    #np.savetxt('feature1.txt', feature1, delimiter=',')

    class_index = 0
    image_index = 0
    total_count = 0.0
    accept_sum = 0
    actual = []
    predict = []
    for filename in filenames:
        #query-feature
        X=self.read_imagelist(filelist_path + filename + extension)
        test_num=np.shape(X)[0]
        out = self.forward_all(data=X)
        feature2=np.float64(out['deepid'])
        feature2=np.reshape(feature2,(test_num,feature_size))
        #np.savetxt('feature2.txt', feature2, delimiter=',')
        #mt=pw.pairwise_distances(feature2, feature1, metric=metric)
        mt=pw.cosine_similarity(feature2, feature1)
        confusion_array = np.zeros((class_size), dtype=np.int)  # per-file hit counts
        for i in range(test_num):
            actual.append(class_index)
            for j in range(class_size):
                if np.max(mt[i]) == mt[i][j]:
                    confusion_array[j] += 1
                    predict.append(j)
            image_index += 1

        total_count += test_num
        accept_sum += confusion_array[class_index]
        class_index += 1

    print 'total:%d' % (round(total_count))
    print 'accept:%d' % (accept_sum)
    print 'reject:%d' % (round(total_count) - accept_sum)
    print 'accuracy:%.4f' % (accept_sum / total_count)

    #conf_mat = confusion_matrix(actual,predict)
    #print(conf_mat)
    #actual = np.array(actual)
    #predict = np.array(predict)
    #y_actual = pd.Series(actual, name='Actual')
    #y_predict = pd.Series(predict, name='Predicted')
    #df_confusion = pd.crosstab(y_actual,y_predict, rownames=['Actual'], colnames=['Predicted'], margins=True)
    #print(df_confusion)
    #plot_confusion_matrix(df_confusion)
    return (accept_sum / total_count)

    #process a text file
Project: tableone    Author: tompollard
def _create_significance_table(self,data):
        """
        Create a table containing p values for significance tests. Add features of
        the distributions and the p values to the dataframe.
        """

        # list features of the variable e.g. matched, paired, n_expected
        df=pd.DataFrame(index=self.continuous+self.categorical,
            columns=['continuous','nonnormal','min_observed','pval','ptest'])

        df.index.rename('variable', inplace=True)
        df['continuous'] = np.where(df.index.isin(self.continuous),True,False)
        df['nonnormal'] = np.where(df.index.isin(self.nonnormal),True,False)

        # list values for each variable, grouped by groupby levels
        for v in df.index:

            # compute p value
            is_continuous = df.loc[v]['continuous']
            is_categorical = ~df.loc[v]['continuous']
            is_normal = ~df.loc[v]['nonnormal']

            # if continuous, group data into list of lists
            if is_continuous:
                catlevels = None
                grouped_data = []
                for s in self.groupbylvls:
                    lvl_data = data[data[self.groupby]==s].dropna(subset=[v])[v]
                    grouped_data.append(lvl_data.values)
                min_observed = len(min(grouped_data,key=len))
            # if categorical, create contingency table
            elif is_categorical:
                catlevels = sorted(data[v].astype('category').cat.categories)
                grouped_data = pd.crosstab(data[self.groupby],data[v])
                min_observed = grouped_data.sum(axis=1).min()

            # minimum number of observations across all levels
            df.loc[v,'min_observed'] = min_observed

            # compute pvalues
            df.loc[v,'pval'],df.loc[v,'ptest'] = self._p_test(v, 
                grouped_data,is_continuous,is_categorical,
                is_normal,min_observed,catlevels)

        return df
Project: coquery    Author: gkunter
def draw(self, **kwargs):
        """ Draw time series. """

        def plot_facet(data, color, **kwargs):
            num = []
            date = []
            time = data[self._time_column]
            num = data[self._time_column].apply(self.convert_to_datetime)
            date = data[self._time_column].apply(self.convert_to_timeseries)
            if pd.isnull(num).sum() <= pd.isnull(date).sum():
                data[self._time_column] = num
            else:
                data[self._time_column] = date

            data.dropna(inplace=True)
            if len(self._groupby) == 2:
                ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
                ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
                ct = ct[pd.notnull(ct.index)]
            else:
                ct = pd.crosstab(
                    data[self._time_column],
                    pd.Series([""] * len(self._table[self._time_column]), name=""))

            # percentage area plot:
            if self.percentage:
                # if there is only one grouping variable (the time column), 
                # the cross table produces a Series, not a data frame. It 
                # isn't really very informative to plot it, but we provide 
                # for this special case anyway.
                if type(ct) == pd.Series:
                    ct = ct.apply(lambda x: 100)
                else:
                    ct = ct.apply(lambda x: (100 * x) / sum(x), axis=1)
                ct.plot(kind="area", ax=plt.gca(), stacked=True, color=self.get_palette(), **kwargs)
            else:
                if self.area:
                    # Stacked area plot:
                    if len(self._groupby) == 2:
                        self.vmax = max(self.vmax, ct.apply(sum, axis=1).max())
                    ct.plot(ax=plt.gca(), kind="area", stacked=True, color=self.get_palette(), **kwargs)
                else:
                    # Line plot:
                    self.vmax = max(self.vmax, ct.values.max())
                    ct.plot(ax=plt.gca(), color=self.get_palette())

        self.map_data(plot_facet)

        if self.percentage:
            self.g.set(ylim=(0, 100))
        else:
            self.g.set(ylim=(0, self.vmax))
        self.g.set_axis_labels(self.options["label_x_axis"], self.options["label_y_axis"])

        if len(self._groupby) == 2:
            self.add_legend()
Project: coquery    Author: gkunter
def draw(self):
        """ Draw a heat map. """

        def get_crosstab(data, row_fact,col_fact, row_names, col_names):
            ct = pd.crosstab(data[row_fact], data[col_fact])
            ct = ct.reindex_axis(row_names, axis=0).fillna(0)
            ct = ct.reindex_axis(col_names, axis=1).fillna(0)
            return ct

        def plot(data, color):
            ct = get_crosstab(
                    data,
                    self._groupby[0],
                    self._groupby[1],
                    self._levels[0],
                    self._levels[1])

            sns.heatmap(ct,
                robust=True,
                annot=True,
                cbar=False,
                cmap=cmap,
                fmt="g",
                vmax=vmax,
                #ax=plt.gca(),
                linewidths=1)

        if len(self._groupby) < 2:
            # create a dummy cross tab with one dimension containing empty
            # values:
            data_column = self._table[self._groupby[0]].reset_index(drop=True)
            tab = pd.crosstab(
                pd.Series([""] * len(data_column), name=""),
                data_column)
            plot_facet = lambda data, color: sns.heatmap(
                tab,
                robust=True,
                annot=True,
                cbar=False,
                cmap=cmap,
                fmt="g",
                linewidths=1)
        else:
            plot_facet = plot
            vmax = pd.crosstab(
                [self._table[x] for x in [self._row_factor, self._groupby[0]] if x != None],
                [self._table[x] for x in [self._col_factor, self._groupby[1]] if x != None]).values.max()

        cmap = ListedColormap(self.options["color_palette_values"])
        self.map_data(plot_facet)
Project: ModelFlow    Author: yuezPrincetechs
def plot_ks_cdf(y_true,y_score,pos_label=1,label_map=None,color_map=None,decimals=0,
                xlabel='Score',ylabel='CumSum',fontsize=12,figsize=(18,8),close=True):
    '''
    Purpose: plot the cumulative score distributions of the positive and
    negative classes and report the KS statistic and its split point.
    Parameters:
    y_true: array-like or Series of true labels, in {0,1} or {-1,1}.
    y_score: array-like or Series of predicted scores or probabilities.
    pos_label: int, the label treated as positive.
    label_map: dict mapping labels to legend names, e.g. {0:'Good',1:'Bad'}.
    color_map: dict mapping labels to line colors, e.g. {0:'g',1:'r'}.
    decimals: number of decimals the scores are rounded to.
    xlabel: xlabel of the figure.
    ylabel: ylabel of the figure.
    fontsize: int, font size.
    figsize: tuple, figure size.
    close: whether to close the figure after drawing.
    Returns:
    A dict {'ks': KS value, 'split': KS split point, 'fig': the figure object}.
    '''
    if label_map is None:
        label_map={0:'Good',1:'Bad'}
    if color_map is None:
        color_map={0:'g',1:'r'}
    ks_dict = {}
    y_true=pd.Series(y_true)
    y_score=pd.Series(y_score)
    y_score_dataframe=pd.concat([y_true,y_score],axis=1)
    ks=cal_ks(y_true,y_score_dataframe,pos_label=pos_label,return_split=False,decimals=decimals)
    score_split=cal_ks(y_true,y_score_dataframe,pos_label=pos_label,return_split=True,decimals=decimals)

    crossfreq = pd.crosstab(y_score.round(decimals),y_true)
    crossdens = crossfreq.cumsum(axis=0) / crossfreq.sum()
    color=crossdens.columns.map(lambda xx: color_map.get(xx,None))
    crossdens=crossdens.rename(columns=label_map)
    crossdens.columns.name=''
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    crossdens.plot(kind='line',ax=ax,fontsize=fontsize,color=color)
    ax.set_xlabel(xlabel,fontsize=fontsize)
    ax.set_ylabel(ylabel,fontsize=fontsize)
    ax.set_title('CDF Curve (KS=%.2f, SPLIT=%.*f)'%(ks,decimals,score_split),fontsize=fontsize)
    if close:
        plt.close('all')    
    ks_dict['ks'] = ks
    ks_dict['split'] = score_split
    ks_dict['fig'] = fig
    return ks_dict
Project: AlphaPy    Author: ScottFreeLLC
def create_crosstabs(model):
    r"""Create cross-tabulations for categorical variables.

    Parameters
    ----------
    model : alphapy.Model
        The model object containing the data.

    Returns
    -------
    model : alphapy.Model
        The model object with the updated feature map.

    """

    logger.info("Creating Cross-Tabulations")

    # Extract model data
    X = model.X_train
    y = model.y_train

    # Extract model parameters

    factors = model.specs['factors']
    target_value = model.specs['target_value']

    # Iterate through columns, dispatching and transforming each feature.

    crosstabs = {}
    for fname in X:
        if fname in factors:
            logger.info("Creating crosstabs for feature %s", fname)
            ct = pd.crosstab(X[fname], y).apply(lambda r : r / r.sum(), axis=1)
            crosstabs[fname] = ct

    # Save crosstabs to the feature map

    model.feature_map['crosstabs'] = crosstabs
    return model
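
The apply(..., axis=1) call row-normalizes the counts into conditional rates; in pandas >= 0.18.1 the same table can be requested directly with normalize='index', which reads more clearly:

import pandas as pd

x = pd.Series(['a', 'a', 'b', 'b', 'b'])
y = pd.Series([1, 0, 1, 1, 0])

via_apply = pd.crosstab(x, y).apply(lambda r: r / r.sum(), axis=1)
direct = pd.crosstab(x, y, normalize='index')
print(via_apply.equals(direct))  # True: both tables hold P(y | x) per row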


#
# Function get_factors
#
Project: microbiomeHD    Author: cduvallet
def concordance(series1, series2, method, nreps=1000):
    """
    Measures the concordance between two pandas Series and returns a pvalue
    and measure of concordance.

    Parameters
    ----------
    series1, series2 : pandas Series
        Series with matching indexes.
    method : str
        ['fisher', 'spearman', 'kendalltau', 'empirical', 'cohen']
    nreps : int
        number of repetitions to build the null. Only needed if method is
        'empirical'

    Returns
    -------
    measure : float
        some sort of measure of concordance (e.g. r for the correlation
        methods, n_observed - mean(n_expected) for empirical, etc)
    p : float
        p value of observed concordance between series1 and series2
    """

    if method == 'fisher':
        # Note: this automatically ignores any bugs which were not present
        # in both series.
        mat = pd.crosstab(series1, series2)
        return fisher_exact(mat)

    elif method == 'spearman':
        return spearmanr(series1, series2)

    elif method == 'kendalltau':
        return kendalltau(series1, series2, nan_policy='omit')

    elif method == 'empirical':
        return empirical_pval(series1, series2, nreps)

    elif method == 'cohen':
        tmp = pd.concat((series1, series2), axis=1).dropna()
        return cohen_kappa_score(tmp.iloc[:, 0], tmp.iloc[:, 1]), np.nan

    else:
        raise ValueError('Unknown concordance method.')
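
For the 'fisher' branch the crosstab must be 2x2, since scipy's fisher_exact rejects larger tables; a standalone sketch on toy series:

import pandas as pd
from scipy.stats import fisher_exact

s1 = pd.Series([1, 1, 0, 0, 1, 0], index=list('abcdef'))
s2 = pd.Series([1, 0, 0, 0, 1, 1], index=list('abcdef'))

mat = pd.crosstab(s1, s2)       # 2x2 contingency table: [[2, 1], [1, 2]]
oddsratio, p_value = fisher_exact(mat)
print(oddsratio, p_value)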
Project: ML-Predictions    Author: ltfschoen
def process_clustering(self):
        print("K-Means Clustering in progress...")

        dataset_choice = self.prediction_config.DATASET_LOCATION[self.prediction_config.DATASET_CHOICE]


        if not "affiliation_column" in dataset_choice or not dataset_choice["affiliation_column"]:
            return

        # Explore loaded data
        df = self.prediction_data
        target_column = dataset_choice["target_column"]
        affiliation_column = dataset_choice["affiliation_column"]

        centroids_quantity = self.prediction_config.CENTROIDS_QUANTITY
        # Initialise K-Means Clustering Model using specified quantity of clusters (centroids)
        # for training the model using the whole dataset.
        kmeans_model = KMeans(n_clusters=centroids_quantity, random_state=1)

        df_numeric = df.select_dtypes(include=['int', 'int64', 'float64', 'floating'], exclude=['O'])
        print("Excluding non-numeric columns from K-Means Clustering: ", df.select_dtypes(include=['O']).columns.tolist())

        print("All dtypes: ", dict(df.dtypes))
        print("Any rows null?: ", df.isnull().values.any())
        print("Columns/rows with NaN values: ", df[df.isnull().any(axis=1)])

        # Fit the K-Means Model to the DataFrame to calculate the Euclidean Distance of each row
        # to each cluster (centroid) and return a Numpy array with n_columns. Each column represents a
        # cluster (centroid) and indicates how far each rows is from the nearest cluster (centroid)
        # Important Note: Pass only numeric dataframe columns
        clustered_row_distances = kmeans_model.fit_transform(df_numeric)

        # Explore clusters by cross-tabulating the quantity of rows in each clustered_row_distance column
        # and checking how they correspond to unique row values of the Affiliation column (i.e. 'party')
        labels = kmeans_model.labels_
        # Show how many are grouped into say Cluster 0
        # print(labels.tolist().count(0))
        # Count quantity of unique Clusters
        print("Clusters total count: %r" % (len(labels.tolist())))
        print("Clusters unique count: %r" % (len(set(labels.tolist()))))
        cluster_names = list(map(lambda cluster_name: ("Cluster " + str(cluster_name)) if cluster_name else None, labels))

        print("Cross Tabulation between Clustered Labels and Affiliation i.e. 'party' column: \n%r" % (pd.crosstab(index=labels, columns=df[affiliation_column])))

        if self.prediction_config.PLOT_KMEANS_OUTLIERS == True:
            self.example_plot_outliers(df, affiliation_column, labels, cluster_names, clustered_row_distances)

        # Generate new DataFrame column to be used as Target Column for Prediction Algorithms
        # (i.e. to detect which roll call votes were most likely to cause extremism such
        # that Senators would not vote along their own party lines)
        extremism = (clustered_row_distances ** 3).sum(axis=1)
        df["extremism"] = extremism
        df.sort_values("extremism", inplace=True, ascending=False)
        print("Top 10 observations ranked in order of 'extremism': %r" % (df.head(10)))
        self.prediction_data.df_listings = df
Project: cohorts    Author: hammerlab
def fishers_exact_plot(data, condition1, condition2, ax=None,
                       condition1_value=None,
                       alternative="two-sided", **kwargs):
    """
    Perform a Fisher's exact test to compare two binary columns

    Parameters
    ----------
    data: Pandas dataframe
        Dataframe to retrieve information from

    condition1: str
        First binary column to compare (and used for test sidedness)

    condition2: str
        Second binary column to compare

    ax : Axes, default None
        Axes to plot on

    condition1_value:
        If `condition1` is not a binary column, split on =/!= to condition1_value

    alternative:
        Specify the sidedness of the test: "two-sided", "less"
        or "greater"
    """
    plot = sb.barplot(
        x=condition1,
        y=condition2,
        ax=ax,
        data=data,
        **kwargs
    )

    plot.set_ylabel("Percent %s" % condition2)
    condition1_mask = get_condition_mask(data, condition1, condition1_value)
    count_table = pd.crosstab(data[condition1], data[condition2])
    print(count_table)
    oddsratio, p_value = fisher_exact(count_table, alternative=alternative)
    add_significance_indicator(plot=plot, significant=p_value <= 0.05)
    only_percentage_ticks(plot)

    if alternative != "two-sided":
        raise ValueError("We need to better understand the one-sided Fisher's Exact test")
    sided_str = "two-sided"
    print("Fisher's Exact Test: OR: {}, p-value={} ({})".format(oddsratio, p_value, sided_str))
    return FishersExactResults(oddsratio=oddsratio,
                               p_value=p_value,
                               sided_str=sided_str,
                               with_condition1_series=data[condition1_mask][condition2],
                               without_condition1_series=data[~condition1_mask][condition2],
                               plot=plot)
Project: human_activity    Author: bfetler
def rfFitScore(clf, dftrain, dftrain_y, dftest, dftest_y):
    '''random forest classifier fit and score.
       clf=RandomForestClassifier, dftrain=train data,
       dftrain_y=train data Y, dftest=test data,
       dftest_y=test data Y'''

    clfit = clf.fit(dftrain, dftrain_y['Y'])  # clf.fit(X, y)

    imp = clfit.feature_importances_  # ndarray of 562    
    # clfit.fit_transform( X, y=None )  # returns X_new

    new_y = clfit.predict( dftest )  # returns predicted Y

    test_score = clfit.score( dftest, dftest_y['Y'] )
    print("test score:", test_score)  # clfit.oob_score_  
    if (clf.oob_score):
        print("oob score", clfit.oob_score_)

    # calculate test score by other means
    print("predict True %.3f percent, %d out of %d" % \
      ((100 * sum(dftest_y['Y'] == new_y) / dftest_y.shape[0]), \
       sum(dftest_y['Y'] == new_y), dftest_y.shape[0]))
    print("predict False %.3f percent, %d out of %d" % \
      ((100 * sum(dftest_y['Y'] != new_y) / dftest_y.shape[0]), \
       sum(dftest_y['Y'] != new_y), dftest_y.shape[0]))

#    new_p = clfit.predict_proba( dftest )
#    # probability of each X variable to predict each y class
#    print("test predict probabilities head:\n", new_p[:5])

    # cross table of variable predictions
    ptab = pd.crosstab(dftest_y['Y'], new_y, \
        rownames=['actual'], colnames=['predicted'])
    print("cross table:\n", ptab)

    # accuracy: percent labeled correctly
    # precision: true positives / (true positives + false positives)
    # recall:    true positives / (true positives + false negatives)
    precision, recall, fbeta, support = prfs(dftest_y['Y'], new_y)
    print("precision", precision, "\nrecall", recall, \
        "\nfbeta", fbeta, "\nsupport", support)

    if (clf.oob_score):
        return test_score, imp, clfit.oob_score_
    else:
        return test_score, imp
Project: python_utils    Author: Jayhello
def get_data():
    f_path = "../dataset/logistic_regression/UCLA_dataset.csv"
    df = pd.read_csv(f_path)
    print df.head()

    print df.describe()

    print df.std()

    print pd.crosstab(df['admit'], df['rank'], rownames=['admit'])

    # df.hist()
    # pl.show()

    # dummy_ranks = pd.get_dummies(df['rank'], prefix='rank')
    # print dummy_ranks.head()

    # train_cols = df.columns[1:]
    # lr = sm.Logit(df['admit'], df[train_cols])
    # ret = lr.fit()
    # print ret.summary()

    train, test = train_test_split(df, test_size=0.2)
    train_x, train_y = train[train.columns[1:]], train['admit']
    test_x, test_y = test[test.columns[1:]], test['admit']

    lr = LogisticRegression()
    lr.fit(train_x, train_y)

    y_pred = lr.predict(test_x)
    print accuracy_score(test_y, y_pred)

    rf = RandomForestClassifier(n_jobs=4)
    rf.fit(train_x, train_y)
    Y_pred = rf.predict(test_x)
    cnf_matrix = confusion_matrix(test_y, Y_pred)
    print cnf_matrix

    accuracy_percent = accuracy_score(test_y, Y_pred)
    print "accuracy is: %.2f%%" % (accuracy_percent * 100)
    recall_percent = recall_score(test_y, Y_pred)
    print "recall is: %.2f%%" % (recall_percent * 100)