Python scipy.stats module: percentileofscore() example source code

The following 9 code examples, extracted from open-source Python projects, illustrate how to use scipy.stats.percentileofscore().

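Before the project examples, a minimal sketch of the basic call (toy values, illustrating the kind argument):

from scipy import stats

background = [1, 2, 3, 4, 5]
stats.percentileofscore(background, 3)                 # 60.0, kind="rank" (default)
stats.percentileofscore(background, 3, kind="weak")    # 60.0, percent of values <= 3
stats.percentileofscore(background, 3, kind="strict")  # 40.0, percent of values <  3
stats.percentileofscore(background, 3, kind="mean")    # 50.0, average of weak and strict
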
Project: psp    Author: cmap    | Project source | File source
def percentile_score_single(test_vals, bg_vals):
    """ For each value in test_vals, compute its percentile score compared
    to bg_vals.

    Args:
        test_vals (numpy array)
        bg_vals (numpy array)

    Returns:
        out_score (float)

    """

    # Compute percentile score for each value in test_vals
    percentile_scores = [stats.percentileofscore(bg_vals, test_val, kind="rank") for test_val in test_vals]

    # Take mean of percentile scores
    out_score = np.mean(percentile_scores)

    return out_score
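A hypothetical usage sketch for the function above (the arrays are illustrative, not project data; the imports are the ones the excerpt assumes):

import numpy as np
from scipy import stats

bg_vals = np.random.randn(1000)          # background distribution
test_vals = np.array([0.5, 1.2, -0.3])   # values to score against the background
out = percentile_score_single(test_vals, bg_vals)   # mean percentile rank, 0-100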
Project: metaseek    Author: ahoarfrost    | Project source | File source
def getMapBins(map_counts, num_bins): #e.g. latlon_map[0]
    #find the percentile of a zero count (for the full dataset this is around the 30th percentile)
    bottom = sp.percentileofscore(map_counts.flatten(), 0.0, kind='weak')
    #define num_bins percentile bins: the zero-count bin first, then divide the remaining range above it evenly
    step = (100-bottom)/(num_bins-1)
    percentiles = [0]
    for ix in xrange(0,num_bins-1):
        percentiles.append(bottom+(ix*step))
    #find counts that bound each bin
    countRanges = []
    for bin in percentiles:
        countRanges.append(round(np.percentile(map_counts,bin),0))
    countRanges.append(round(np.amax(map_counts),0))
    #define fill colors for each of the bins
    fillColors = []
    for ix in xrange(0,num_bins):
        #the 255*0.8 is the max opacity
        fillColors.append([66, 91, 161, (255*0.8)*(ix/float(num_bins-1))])
    return percentiles, countRanges, fillColors

# Create the actual data to power our map overlay
Project: pyktrader2    Author: harveywwu    | Project source | File source
def DVO(df, w = [0.5, 0.5, 0, 0], N = 2, s = [0.5, 0.5], M = 252):
    ratio = df.close/(df.high * w[0] + df.low * w[1] + df.open * w[2] + df.close * w[3])
    theta = pd.Series(index = df.index)
    dvo = pd.Series(index = df.index, name='DV%s_%s' % (N, M))
    ss = np.array(list(reversed(s)))
    for idx, d in enumerate(ratio.index):
        if idx >= N-1:
            y = ratio[idx-N+1:idx+1].values
            theta[idx] = np.dot(y, ss)
        if idx >= M+N-2:
            ts = theta[idx-(M-1):idx+1]
            dvo[idx] = stats.percentileofscore(ts.values, theta[idx])
    return dvo
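A hypothetical usage sketch (the OHLC DataFrame below is synthetic; the project feeds its own bar data into DVO):

import numpy as np
import pandas as pd
from scipy import stats

n = 300
close = pd.Series(100 + np.random.randn(n).cumsum())
bars = pd.DataFrame({"open": close.shift(1).fillna(close.iloc[0]),
                     "high": close + 1.0, "low": close - 1.0, "close": close})
# rolling 252-bar percentile of the 2-bar smoothed close/midpoint ratio
dvo = DVO(bars, N=2, M=252)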
Project: metaseek    Author: ahoarfrost    | Project source | File source
def summarizeMap(mapDataFrame):
    latlon  = mapDataFrame[['meta_latitude','meta_longitude']]
    latlon = latlon[pd.notnull(latlon['meta_latitude'])]
    latlon = latlon[pd.notnull(latlon['meta_longitude'])]
    minLat = np.amin(latlon['meta_latitude'])
    maxLat = np.amax(latlon['meta_latitude'])
    minLon = np.amin(latlon['meta_longitude'])
    maxLon = np.amax(latlon['meta_longitude'])
    if len(latlon) > 1:
        latlon_map = np.histogram2d(x=latlon['meta_longitude'],y=latlon['meta_latitude'],bins=[36,18], range=[[minLon, maxLon], [minLat, maxLat]])
    else:
        latlon_map = np.histogram2d(x=[],y=[],bins=[36,18], range=[[-180, 180], [-90, 90]])
    #define latlon map color bin info
    percentiles, countRanges, fillColors = getMapBins(latlon_map[0], num_bins=10)
    # range should be flexible to rules in DatasetSearchSummary
    # latlon_map[0] is the lonxlat (XxY) array of counts; latlon_map[1] is the nx/lon bin starts; map[2] ny/lat bin starts
    lonstepsize = (latlon_map[1][1]-latlon_map[1][0])/2
    latstepsize = (latlon_map[2][1]-latlon_map[2][0])/2
    maxMapCount = np.amax(latlon_map[0])
    map_data = []
    for lon_ix,lonbin in enumerate(latlon_map[0]):
        for lat_ix,latbin in enumerate(lonbin):
            #[latlon_map[2][ix]+latstepsize for ix,latbin in enumerate(latlon_map[0][0])]
            lat = latlon_map[2][lat_ix]+latstepsize
            lon = latlon_map[1][lon_ix]+lonstepsize
            value = latbin
            buffer=0.0001
            #left-bottom, left-top, right-top, right-bottom, left-bottom
            polygon = [[lon-lonstepsize+buffer,lat-latstepsize+buffer], [lon-lonstepsize+buffer,lat+latstepsize-buffer], [lon+lonstepsize-buffer,lat+latstepsize-buffer], [lon+lonstepsize-buffer,lat-latstepsize+buffer], [lon-lonstepsize,lat-latstepsize]]
            bin_ix = np.amax(np.argwhere(np.array(percentiles)<=sp.percentileofscore(latlon_map[0].flatten(), value)))
            fillColor = fillColors[bin_ix]
            map_data.append({"lat":lat,"lon":lon,"count":value,"polygon":polygon, "fillColor":fillColor})
    map_legend_info = {"ranges":countRanges, "fills":fillColors}
    return (map_data,map_legend_info)

# Query Construction Helpers / Data Retrieval
# Based on a rule (field name, comparator and value), add a filter to a query object
# TODO add some better documentation here on what each type is
Project: Waskom_PNAS_2017    Author: WagnerLabPapers    | Project source | File source
def permutation_test(corrmat, tails, mask=None, n=100, seed=None):
    """Permute tail assignments to generate null distribution."""
    rs = np.random.RandomState(seed)
    corrs_real = tail_correlations(corrmat, tails, mask)
    corrs_null = []
    for _ in xrange(n):
        perm_tails = rs.permutation(tails)
        corrs_null.append(tail_correlations(corrmat, perm_tails, mask))
    diff_real = np.subtract(*corrs_real)
    diff_null = np.subtract(*zip(*corrs_null))
    pctile = stats.percentileofscore(diff_null, diff_real)
    return pctile
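The returned percentile is often converted to an empirical p-value; a self-contained sketch of that step (diff_null and diff_real below are synthetic stand-ins for the quantities computed inside permutation_test):

import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
diff_null = rng.randn(1000)   # null distribution of the statistic
diff_real = 2.1               # observed statistic
pctile = stats.percentileofscore(diff_null, diff_real)
p_one_sided = 1 - pctile / 100.0                       # fraction of the null at or above the observation
p_two_sided = 2 * min(pctile, 100 - pctile) / 100.0    # double the smaller tail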
Project: Waskom_PNAS_2017    Author: WagnerLabPapers    | Project source | File source
def percentile_score(null, real):
    """Vectorized function for computing percentile of score."""
    if np.isscalar(real):
        return stats.percentileofscore(null, real)

    percentiles = []
    assert len(null) == len(real)
    for null_i, real_i in zip(null, real):
        percentiles.append(stats.percentileofscore(null_i, real_i, "mean"))
    assert len(percentiles) == len(real)

    return np.array(percentiles)
Project: Enrich2    Author: FowlerLab    | Project source | File source
def calc_regression(self, label):
        """
        Calculate least squares regression for *label*. If *weighted* is ``True``, calculates weighted least squares; else ordinary least squares.

        Regression results are stored in ``'/main/label/scores'``

        """
        if self.check_store("/main/{}/scores".format(label)):
            return
        elif "/main/{}/scores".format(label) in self.store.keys():
            # need to remove the current keys because we are using append
            self.store.remove("/main/{}/scores".format(label))

        logging.info("Calculating {} regression coefficients ({})".format(self.scoring_method, label), extra={'oname' : self.name})
        # append is required because it takes the "min_itemsize" argument, and put doesn't
        longest = self.store.select("/main/{}/log_ratios".format(label), "columns='index'").index.map(len).max()
        chunk = 1
        if self.scoring_method == "WLS":
            for data in self.store.select_as_multiple(["/main/{}/log_ratios".format(label), "/main/{}/weights".format(label)], chunksize=self.chunksize):
                logging.info("Calculating weighted least squares for chunk {} ({} rows)".format(chunk, len(data.index)), extra={'oname' : self.name})
                result = data.apply(regression_apply, args=[self.timepoints, True], axis="columns")
                self.store.append("/main/{}/scores".format(label), result, min_itemsize={"index" : longest})
                chunk += 1
        elif self.scoring_method == "OLS":
            for data in self.store.select("/main/{}/log_ratios".format(label), chunksize=self.chunksize):
                logging.info("Calculating ordinary least squares for chunk {} ({} rows)".format(chunk, len(data.index)), extra={'oname' : self.name})
                result = data.apply(regression_apply, args=[self.timepoints, False], axis="columns")
                self.store.append("/main/{}/scores".format(label), result, min_itemsize={"index" : longest})
                chunk += 1
        else:
            raise ValueError('Invalid regression scoring method "{}" [{}]'.format(self.scoring_method, self.name))

        # need to read from the file, calculate percentiles, and rewrite it
        logging.info("Calculating slope standard error percentiles ({})".format(label), extra={'oname' : self.name})
        data = self.store['/main/{}/scores'.format(label)]
        data['score'] = data['slope']
        data['SE'] = data['SE_slope']
        data['SE_pctile'] = [stats.percentileofscore(data['SE'], x, "weak") for x in data['SE']]
        data = data[['score', 'SE', 'SE_pctile', 'slope', 'intercept', 'SE_slope', 't', 'pvalue_raw']] # reorder columns
        self.store.put("/main/{}/scores".format(label), data, format="table", data_columns=data.columns)
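The SE-percentile step above can be reproduced in isolation; a small sketch with a toy Series (not Enrich2 data), including an equivalent vectorized form built on scipy.stats.rankdata rather than the project's list comprehension:

import numpy as np
import pandas as pd
from scipy import stats

se = pd.Series([0.1, 0.4, 0.4, 0.9, 1.3])    # toy standard errors
# per-element "weak" percentile: percent of values <= each value
pctile_loop = [stats.percentileofscore(se, x, "weak") for x in se]
# equivalent vectorized form: max-rank of each element divided by n
pctile_vec = 100.0 * stats.rankdata(se, method="max") / len(se)
assert np.allclose(pctile_loop, pctile_vec)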
Project: nelpy    Author: nelpy    | Project source | File source
def score_hmm_events(bst, k_folds=None, num_states=30, n_shuffles=5000, shuffle='row-wise', verbose=False):
    """scores all sequences in the entire bst"""
    if k_folds is None:
        k_folds = 5

    if shuffle == 'row-wise':
        rowwise = True
    elif shuffle == 'col-wise':
        rowwise = False
    else:
        raise ValueError("tmat must be either 'row-wise' or 'col-wise'")

    X = [ii for ii in range(bst.n_epochs)]

    scores_hmm = np.zeros(bst.n_epochs)
    scores_hmm_shuffled = np.zeros((bst.n_epochs, n_shuffles))

    for kk, (training, validation) in enumerate(k_fold_cross_validation(X, k=k_folds)):
        if verbose:
            print('  fold {}/{}'.format(kk+1, k_folds))

        PBEs_train = bst[training]
        PBEs_test = bst[validation]

        # train HMM on all training PBEs
        hmm = PoissonHMM(n_components=num_states, random_state=0, verbose=False)
        hmm.fit(PBEs_train)

        # reorder states according to transmat ordering
        transmat_order = hmm.get_state_order('transmat')
        hmm.reorder_states(transmat_order)

        # compute scores_hmm (log likelihoods) of validation set:
        scores_hmm[validation] = hmm.score(PBEs_test)

        hmm_shuffled = copy.deepcopy(hmm)
        for nn in range(n_shuffles):
            # shuffle transition matrix:
            if rowwise:
                hmm_shuffled.transmat_ = shuffle_transmat(hmm_shuffled.transmat)
            else:
                hmm_shuffled.transmat_ = shuffle_transmat_Kourosh_breaks_stochasticity(hmm_shuffled.transmat)
                hmm_shuffled.transmat_ = hmm_shuffled.transmat / np.tile(hmm_shuffled.transmat.sum(axis=1), (hmm_shuffled.n_components, 1)).T

            # score validation set with shuffled HMM
            scores_hmm_shuffled[validation, nn] = hmm_shuffled.score(PBEs_test)

    n_scores = len(scores_hmm)
    scores_hmm_percentile = np.array([stats.percentileofscore(scores_hmm_shuffled[idx], scores_hmm[idx], kind='mean') for idx in range(n_scores)])

    return scores_hmm, scores_hmm_shuffled, scores_hmm_percentile
Project: time_seires_prediction_using_lstm    Author: CasiaFan    | Project source | File source
def qq_plot(self, df_samp, df_clu):
        """
        :param df_samp: interval df of the sample enterprise. The column name should be the enterprise id
        :param df_clu: interval df of the reference cluster. The column name should be the cluster id
        :return: slope, intercept and total fit error of fitted regression line
        """
        # use longer list as reference distribution
        outdir = self.output_dir + "/qq-plot"
        # make output directory if not exists
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        ref = np.asarray(df_clu)
        samp = np.asarray(df_samp)
        ref_id = df_clu.columns
        samp_id = df_samp.columns
        print "Start drawing Q-Q plot using data from sample {} and cluster {}.".format(samp_id, ref_id)
        # theoretical quantiles
        samp_pct_x = np.asarray([percentileofscore(ref, x) for x in samp])
        # sample quantiles
        samp_pct_y = np.asarray([percentileofscore(samp, x) for x in samp])
        # calculate the error between the sample and reference percentiles (half the mean squared error)
        pct_error = np.sum(np.power(samp_pct_y - samp_pct_x, 2)) / (2 * len(samp_pct_x))
        # estimated linear regression model
        p = np.polyfit(samp_pct_x, samp_pct_y, 1)
        regr = LinearRegression()
        model_x = samp_pct_x.reshape(len(samp_pct_x), 1)
        model_y = samp_pct_y.reshape(len(samp_pct_y), 1)
        regr.fit(model_x, model_y)
        r2 = regr.score(model_x, model_y)
        if p[1] > 0:
            p_function = "y= {} x + {}, r-square = {}".format(p[0], p[1], r2)
        elif p[1] < 0:
            p_function = "y= {} x - {}, r-square = {}".format(p[0], -p[1], r2)
        else:
            p_function = "y= {} x, r-square = {}".format(p[0], r2)
        print "The fitted linear regression model in Q-Q plot using data from enterprises {} and cluster {} is {}".format(samp_id, ref_id, p_function)
        # plot q-q plot
        x_ticks = np.arange(0, 100, 20)
        y_ticks = np.arange(0, 100, 20)
        plt.scatter(x=samp_pct_x, y=samp_pct_y, color='blue')
        plt.xlim((0, 100))
        plt.ylim((0, 100))
        # add fit regression line
        plt.plot(samp_pct_x, regr.predict(model_x), color='red', linewidth=2)
        # add 45-degree reference line
        plt.plot([0, 100], [0, 100], linewidth=2)
        plt.text(10, 70, p_function)
        plt.xticks(x_ticks, x_ticks)
        plt.yticks(y_ticks, y_ticks)
        plt.xlabel('cluster quantiles - id: {}'.format(ref_id))
        plt.ylabel('sample quantiles - id: {}'.format(samp_id))
        plt.title('{} VS {} Q-Q plot'.format(ref_id, samp_id))
        outfile = "{}/enterprise-{}-VS-cluster-{}.qqplot.png".format(outdir, samp_id, ref_id)
        plt.savefig(outfile)
        print "Plotting Q-Q plot done! The plot is stored at {}.".format(outfile)
        plt.close()
        return p[0], p[1], pct_error
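A stripped-down sketch of the percentile-vs-percentile Q-Q idea used above (synthetic data, no plotting, not the project's enterprise intervals):

import numpy as np
from scipy.stats import percentileofscore

ref = np.random.exponential(1.0, 500)     # reference (cluster) distribution
samp = np.random.exponential(1.2, 100)    # sample distribution
x = np.array([percentileofscore(ref, v) for v in samp])    # percentile of each sample value in ref
y = np.array([percentileofscore(samp, v) for v in samp])   # percentile of each sample value in samp
slope, intercept = np.polyfit(x, y, 1)    # near the 45-degree line when the distributions match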