Python scipy.stats module: scoreatpercentile() example source code

We extracted the following 19 code examples from open-source Python projects to illustrate how to use scipy.stats.scoreatpercentile().

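Before the project examples, a minimal usage sketch (not taken from any project below): scoreatpercentile(a, per) returns the score at the given percentile(s) of a, where per may be a single value or a sequence. The SciPy documentation recommends numpy.percentile for new code.

import numpy as np
from scipy import stats

a = np.arange(100)
print(stats.scoreatpercentile(a, 50))        # 49.5 -- the median
print(stats.scoreatpercentile(a, [25, 75]))  # [24.75 74.25] -- both quartiles in one call
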
Project: score_card_base_python    Author: zzstrwolf    | project source | file source
def discrete(self, x, bin=5):
        """Bin a series into `bin` equal-frequency buckets and label each
        value with its bucket's '<low>-<high>' percentile range."""
        x_copy = pd.Series.copy(x)
        x_copy = x_copy.astype(str)
        x_gt0 = x[x >= 0]

        for i in range(bin):
            point1 = stats.scoreatpercentile(x_gt0, i * (100.0 / bin))
            point2 = stats.scoreatpercentile(x_gt0, (i + 1) * (100.0 / bin))
            # label every value that falls inside this percentile band
            x1 = x[(x >= point1) & (x <= point2)]
            mask = np.in1d(x, x1)
            x_copy[mask] = '%s-%s' % (point1, point2)
        return x_copy
Project: score_card_base_python    Author: zzstrwolf    | project source | file source
def grade(self, x, bin=5):
        """Bin a series into `bin` equal-frequency buckets and label each
        value with its bucket index (1..bin)."""
        x_copy = np.copy(x)
        x_gt0 = x[x >= 0]

        for i in range(bin):
            point1 = stats.scoreatpercentile(x_gt0, i * (100.0 / bin))
            point2 = stats.scoreatpercentile(x_gt0, (i + 1) * (100.0 / bin))
            # label every value that falls inside this percentile band
            x1 = x[(x >= point1) & (x <= point2)]
            mask = np.in1d(x, x1)
            x_copy[mask] = i + 1
            print(point1, point2)
        return x_copy
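
Both methods above share the same core idea: the bucket edges are equal-frequency percentile points of the non-negative values. A standalone sketch of just that computation (the sample data is illustrative):

import numpy as np
import pandas as pd
from scipy import stats

x = pd.Series(np.random.exponential(size=1000))
x_gt0 = x[x >= 0]
edges = [stats.scoreatpercentile(x_gt0, p) for p in np.linspace(0, 100, 6)]
# five equal-frequency buckets; discrete() labels values with the
# '<low>-<high>' strings, grade() with the bucket index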
Project: lexdecomp    Author: mcrisc    | project source | file source
def print_stats(data):
    data = np.array(data)
    desc = stats.describe(data)
    print('# of observations:', desc.nobs)
    print('min: %d\nmax: %d' % desc.minmax)
    print('mean: %.1f' % desc.mean)
    # print('variance: %.1f' % desc.variance)
    print('stdev: %.1f' % math.sqrt(desc.variance))

    print('percentiles')
    for p in PERCENTILES:
        print('%6.2f' % p, '  ', end='')
    print()
    for p in stats.scoreatpercentile(data, PERCENTILES):
        print('%6d' % p, '  ', end='')
    print()
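
The snippet assumes module-level imports and a PERCENTILES constant from the surrounding file; a plausible standalone setup (the constant's exact values are an assumption, the original project may differ):

import math
import numpy as np
from scipy import stats

PERCENTILES = (1.0, 5.0, 25.0, 50.0, 75.0, 95.0, 99.0)  # assumed values

print_stats(np.random.randint(0, 1000, size=500))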
Project: Parallel-SGD    Author: angadgill    | project source | file source
def __call__(self, y, pred, sample_weight=None):
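        """Huber loss: quadratic for residuals at or below ``gamma``,
        linear above; ``gamma`` defaults to the ``alpha``-quantile of the
        absolute residuals."""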
        pred = pred.ravel()
        diff = y - pred
        gamma = self.gamma
        if gamma is None:
            if sample_weight is None:
                gamma = stats.scoreatpercentile(np.abs(diff), self.alpha * 100)
            else:
                gamma = _weighted_percentile(np.abs(diff), sample_weight, self.alpha * 100)

        gamma_mask = np.abs(diff) <= gamma
        if sample_weight is None:
            sq_loss = np.sum(0.5 * diff[gamma_mask] ** 2.0)
            lin_loss = np.sum(gamma * (np.abs(diff[~gamma_mask]) - gamma / 2.0))
            loss = (sq_loss + lin_loss) / y.shape[0]
        else:
            sq_loss = np.sum(0.5 * sample_weight[gamma_mask] * diff[gamma_mask] ** 2.0)
            lin_loss = np.sum(gamma * sample_weight[~gamma_mask] *
                              (np.abs(diff[~gamma_mask]) - gamma / 2.0))
            loss = (sq_loss + lin_loss) / sample_weight.sum()
        return loss
Project: Parallel-SGD    Author: angadgill    | project source | file source
def _get_support_mask(self):
        check_is_fitted(self, 'scores_')

        # Cater for NaNs
        if self.percentile == 100:
            return np.ones(len(self.scores_), dtype=bool)
        elif self.percentile == 0:
            return np.zeros(len(self.scores_), dtype=bool)

        scores = _clean_nans(self.scores_)
        threshold = stats.scoreatpercentile(scores,
                                            100 - self.percentile)
        mask = scores > threshold
        ties = np.where(scores == threshold)[0]
        if len(ties):
            max_feats = int(len(scores) * self.percentile / 100)
            kept_ties = ties[:max_feats - mask.sum()]
            mask[kept_ties] = True
        return mask
Project: plotnine    Author: has2k1    | project source | file source
def iqr(a):
    """
    Calculate the IQR for an array of numbers.
    """
    a = np.asarray(a)
    q1 = stats.scoreatpercentile(a, 25)
    q3 = stats.scoreatpercentile(a, 75)
    return q3 - q1
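
For comparison, SciPy ships an equivalent helper: scipy.stats.iqr computes the same quantity directly (a quick check, not part of the plotnine source):

import numpy as np
from scipy import stats

a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
print(iqr(a))        # 4.0
print(stats.iqr(a))  # 4.0 -- the built-in equivalent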
Project: lap    Author: gatagat    | project source | file source
def get_sparse_int(sz, rng, sparsity, hard=True, seed=1299821):
    np.random.seed(seed)
    cost = np.random.randint(1, rng+1, size=(sz, sz))
    if hard is True:
        cost = make_hard(cost, 0, rng)
    mask = np.random.rand(sz, sz)
    thresh = scoreatpercentile(
            mask.flat, max(0, (sparsity - sz/float(sz*sz)) * 100.))
    mask = mask < thresh
    # Make sure there exists a solution.
    row = np.random.permutation(sz)
    col = np.random.permutation(sz)
    mask[row, col] = True
    return cost, mask
Project: Climate_analysis    Author: MitchellBlack    | project source | file source
def calc_quantiles(vals):
    quantiles = []
    for i in range(1, 100):
        quantiles.append(stats.scoreatpercentile(vals.flatten(), i))
    return quantiles
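
Since scoreatpercentile accepts a sequence of percentiles, the loop can be collapsed into a single call; the one-liner below (not from the Climate_analysis source) returns an ndarray rather than a list:

import numpy as np
from scipy import stats

vals = np.random.rand(10, 10)  # illustrative input
quantiles = stats.scoreatpercentile(vals.flatten(), range(1, 100))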
Project: gullikson-scripts    Author: kgullikson88    | project source | file source
def Denoise(data):
        """
        This function implements the denoising given in the url below:
        http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4607982&tag=1

        with title "Astronomical Spectra Denoising Based on Simplifed SURE-LET Wavelet Thresholding"

        The data should be a kglib.utils.DataStructures.xypoint instance.
        """
        y, boolarr = mlpy.wavelet.pad(data.y)
        WC = mlpy.wavelet.dwt(y, 'd', 10, 0)
        # Figure out the unknown parameter 'a'
        sum1 = 0.0
        sum2 = 0.0
        numlevels = int(np.log2(WC.size))
        start = 2 ** (numlevels - 1)
        median = np.median(WC[start:])
        sigma = np.median(np.abs(WC[start:] - median)) / 0.6745
        for w in WC:
            phi = w * np.exp(-w ** 2 / (12.0 * sigma ** 2))
            dphi = np.exp(-w ** 2 / (12.0 * sigma ** 2)) * (1 - 2 * w ** 2 / (12 * sigma ** 2) )
            sum1 += sigma ** 2 * dphi
            sum2 += phi ** 2
        a = -sum1 / sum2

        # Adjust all wavelet coefficients
        WC = WC + a * WC * np.exp(-WC ** 2 / (12 * sigma ** 2))

        # Now, do a soft threshold
        threshold = scoreatpercentile(WC, 80.0)
        WC[np.abs(WC) <= threshold] = 0.0
        WC[np.abs(WC) > threshold] -= threshold * np.sign(WC[np.abs(WC) > threshold])

        #Transform back
        y2 = mlpy.wavelet.idwt(WC, 'd', 10)
        data.y = y2[boolarr]
        return data


    # Kept for legacy support, since Denoise3 was used by several older scripts.
Project: gamtools    Author: pombo-lab    | project source | file source
def filter_data(x, percentile, no_zeros=True):
    """Remove data from an array which is below a certain
    percentile value. Optionally, if no_zeros is specified,
    also remove any zeros from the array.

    If removing values would result in returning an empty array,
    do nothing.

    :param x: Output values are taken from this array
    :type x: :class:`~numpy.ndarray`
    :param float percentile: Percentile above which to remove values \
            (e.g. if percentile=95.0, the top 5% of values \
            are discarded).
    :param bool no_zeros: If True, also discard any values equal \
            to zero from the output array.
    :returns: New array containing values from x that pass the filter.
    """

    percentile_score = scoreatpercentile(x, percentile)
    less_than_percentile = list(x < percentile_score)

    if no_zeros:
        not_a_zero = x > 0

        # only keep points which are both less than percentile AND not a zero
        points_to_keep = list(map(all, list(zip(less_than_percentile, not_a_zero))))

    else:
        points_to_keep = less_than_percentile

    out_data = x[points_to_keep]

    if out_data.size:

        return out_data

    if no_zeros:

        return x[not_a_zero]

    return x
Project: Parallel-SGD    Author: angadgill    | project source | file source
def atomic_benchmark_estimator(estimator, X_test, verbose=False):
    """Measure runtime prediction of each instance."""
    n_instances = X_test.shape[0]
    runtimes = np.zeros(n_instances, dtype=float)
    for i in range(n_instances):
        instance = X_test[[i], :]
        start = time.time()
        estimator.predict(instance)
        runtimes[i] = time.time() - start
    if verbose:
        print("atomic_benchmark runtimes:", min(runtimes), scoreatpercentile(
            runtimes, 50), max(runtimes))
    return runtimes
Project: Parallel-SGD    Author: angadgill    | project source | file source
def bulk_benchmark_estimator(estimator, X_test, n_bulk_repeats, verbose):
    """Measure runtime prediction of the whole input."""
    n_instances = X_test.shape[0]
    runtimes = np.zeros(n_bulk_repeats, dtype=float)
    for i in range(n_bulk_repeats):
        start = time.time()
        estimator.predict(X_test)
        runtimes[i] = time.time() - start
    runtimes = np.array(list(map(lambda x: x / float(n_instances), runtimes)))
    if verbose:
        print("bulk_benchmark runtimes:", min(runtimes), scoreatpercentile(
            runtimes, 50), max(runtimes))
    return runtimes
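
In both benchmark helpers, scoreatpercentile(runtimes, 50) is simply the median, so np.median(runtimes) would report the same number (a quick sanity check):

import numpy as np
from scipy.stats import scoreatpercentile

runtimes = np.array([1.0, 2.0, 3.0, 4.0])
assert scoreatpercentile(runtimes, 50) == np.median(runtimes)  # both 2.5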
Project: Parallel-SGD    Author: angadgill    | project source | file source
def n_feature_influence(estimators, n_train, n_test, n_features, percentile):
    """
    Estimate influence of the number of features on prediction time.

    Parameters
    ----------

    estimators : dict of (name (str), estimator) to benchmark
    n_train : number of training instances (int)
    n_test : number of testing instances (int)
    n_features : list of feature-space dimensionalities to test (int)
    percentile : percentile at which to measure the speed (int [0-100])

    Returns
    -------

    percentiles : dict(estimator_name,
                       dict(n_features, percentile_perf_in_us))

    """
    percentiles = defaultdict(defaultdict)
    for n in n_features:
        print("benchmarking with %d features" % n)
        X_train, y_train, X_test, y_test = generate_dataset(n_train, n_test, n)
        for cls_name, estimator in estimators.items():
            estimator.fit(X_train, y_train)
            gc.collect()
            runtimes = bulk_benchmark_estimator(estimator, X_test, 30, False)
            percentiles[cls_name][n] = 1e6 * scoreatpercentile(runtimes,
                                                               percentile)
    return percentiles
Project: Parallel-SGD    Author: angadgill    | project source | file source
def fit(self, X, y, sample_weight=None):
        if sample_weight is None:
            self.quantile = stats.scoreatpercentile(y, self.alpha * 100.0)
        else:
            self.quantile = _weighted_percentile(y, sample_weight,
                                                 self.alpha * 100.0)
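
This fit method stores a constant alpha-quantile of the targets, which predict then returns for every sample; a minimal standalone sketch of the same idea (variable names are illustrative, not the project's API):

import numpy as np
from scipy import stats

y = np.array([1.0, 2.0, 3.0, 10.0, 100.0])
alpha = 0.9
quantile = stats.scoreatpercentile(y, alpha * 100.0)  # 90th percentile of the targets
pred = np.full(y.shape, quantile)                     # the constant prediction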
Project: antares    Author: CONABIO    | project source | file source
def calculate_decision(histogram, bins):
    bins = bins[0:numpy.size(bins)-1]
    thresh_above = histogram[numpy.where(bins>2)]
    thresh_below = histogram[numpy.where(bins<=2)]
    thresh_above = numpy.sort(thresh_above)
    lower_quartile = stats.scoreatpercentile(thresh_above, 25)
    upper_quartile = stats.scoreatpercentile(thresh_above, 75)
    outlier = 2 * upper_quartile - lower_quartile
    decision = bool(numpy.size(thresh_above[numpy.where(thresh_below > outlier)]))
    return decision
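
The cutoff 2 * upper_quartile - lower_quartile equals the upper quartile plus one IQR, a tighter fence than the common Tukey rule of Q3 + 1.5 * IQR (a quick numeric check):

import numpy as np
from scipy import stats

a = np.arange(1, 101)
q1 = stats.scoreatpercentile(a, 25)    # 25.75
q3 = stats.scoreatpercentile(a, 75)    # 75.25
assert 2 * q3 - q1 == q3 + (q3 - q1)   # fence == Q3 + 1*IQR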
Project: skutil    Author: tgsmith61591    | project source | file source
def _select_features(self, all_scores, all_pvalues, feature_names):
        """This function selects the top ``percentile`` of
        features from the F-scores.

        Parameters
        ----------

        all_scores : np.ndarray (float)
            The scores

        all_pvalues : np.ndarray (float)
            The p-values

        feature_names : array_like (str)
            The list of names that are eligible for drop

        Returns
        -------

        list : the features to drop
        """
        percentile = self.percentile

        # compute which features to keep or drop
        if percentile == 100:
            return []
        elif percentile == 0:
            return feature_names
        else:
            # adapted from sklearn.feature_selection.SelectPercentile
            all_scores = _clean_nans(all_scores)
            thresh = stats.scoreatpercentile(all_scores, 100 - percentile)

            mask = all_scores > thresh
            ties = np.where(all_scores == thresh)[0]
            if len(ties):
                max_feats = int(len(all_scores) * percentile / 100)
                kept_ties = ties[:max_feats - mask.sum()]
                mask[kept_ties] = True

            # inverse, since we're recording which features to DROP, not keep
            mask = np.asarray(~mask)

            # now set the drop list using the inverted mask
            return (np.asarray(feature_names)[mask]).tolist()
Project: CombinedOneClass    Author: drkatnz    | project source | file source
def fit(self, X, y=None):
        # create one data generator per column: a discrete generator if the
        # column has few distinct values, otherwise a Gaussian (or a constant
        # dummy when the column has zero variance)
        self.generators = [None] * X.shape[1]
        for col in range(X.shape[1]):
            generator = None
            if self.discrete_threshold > 1:
                discrete_gen = discrete.DiscreteGenerator(X[:, col])
                if discrete_gen.total_keys < self.discrete_threshold:
                    generator = discrete_gen
            if generator is None:
                mean = np.mean(X[:, col])
                stddev = np.std(X[:, col])
                if stddev == 0:
                    generator = abstract.DummyGenerator(mean)
                else:
                    generator = gaussian.GaussianGenerator(mean, stddev, self.random_state)
            self.generators[col] = generator

        # generate synthetic data so that it makes up `proportion_generated`
        # of the combined training set
        total_instances = len(X) / (1 - self.proportion_generated)
        generated_len = int(total_instances - len(X))
        generated = [[self.generators[col].generate() for col in range(X.shape[1])]
                     for _ in range(generated_len)]

        # work out the threshold of prob(X|C) using cross validation
        skf = StratifiedKFold(n_splits=self.cv_folds,
                              random_state=self.random_state, shuffle=True)

        newX = np.vstack((X, generated))
        # real instances are labelled 1, generated instances 0
        newY = np.hstack((np.ones(len(X)), np.zeros(generated_len)))

        thresholds = [None] * self.cv_folds
        for i, (train_indices, test_indices) in enumerate(skf.split(newX, newY)):
            if not self.density_only:
                # only train if you need to!
                self.base_classifier.fit(newX[train_indices], newY[train_indices])

            probabilities = self._get_probabilities(newX[test_indices])
            thresholds[i] = stats.scoreatpercentile(probabilities, 100 * self.contamination)

        self.threshold = np.mean(thresholds)

        # retrain on all the data
        if not self.density_only:
            self.base_classifier.fit(newX, newY)
Project: augur    Author: nextstrain    | project source | file source
def clock_filter(self, root_seq=None, n_iqd=3, max_gaps = 1.0, plot=False):
        '''
        Remove sequences from the set that evolve much faster or slower
        than the majority. Regions that are predominantly gaps can be
        excluded, since they can skew the evolutionary rates.
        '''
        if root_seq is None: # use consensus
            af = calc_af(self.aln, nuc_alpha)
            root_seq = np.array(list(nuc_alpha), 'S1')[af.argmax(axis=0)]  # np.fromstring is deprecated
        if type(root_seq)==str and root_seq in self.sequence_lookup:
            root_seq = np.array(self.sequence_lookup[root_seq])
        if max_gaps<1.0:
            af=calc_af(self.aln, nuc_alpha)
            good_pos = af[nuc_alpha.index('-')]<max_gaps
        else:
            good_pos = np.ones(self.aln.get_alignment_length(), dtype=bool)
        date_vs_distance = {}
        # self.reference_aln = None already set at alignment step
        for seq in self.aln:
            date_vs_distance[seq.id] = (seq.attributes['num_date'],
                np.mean((np.array(seq)!=root_seq)[(np.array(seq)!='-')&(root_seq!='-')&good_pos]))
            # if seq.id==self.reference.id:
            #     self.reference_aln = seq
        date_vs_distance_array = np.array(list(date_vs_distance.values()))
        from scipy.stats import linregress, scoreatpercentile
        slope, intercept, rval, pval, stderr = linregress(date_vs_distance_array[:,0], date_vs_distance_array[:,1])
        print("distance vs time regression:",slope)
        residuals = (intercept + slope*date_vs_distance_array[:,0]) - date_vs_distance_array[:,1]
        IQD = scoreatpercentile(residuals, 75) - scoreatpercentile(residuals,25)
        if plot:
            import matplotlib.pyplot as plt
            plt.ion()
            plt.scatter(date_vs_distance_array[:,0], date_vs_distance_array[:,1], c='g')
            bad_points = abs(intercept+slope*date_vs_distance_array[:,0] - date_vs_distance_array[:,1])>n_iqd*IQD
            plt.scatter(date_vs_distance_array[bad_points,0], date_vs_distance_array[bad_points,1], c='r')


        print("before clock filter:",len(self.aln))
        tmp = {seq.id:seq for seq in self.aln
                if abs(intercept+slope*date_vs_distance[seq.id][0] - date_vs_distance[seq.id][1])<n_iqd*IQD}
        if self.reference.id not in tmp and self.reference.reference_in_dataset:
            self.log.notify('adding reference again after clock filter')
            tmp[self.reference.id] = self.reference_aln
        self.aln = MultipleSeqAlignment(list(tmp.values()))
        print("after clock filter:",len(self.aln))
Project: deep_ocr    Author: JinpengLI    | project source | file source
def estimate_thresholds(flat, bignore=0.1, escale=1.0, lo=5, hi=90, debug=0):
    '''Estimate low and high thresholds for image normalization.

    bignore: fraction of the border to ignore for threshold estimation
    escale: scale for estimating a mask over the text region
    lo: percentile for black estimation
    hi: percentile for white estimation
    '''
    d0,d1 = flat.shape
    o0,o1 = int(bignore*d0), int(bignore*d1)
    est = flat[o0:d0-o0,o1:d1-o1]
    if escale>0:
        # by default, we use only regions that contain
        # significant variance; this makes the percentile
        # based low and high estimates more reliable
        e = escale
        v = est - filters.gaussian_filter(est, e*20.0)
        if debug:
            plt.clf()
            plt.title("first gaussian_filter")
            plt.imshow(v)
            input("PRESS ANY KEY TO CONTINUE.")
        v = filters.gaussian_filter(v**2, e*20.0)**0.5
        if debug:
            plt.clf()
            plt.title("second gaussian_filter")
            plt.imshow(v)
            input("PRESS ANY KEY TO CONTINUE.")
        v = (v > 0.3 * np.amax(v))
        if debug:
            plt.clf()
            plt.title("binarization")
            plt.imshow(v)
            input("PRESS ANY KEY TO CONTINUE.")
        v = morphology.binary_dilation(v, structure=np.ones((int(e*50), 1)))
        v = morphology.binary_dilation(v, structure=np.ones((1, int(e*50))))
        if debug:
            plt.clf()
            plt.title("morphology dilation")
            plt.imshow(v)
            input("PRESS ANY KEY TO CONTINUE.")
        est = est[v]
    lo = stats.scoreatpercentile(est.ravel(),lo)
    hi = stats.scoreatpercentile(est.ravel(),hi)
    return lo, hi
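
The snippet assumes scipy.ndimage imported as filters/morphology, matplotlib.pyplot as plt, and scipy.stats as stats (older SciPy exposes these submodules; newer code would import the functions from scipy.ndimage directly). A hedged usage sketch on synthetic data (values are illustrative):

import numpy as np
from scipy import stats
from scipy.ndimage import filters, morphology

flat = np.random.rand(200, 200)      # stand-in for a flattened page image
lo, hi = estimate_thresholds(flat)   # percentile-based black/white points
normalized = np.clip((flat - lo) / (hi - lo), 0, 1)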