def _p_test(self,v,grouped_data,is_continuous,is_categorical,
            pval=np.nan,ptest='Not tested',is_normal=True,min_observed=None):
    """
    Compute p value

    Parameters
    ----------
    v : str
        Name of the variable being tested; only used in warning messages.
    grouped_data
        For continuous variables: one sequence of observations per group,
        unpacked into ``stats.f_oneway`` / ``stats.kruskal``.
        For categorical variables: the contingency table passed to
        ``stats.chi2_contingency``.
    is_continuous, is_categorical : bool
        Select which family of tests is applied.
    pval, ptest
        Values returned unchanged when no test is run.
    is_normal : bool, optional
        For continuous variables, use one-way ANOVA when true and
        Kruskal-Wallis otherwise.
        NOTE(review): the original body read ``is_normal`` as a free name
        that was not bound anywhere in the visible signature; it is exposed
        here as an optional parameter. The default of True is an assumption
        to confirm against the callers.
    min_observed : int or None, optional
        Smallest number of observations in any sub-group. When it is 0 no
        test is run. ``None`` (default) skips this guard.
        NOTE(review): same situation as ``is_normal`` -- confirm the
        intended source of this value.

    Returns
    -------
    (pval, ptest) : tuple
        The p-value and a label naming the test that produced it.
    """
    # do not test if any sub-group has no observations
    if min_observed == 0:
        warnings.warn('No p-value was computed for {} due to the low number of observations.'.format(v))
        return pval, ptest

    if is_continuous:
        if is_normal:
            # normally distributed
            ptest = 'One-way ANOVA'
            _, pval = stats.f_oneway(*grouped_data)
        else:
            # non-normally distributed
            ptest = 'Kruskal-Wallis'
            _, pval = stats.kruskal(*grouped_data)
    elif is_categorical:
        # default to chi-squared
        ptest = 'Chi-squared'
        _, pval, _, expected = stats.chi2_contingency(grouped_data)
        # if any expected cell counts are < 5, chi2 may not be valid
        if expected.min() < 5:
            if np.shape(grouped_data) == (2, 2):
                # a 2x2 table can be tested exactly instead
                ptest = "Fisher's exact"
                _, pval = stats.fisher_exact(grouped_data)
            else:
                # keep the chi-squared p-value but flag it in the label
                ptest = 'Chi-squared (warning: expected count < 5)'
                # NOTE(review): this message says no p-value was computed,
                # yet the chi-squared p-value is still returned; the text is
                # kept as-is for compatibility.
                warnings.warn('No p-value was computed for {} due to the low number of observations.'.format(v))

    return pval, ptest
def test_bin_fisher(intv_bin_ip, intv_bin_con, with_control=True, correction_method='fdr_bh'):
    """Run a one-sided Fisher exact test on every bin of an interval.

    Parameters
    ----------
    intv_bin_ip : 2-D array
        IP counts; must have exactly one row (no replicates), one column
        per bin.
    intv_bin_con : 2-D array
        Control counts; its first row is used and it must have the same
        number of columns as ``intv_bin_ip``.
    with_control : bool
        When true the bin signal is the IP-vs-control odds ratio, otherwise
        it is the raw IP count of the bin.
    correction_method : str
        Forwarded to ``multipletests`` as ``method``.

    Returns
    -------
    (binsignal, binscore_adj)
        Per-bin signal and the second element of the ``multipletests``
        result computed from the per-bin p-values.

    Raises
    ------
    Exception
        If ``intv_bin_ip`` has more than one row.
    """
    if intv_bin_ip.shape[0] != 1:
        raise Exception('Fisher exact test does not deal with replicates.')
    intv_counter = intv_bin_ip.shape[1]
    assert intv_counter == intv_bin_con.shape[1]
    binscore = np.empty(intv_counter)
    binsignal = np.empty(intv_counter)
    ip_sum = np.sum(intv_bin_ip[0, :])
    con_sum = np.sum(intv_bin_con[0, :])
    for i in range(intv_counter):
        this_ip = intv_bin_ip[0, i]
        others_ip = ip_sum - this_ip
        this_con = intv_bin_con[0, i]
        others_con = con_sum - this_con
        if this_ip == 0:
            # An empty bin cannot be enriched: record "no signal, p = 1" and
            # skip the test so these values are not overwritten below.
            binsignal[i], binscore[i] = np.nan, 1.0
            continue
        _, binscore[i] = fisher_exact(
            [[this_ip, others_ip], [this_con, others_con]],
            alternative='greater')
        if with_control:
            # (this_ip / others_ip) / (this_con / others_con)
            binsignal[i] = this_ip / others_ip / this_con * others_con
        else:
            binsignal[i] = this_ip

    adj = multipletests(binscore, alpha=0.05, method=correction_method)
    binscore_adj = adj[1]
    return binsignal, binscore_adj
def fisher_exact(*_args, **_kwargs):
    """Stand-in that accepts any arguments and always raises.

    Presumably a fallback for when a real ``fisher_exact`` implementation
    cannot be imported -- TODO confirm against the surrounding module.
    """
    raise NotImplementedError

### Indices to marginals arguments:
def fisher(cls, *marginals):
    """Scores bigrams using Fisher's Exact Test (Pedersen 1996).

    Less sensitive to small counts than PMI or Chi Sq, but also more
    expensive to compute. Requires scipy.

    Parameters
    ----------
    cls
        Object providing ``_contingency(*marginals)``, which must return the
        four cells ``(n_ii, n_io, n_oi, n_oo)`` of a 2x2 contingency table.
    *marginals
        Forwarded unchanged to ``cls._contingency``.

    Returns
    -------
    The p-value of the one-sided (``alternative='less'``) test.
    """
    n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)

    # Only the p-value is used as the score; the odds ratio is discarded.
    _odds, pvalue = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
    return pvalue
def get_tf_associations(self, test):
        """Run Fisher exact tests over Term - Function combinations.

        For every distinct value of ``self.ontological`` that is ``'body'``
        or ``'thing'``, every index ``li`` in ``range(30)``, every term found
        at that index and every distinct value of ``self.annotation`` other
        than ``'UF'``, a 2x2 table ``[[aa, ab], [ba, bb]]`` is built and
        passed to ``fisher_exact``.

        Parameters
        ----------
        test : str
            ``'not dissociated'`` (one-sided 'less' test, p > .05) or
            ``'associated'`` (one-sided 'greater' test, p < .05).

        Returns
        -------
        set
            ``tf_set``. NOTE(review): nothing is ever added to it in this
            block, so the returned set is always empty; the ``valid`` flag is
            computed but never used. Confirm what should be stored.
        """
        # test = {not dissociated,associated}
        tf_set = set()
        # this is the set in which all Term - Function pairs will be contained
        # that cannot be dissociated (i.e, for which we do not know for sure that
        # they are not associated) - done with Fisher Exact tests
        for onto in set(self.ontological):
            if onto not in ['body','thing']: continue
            # NOTE(review): this builds a one-element list wrapping the
            # comparison result rather than selecting rows from a data
            # container -- confirm the intended data source.
            d_onto =[self.ontological == onto]
            for li in range(30):
                terms = set([w for dd in d_onto for w in dd[li]])
                for term in terms:
                    for annot in set(self.annotation):
                        valid = False
                        if annot == 'UF': continue
                        # NOTE(review): same one-element-list pattern as d_onto above.
                        d_onto_annot =[(self.ontological == onto) * (self.annotation == annot)]
                        aa = len([t for t in d_onto_annot if term in t[li]]) # + term + function
                        ab = len(d_onto_annot) - aa # - term + function
                        ba = len([t for t in d_onto if term in t[li]]) - aa # + term - function
                        bb = len(d_onto) - (aa + ab + ba) # - term - function
                        # The second positional argument of fisher_exact selects the alternative.
                        if test == 'not dissociated' and fisher_exact([[aa,ab],[ba,bb]],'less')[1] > .05:
                            valid = True
                        if test == 'associated' and fisher_exact([[aa,ab],[ba,bb]],'greater')[1] < .05:
                            valid = True
                        # if aa > 0: print('%s,%d,%s,%s,%r,%d,%d,%d' % (onto,li,term,annot,valid,aa,ba,ab))
        return tf_set
def fisher_exact(*_args, **_kwargs):
    """Stand-in that accepts any arguments and always raises.

    Presumably a fallback for when a real ``fisher_exact`` implementation
    cannot be imported -- TODO confirm against the surrounding module.
    """
    raise NotImplementedError

### Indices to marginals arguments:
def fisher(cls, *marginals):
    """Scores bigrams using Fisher's Exact Test (Pedersen 1996).

    Less sensitive to small counts than PMI or Chi Sq, but also more
    expensive to compute. Requires scipy.

    Parameters
    ----------
    cls
        Object providing ``_contingency(*marginals)``, which must return the
        four cells ``(n_ii, n_io, n_oi, n_oo)`` of a 2x2 contingency table.
    *marginals
        Forwarded unchanged to ``cls._contingency``.

    Returns
    -------
    The p-value of the one-sided (``alternative='less'``) test.
    """
    n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)

    # Only the p-value is used as the score; the odds ratio is discarded.
    _odds, pvalue = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
    return pvalue
def _get_fisher_scores_from_counts(self, cat_word_counts, not_cat_word_counts):
    """Run a one-sided Fisher exact test for every word.

    Parameters
    ----------
    cat_word_counts : array-like
        Per-word counts inside the category.
    not_cat_word_counts : array-like
        Per-word counts outside the category; same length as
        ``cat_word_counts``.

    Returns
    -------
    (odds_ratio, p_values)
        Two arrays with one entry per word, taken from
        ``fisher_exact(..., alternative='greater')`` on the table
        ``[[word in cat, other words in cat],
           [word not in cat, other words not in cat]]``.
    """
    cat_not_word_counts = cat_word_counts.sum() - cat_word_counts
    not_cat_not_word_counts = not_cat_word_counts.sum() - not_cat_word_counts

    def do_fisher_exact(x):
        return fisher_exact([[x[0], x[1]], [x[2], x[3]]], alternative='greater')

    # Shape (4, n_words): each column holds the four cells for one word, so
    # the test is applied along axis 0.
    counts = np.array([cat_word_counts,
                       cat_not_word_counts,
                       not_cat_word_counts,
                       not_cat_not_word_counts])
    odds_ratio, p_values = np.apply_along_axis(do_fisher_exact, 0, counts)
    return odds_ratio, p_values
def fisher_exact(*_args, **_kwargs):
    """Stand-in that accepts any arguments and always raises.

    Presumably a fallback for when a real ``fisher_exact`` implementation
    cannot be imported -- TODO confirm against the surrounding module.
    """
    raise NotImplementedError

### Indices to marginals arguments:
def fisher(cls, *marginals):
    """Scores bigrams using Fisher's Exact Test (Pedersen 1996).

    Less sensitive to small counts than PMI or Chi Sq, but also more
    expensive to compute. Requires scipy.

    Parameters
    ----------
    cls
        Object providing ``_contingency(*marginals)``, which must return the
        four cells ``(n_ii, n_io, n_oi, n_oo)`` of a 2x2 contingency table.
    *marginals
        Forwarded unchanged to ``cls._contingency``.

    Returns
    -------
    The p-value of the one-sided (``alternative='less'``) test.
    """
    n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)

    # Only the p-value is used as the score; the odds ratio is discarded.
    _odds, pvalue = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
    return pvalue
def run_test(q1_pos, q2_pos, q1_neg,q2_neg):
    """Run one significance test per 2x2 contingency table.

    This method takes four parallel arrays representing a 2x2 contingency
    table. The length of these parallel arrays denotes the number of tests
    that will be run. Either a chi-squared test or a Fisher exact test is
    run, depending on whether the requirements for a reliable chi-squared
    test are satisfied (as decided by ``useFisherExact``).

    Bonferroni correction is then applied by adjusting the p-values for all
    of the tests: each p-value is multiplied by the number of tests and
    capped at 1.0.

    Returns
    -------
    list
        Two parallel lists ``[adjusted_p_values, test_values]``; the test
        value is the chi-squared statistic or the Fisher exact odds ratio.

    Raises
    ------
    ValueError
        If the four inputs do not all have the same length.
    """
    columns = [q1_pos, q2_pos, q1_neg, q2_neg]
    n = len(columns[0])
    if not all(len(x) == n for x in columns):
        raise ValueError("length of input lists must be of same length")

    pvalues = []
    test_values = []

    for a, b, c, d in zip(*columns):
        obs = np.array([[a, b], [c, d]])
        # Run the chosen test once and keep both the statistic and p-value.
        if useFisherExact(obs):
            result = fisher_exact(obs)
        else:
            result = chi2_contingency(obs)
        test_values.append(result[0])
        pvalues.append(result[1])

    # applying Bonferroni correction
    adjusted_p_values = [min(1.0, float(p) * n) for p in pvalues]
    return [adjusted_p_values, test_values]
def fisher_exact(*_args, **_kwargs):
    """Stand-in that accepts any arguments and always raises.

    Presumably a fallback for when a real ``fisher_exact`` implementation
    cannot be imported -- TODO confirm against the surrounding module.
    """
    raise NotImplementedError

### Indices to marginals arguments:
def fisher(cls, *marginals):
    """Scores bigrams using Fisher's Exact Test (Pedersen 1996).

    Less sensitive to small counts than PMI or Chi Sq, but also more
    expensive to compute. Requires scipy.

    Parameters
    ----------
    cls
        Object providing ``_contingency(*marginals)``, which must return the
        four cells ``(n_ii, n_io, n_oi, n_oo)`` of a 2x2 contingency table.
    *marginals
        Forwarded unchanged to ``cls._contingency``.

    Returns
    -------
    The p-value of the one-sided (``alternative='less'``) test.
    """
    n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)

    # Only the p-value is used as the score; the odds ratio is discarded.
    _odds, pvalue = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
    return pvalue
def fisher_exact(*_args, **_kwargs):
    """Stand-in that accepts any arguments and always raises.

    Presumably a fallback for when a real ``fisher_exact`` implementation
    cannot be imported -- TODO confirm against the surrounding module.
    """
    raise NotImplementedError

### Indices to marginals arguments:
def fisher(cls, *marginals):
    """Scores bigrams using Fisher's Exact Test (Pedersen 1996).

    Less sensitive to small counts than PMI or Chi Sq, but also more
    expensive to compute. Requires scipy.

    Parameters
    ----------
    cls
        Object providing ``_contingency(*marginals)``, which must return the
        four cells ``(n_ii, n_io, n_oi, n_oo)`` of a 2x2 contingency table.
    *marginals
        Forwarded unchanged to ``cls._contingency``.

    Returns
    -------
    The p-value of the one-sided (``alternative='less'``) test.
    """
    n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)

    # Only the p-value is used as the score; the odds ratio is discarded.
    _odds, pvalue = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
    return pvalue
def fisher_exact(*_args, **_kwargs):
    """Stand-in that accepts any arguments and always raises.

    Presumably a fallback for when a real ``fisher_exact`` implementation
    cannot be imported -- TODO confirm against the surrounding module.
    """
    raise NotImplementedError

### Indices to marginals arguments:
def fisher(cls, *marginals):
    """Scores bigrams using Fisher's Exact Test (Pedersen 1996).

    Less sensitive to small counts than PMI or Chi Sq, but also more
    expensive to compute. Requires scipy.

    Parameters
    ----------
    cls
        Object providing ``_contingency(*marginals)``, which must return the
        four cells ``(n_ii, n_io, n_oi, n_oo)`` of a 2x2 contingency table.
    *marginals
        Forwarded unchanged to ``cls._contingency``.

    Returns
    -------
    The p-value of the one-sided (``alternative='less'``) test.
    """
    n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)

    # Only the p-value is used as the score; the odds ratio is discarded.
    _odds, pvalue = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
    return pvalue
def fisher_exact(*_args, **_kwargs):
    """Stand-in that accepts any arguments and always raises.

    Presumably a fallback for when a real ``fisher_exact`` implementation
    cannot be imported -- TODO confirm against the surrounding module.
    """
    raise NotImplementedError

### Indices to marginals arguments:
def fisher(cls, *marginals):
    """Scores bigrams using Fisher's Exact Test (Pedersen 1996).

    Less sensitive to small counts than PMI or Chi Sq, but also more
    expensive to compute. Requires scipy.

    Parameters
    ----------
    cls
        Object providing ``_contingency(*marginals)``, which must return the
        four cells ``(n_ii, n_io, n_oi, n_oo)`` of a 2x2 contingency table.
    *marginals
        Forwarded unchanged to ``cls._contingency``.

    Returns
    -------
    The p-value of the one-sided (``alternative='less'``) test.
    """
    n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)

    # Only the p-value is used as the score; the odds ratio is discarded.
    _odds, pvalue = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
    return pvalue
def fisher_exact(*_args, **_kwargs):
    """Stand-in that accepts any arguments and always raises.

    Presumably a fallback for when a real ``fisher_exact`` implementation
    cannot be imported -- TODO confirm against the surrounding module.
    """
    raise NotImplementedError

### Indices to marginals arguments:
def fisher(cls, *marginals):
    """Scores bigrams using Fisher's Exact Test (Pedersen 1996).

    Less sensitive to small counts than PMI or Chi Sq, but also more
    expensive to compute. Requires scipy.

    Parameters
    ----------
    cls
        Object providing ``_contingency(*marginals)``, which must return the
        four cells ``(n_ii, n_io, n_oi, n_oo)`` of a 2x2 contingency table.
    *marginals
        Forwarded unchanged to ``cls._contingency``.

    Returns
    -------
    The p-value of the one-sided (``alternative='less'``) test.
    """
    n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)

    # Only the p-value is used as the score; the odds ratio is discarded.
    _odds, pvalue = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
    return pvalue
def concordance(series1, series2, method, nreps=1000):
    """
    Measures the concordance between two pandas Series and returns a pvalue
    and measure of concordance.

    Parameters
    ----------
    series1, series2 : pandas Series
        Series with matching indexes.
    method : str
        ['fisher', 'spearman', 'kendalltau', 'empirical', 'cohen']
    nreps : int
        number of repetitions to build the null. Only needed if method is
        'empirical'.

    Returns
    -------
    measure : float
        some sort of measure of concordance (e.g. r for the correlation
        methods, n_observed - mean(n_expected) for empirical, etc)
    p : float
        p value of observed concordance between series1 and series2
        (``np.nan`` for 'cohen', which yields no p value)

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'fisher':
        # Note: this automatically ignores any bugs which were not present
        # in both series.
        mat = pd.crosstab(series1, series2)
        return fisher_exact(mat)

    if method == 'spearman':
        return spearmanr(series1, series2)

    if method == 'kendalltau':
        return kendalltau(series1, series2, nan_policy='omit')

    if method == 'empirical':
        return empirical_pval(series1, series2, nreps)

    if method == 'cohen':
        # Keep only the rows where both series have a value.
        tmp = pd.concat((series1, series2), axis=1).dropna()
        return cohen_kappa_score(tmp.iloc[:, 0], tmp.iloc[:, 1]), np.nan

    raise ValueError('Unknown concordance method.')
def single_side_pathway_enrichment(pathway_definitions,
                                   gene_signature,
                                   n_genes):
    """Identify overrepresented pathways using the Fisher's exact test for
    significance on a given pathway definition and gene signature.
    (FDR correction for multiple testing is not applied here.)

    Parameters
    -----------
    pathway_definitions : dict(str -> set(str))
      Pathway definitions, *post*-overlap-correction if this function
      is called from `pathway_enrichment_with_overlap_correction`.
      A pathway (key) is defined by a set of genes (value). A tuple of
      sets is also accepted and is replaced by the union of its members.
    gene_signature : set(str)
      The set of genes we consider to be enriched in a feature.
    n_genes : int
      The total number of genes for which we have assigned weights in the
      features of an unsupervised model.

    Returns
    -----------
    pandas.Series, for each pathway, the p-value from applying the Fisher's
      exact test. Empty when `gene_signature` is empty.
    """
    if not gene_signature:
        return pd.Series(name="p-value", dtype=float)
    pvalues_list = []
    for pathway, definition in pathway_definitions.items():
        if isinstance(definition, tuple):
            definition = set.union(*definition)

        both_definition_and_signature = len(definition & gene_signature)
        in_definition_not_signature = (len(definition) -
                                       both_definition_and_signature)
        in_signature_not_definition = (len(gene_signature) -
                                       both_definition_and_signature)
        neither_definition_nor_signature = (n_genes -
                                            both_definition_and_signature -
                                            in_definition_not_signature -
                                            in_signature_not_definition)
        contingency_table = np.array(
            [[both_definition_and_signature, in_signature_not_definition],
             [in_definition_not_signature, neither_definition_nor_signature]])
        try:
            _, pvalue = stats.fisher_exact(
                contingency_table, alternative="greater")
        # FPE can occur when `neither_definition_nor_signature` is very
        # large and `both_definition_and_signature` is very small (near zero)
        except FloatingPointError:
            # NOTE(review): the fallback value is an assumption. With
            # (almost) no overlap there is no evidence of enrichment, so the
            # pathway is recorded as not significant; this also keeps the
            # list aligned with the pathway index.
            pvalue = 1.0
        pvalues_list.append(pvalue)
    pvalues_series = pd.Series(
        pvalues_list, index=list(pathway_definitions), name="p-value")
    return pvalues_series
# Project: cohorts | Author: hammerlab | project source | file source
def fishers_exact_plot(data, condition1, condition2, ax=None,
                       alternative="two-sided", **kwargs):
    Perform a Fisher's exact test to compare to binary columns

    data: Pandas dataframe
        Dataframe to retrieve information from

    condition1: str
        First binary column to compare (and used for test sidedness)

    condition2: str
        Second binary column to compare

    ax : Axes, default None
        Axes to plot on

        If `condition1` is not a binary column, split on =/!= to condition1_value

        Specify the sidedness of the test: "two-sided", "less"
        or "greater"
    plot = sb.barplot(

    plot.set_ylabel("Percent %s" % condition2)
    condition1_mask = get_condition_mask(data, condition1, condition1_value)
    count_table = pd.crosstab(data[condition1], data[condition2])
    oddsratio, p_value = fisher_exact(count_table, alternative=alternative)
    add_significance_indicator(plot=plot, significant=p_value <= 0.05)

    if alternative != "two-sided":
        raise ValueError("We need to better understand the one-sided Fisher's Exact test")
    sided_str = "two-sided"
    print("Fisher's Exact Test: OR: {}, p-value={} ({})".format(oddsratio, p_value, sided_str))
    return FishersExactResults(oddsratio=oddsratio,