Python pandas module: melt() example source code

The following 43 code examples, extracted from open-source Python projects, illustrate how to use pandas.melt().
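
Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of what pandas.melt() does: it unpivots a wide DataFrame into long form, keeping the id_vars columns fixed and stacking the remaining columns into variable/value pairs. The column names are invented for illustration.

import pandas as pd

# A small wide-format table: one row per city, one column per year.
wide = pd.DataFrame({
    'city': ['Oslo', 'Lima'],
    '2015': [10, 20],
    '2016': [11, 22],
})

# Unpivot: 'city' stays as an identifier, the year columns become rows.
long_df = pd.melt(wide, id_vars=['city'], var_name='year', value_name='count')
print(long_df)
#    city  year  count
# 0  Oslo  2015     10
# 1  Lima  2015     20
# 2  Oslo  2016     11
# 3  Lima  2016     22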

Project: diamond    Author: stitchfix
def _create_main_design(self, **kwargs):
        r"""
        Create design matrix for main effects
        Keyword Args:
            * *df* (``DataFrame``). specify a new dataframe to create
                design matrix from
        Returns:
            array_like: design matrix in sparse CSR format

        """
        df = kwargs.get('df', self.train_df)
        df.reset_index(drop=True, inplace=True)
        df['row_index'] = df.index
        df['intercept'] = 1.0  # assume intercept is always included

        id_cols = ['row_index']

        melted_df = pd.melt(df[id_cols + self.main_effects], id_cols)
        melted_df = melted_df.merge(self.main_map, on='variable')
        melted_df['col_index'] = melted_df['main_idx']
        row = melted_df.row_index
        col = melted_df.col_index
        data = melted_df.value
        return sparse.coo_matrix((data, (row, col)),
                                 shape=(max(row) + 1, max(col) + 1)).tocsr()
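
The function above melts the predictor columns so that each (row_index, variable, value) triple becomes one nonzero entry of a sparse design matrix. The following standalone sketch is not part of diamond; the column names and the main_map lookup are invented, but it shows the same melt-to-COO pattern on toy data.

import pandas as pd
from scipy import sparse

# Toy data: two predictors, three observations.
df = pd.DataFrame({'x1': [1.0, 0.0, 2.0], 'x2': [0.5, 1.5, 0.0]})
df['row_index'] = df.index

# Map each predictor name to a column index of the design matrix.
main_map = pd.DataFrame({'variable': ['x1', 'x2'], 'main_idx': [0, 1]})

# Each melted row supplies one (row, col, value) entry of the COO matrix.
melted = pd.melt(df, id_vars=['row_index']).merge(main_map, on='variable')
design = sparse.coo_matrix(
    (melted['value'], (melted['row_index'], melted['main_idx'])),
    shape=(len(df), len(main_map))).tocsr()
print(design.toarray())
# [[1.  0.5]
#  [0.  1.5]
#  [2.  0. ]]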
Project: Comparative-Annotation-Toolkit    Author: ComparativeGenomicsToolkit
def missing_rate_plot(consensus_data, ordered_genomes, biotypes, missing_plot_tgt):
    """Missing genes/transcripts"""
    base_title = 'Number of missing orthologs in consensus set'
    gene_missing_df = json_biotype_counter_to_df(consensus_data, 'Gene Missing')
    gene_missing_df.columns = ['biotype', 'Genes', 'genome']
    transcript_missing_df = json_biotype_counter_to_df(consensus_data, 'Transcript Missing')
    transcript_missing_df.columns = ['biotype', 'Transcripts', 'genome']
    df = transcript_missing_df.merge(gene_missing_df, on=['genome', 'biotype'])
    df = pd.melt(df, id_vars=['biotype', 'genome'])
    ylabel = 'Number of genes or transcripts'
    with missing_plot_tgt.open('w') as outf, PdfPages(outf) as pdf:
        tot_df = df.groupby(['genome', 'biotype', 'variable']).aggregate(sum).reset_index()
        generic_barplot(tot_df, pdf, '', ylabel, base_title, x='genome', y='value',
                        col='variable', row_order=ordered_genomes)
        for biotype in biotypes:
            biotype_df = biotype_filter(df, biotype)
            if biotype_df is None:
                continue
            biotype_df = biotype_df.groupby(['genome', 'variable']).aggregate(sum).reset_index()
            title = base_title + ' for biotype {}'.format(biotype)
            generic_barplot(biotype_df, pdf, '', ylabel, title, x='genome', y='value',
                            col='variable', row_order=ordered_genomes)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_custom_var_name(self):
        result5 = melt(self.df, var_name=self.var_name)
        self.assertEqual(result5.columns.tolist(), ['var', 'value'])

        result6 = melt(self.df, id_vars=['id1'], var_name=self.var_name)
        self.assertEqual(result6.columns.tolist(), ['id1', 'var', 'value'])

        result7 = melt(self.df, id_vars=['id1', 'id2'], var_name=self.var_name)
        self.assertEqual(result7.columns.tolist(), ['id1', 'id2', 'var',
                                                    'value'])

        result8 = melt(self.df, id_vars=['id1', 'id2'], value_vars='A',
                       var_name=self.var_name)
        self.assertEqual(result8.columns.tolist(), ['id1', 'id2', 'var',
                                                    'value'])

        result9 = melt(self.df, id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                       var_name=self.var_name)
        expected9 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                               'id2': self.df['id2'].tolist() * 2,
                               self.var_name: ['A'] * 10 + ['B'] * 10,
                               'value': (self.df['A'].tolist() +
                                         self.df['B'].tolist())},
                              columns=['id1', 'id2', self.var_name, 'value'])
        tm.assert_frame_equal(result9, expected9)
Project: microbiomeHD    Author: cduvallet
def tidyfy_df(df):
    """
    Returns tidy df pivoted around 'otu', for the aggregate ubiquity and abundance
    measures.

    Input df should have columns labeled like "ubiquity_calc_type_patients" or
    "abundance_calc_type_patients" where the first underscore-delimited value
    is "abundance" or "ubiquity" and the last one is "dis", "h", or "total"
    (or some other patient type indicator). The middle values are the type of
    calculation used (e.g. "from_pooled_calc", "mean_of_datasets")

    Note that columns with 'in_one_dataset' are discarded.
    """

    id_vars = ['otu']
    value_vars = [i for i in df.columns if i.startswith('ubiquity') or i.startswith('abundance')]
    value_vars = [i for i in value_vars if 'in_one_dataset' not in i]

    tidydf = pd.melt(df, id_vars=id_vars, value_vars=value_vars).drop_duplicates()

    tidydf['metric'] = tidydf['variable'].apply(lambda x: x.split('_')[0])
    tidydf['calculation'] = tidydf['variable'].apply(lambda x: x.split('_',1)[1].rsplit('_',1)[0])
    tidydf['patient'] = tidydf['variable'].apply(lambda x: x.split('_')[-1])

    return tidydf
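
To make the naming convention from the docstring concrete, here is a small hypothetical input and the way the underscore splitting separates each variable name into metric, calculation, and patient type (the values and column names are made up).

import pandas as pd

df = pd.DataFrame({
    'otu': ['g__Bacteroides'],
    'ubiquity_from_pooled_calc_dis': [0.8],
    'abundance_mean_of_datasets_h': [0.1],
})
tidy = pd.melt(df, id_vars=['otu'])
tidy['metric'] = tidy['variable'].apply(lambda x: x.split('_')[0])
tidy['calculation'] = tidy['variable'].apply(
    lambda x: x.split('_', 1)[1].rsplit('_', 1)[0])
tidy['patient'] = tidy['variable'].apply(lambda x: x.split('_')[-1])
print(tidy[['metric', 'calculation', 'patient', 'value']])
#       metric       calculation patient  value
# 0   ubiquity  from_pooled_calc     dis    0.8
# 1  abundance  mean_of_datasets       h    0.1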
Project: gini-index    Author: datasets
def main():
    giniIndex = pd.read_csv(source)
    giniIndex.to_csv('archive/gini-index.csv', sep=",", index=False)
    print("Saved archive CSV file.")
    print (giniIndex)

    # Processing the data
    df = pd.read_csv('archive/gini-index.csv')      # Reading the source csv
    """
    Python is printing "Country Name" with quotes in data frame and does not
    work for the remaining code
    """
    df.columns.values[0] = 'Country Name'

    df = pd.melt(df, id_vars=['Country Name', 'Country Code'], var_name="Year", value_name="Value")     # Unpivoting
    df = df.sort_values(by=['Country Name', 'Year'], ascending=[True, True]) # Sorting by country

    df.dropna().to_csv('data/gini-index.csv', sep=",", index=False)   # Saving CSV
    print ("File has been saved and it is ready for data packaging.")
Project: bubble_plot    Author: shirmeir
def plot_with_z(df, x, y, z_boolean, bins_x, bins_y, x_is_numeric, y_is_numeric, ordered_x_values, ordered_y_values, maximal_bubble_size=4000, normalization_by_all=False):
    count_table = pd.concat([pd.cut(df[x], bins=bins_x) if x_is_numeric else df[x],
                         pd.cut(df[y], bins=bins_y) if y_is_numeric else df[y], df[z_boolean]], axis=1)
    count_table = count_table.groupby([x,z_boolean])[y].value_counts().unstack().fillna(0)
    count_table = count_table.unstack()
    count_table_long = pd.melt(count_table.reset_index(), id_vars=x)
    z_boolean_values = count_table_long[z_boolean].unique()
    ratio = pd.DataFrame({'ratio':count_table_long.set_index([x,y,z_boolean]).unstack()['value'][z_boolean_values[1]] / (
    count_table_long.set_index([x,y,z_boolean]).unstack()['value'].sum(axis=1) )})
    count_table_long = count_table_long.set_index([x, y ])[['value']].merge(ratio, left_index=True, right_index=True).reset_index()
    size_factor = maximal_bubble_size/count_table_long['value'].max()
    x_values_dict = {x:i for i, x in enumerate(ordered_x_values)} \
        if not x_is_numeric else {xx:get_point(xx) for xx in ordered_x_values}
    y_values_dict = {x:i for i, x in enumerate(ordered_y_values)} \
        if not y_is_numeric else {xx: get_point(xx) for xx in ordered_y_values}
    xticks = np.arange(len(ordered_x_values)) if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticks = np.arange(len(ordered_y_values)) if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    xticklabels = ordered_x_values if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticklabels = ordered_y_values if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    count_table_long[x] = count_table_long[x].map(x_values_dict)
    count_table_long[y] = count_table_long[y].map(y_values_dict)
    plt.scatter(count_table_long[x], count_table_long[y], s=size_factor*count_table_long['value'],
                c=count_table_long['ratio'],  alpha=0.5,
                cmap='cool')
    return count_table_long, xticks, yticks, xticklabels, yticklabels
Project: fitbit-analyzer    Author: 5agado
def _plotWeekdayStats(stats, columns, groupBy=True):
    dataToPlot = stats.copy()
    # Group by weekday and rename date column
    if groupBy:
        dataToPlot = dataToPlot.groupby(stats['date'].dt.weekday).mean()
        dataToPlot = dataToPlot.reset_index().rename(columns={'date':'weekday'})

    # change stats from columns to row attribute
    dataToPlot = pd.melt(dataToPlot, id_vars=['weekday'], value_vars=columns,
                         var_name='stats', value_name='val')
    # Rename stats and weekdays
    dataToPlot['stats'].replace(NAMES, inplace=True)
    dataToPlot['weekday'].replace(dayOfWeek, inplace=True)
    # Plot
    g = sns.factorplot(data=dataToPlot, x="weekday", y="val", col="stats",
                       order=dayOfWeekOrder, kind="point", sharey=False, col_wrap=3)
    g.set_xticklabels(rotation=45)
    g.set(xlabel='')
    return g
    #sns.plt.show()
Project: quail    Author: ContextLab
def format2tidy(df, subjname, listname, subjgroup, **attrs):

    melted_df = pd.melt(df.T)
    melted_df[subjname]=""
    for idx,sub in enumerate(melted_df['Subject'].unique()):
        melted_df.loc[melted_df['Subject']==sub,subjname]=subjgroup[idx]
    if attrs['analysis_type'] in ['spc']:
        base = list(df.columns)
        melted_df['Position'] = base * int(melted_df.shape[0] / len(base))
        melted_df.columns = ['Subject', listname, 'Proportion Recalled', subjname, 'Position']
    elif attrs['analysis_type'] in ['pfr', 'pnr']:
        base = list(df.columns)
        melted_df['Position'] = base * int(melted_df.shape[0] / len(base))
        melted_df.columns = ['Subject', listname, 'Probability of Recall: Position ' + str(attrs['n']), subjname, 'Position']
    elif attrs['analysis_type'] == 'lagcrp':
        base = range(int(-len(df.columns.values)/2),int(len(df.columns.values)/2)+1)
        melted_df['Position'] = base * int(melted_df.shape[0] / len(base))
        melted_df.columns = ['Subject', listname, 'Conditional Response Probability', subjname, 'Position']
    elif attrs['analysis_type'] == 'fingerprint' or attrs['analysis_type'] == 'fingerprint_temporal':
        base = list(df.columns.values)
        melted_df['Feature'] = base * int(melted_df.shape[0] / len(base))
        melted_df.columns = ['Subject', listname, 'Clustering Score', subjname, 'Feature']
    elif attrs['analysis_type'] == 'accuracy':
        melted_df.columns = ['Subject', listname, 'Accuracy', subjname]
    elif attrs['analysis_type'] == 'temporal':
        melted_df.columns = ['Subject', listname, 'Temporal Clustering Score', subjname]


    return melted_df
Project: kmeans-service    Author: MAYHEM-Lab
def plot_aic_bic_fig(tasks):
    """
    Creates AIC-BIC plot, as a 2-row x 3-col grid of point plots with 95% confidence intervals.

    Parameters
    ----------
    tasks: list(dict)

    Returns
    -------
    Matplotlib Figure object
    """
    sns.set(context='talk', style='whitegrid')
    # Filter list of dicts to reduce the size of Pandas DataFrame
    df = pd.DataFrame(filter_dict_list_by_keys(tasks, ['k', 'covar_type', 'covar_tied', 'bic', 'aic']))
    df['covar_type'] = [x.capitalize() for x in df['covar_type']]
    df['covar_tied'] = [['Untied', 'Tied'][x] for x in df['covar_tied']]
    df['aic'] = df['aic'].astype('float')
    df['bic'] = df['bic'].astype('float')
    df = pd.melt(df, id_vars=['k', 'covar_type', 'covar_tied'], value_vars=['aic', 'bic'], var_name='metric')
    f = sns.factorplot(x='k', y='value', col='covar_type', row='covar_tied', hue='metric', data=df,
                       row_order=['Tied', 'Untied'], col_order=['Full', 'Diag', 'Spher'], legend=True, legend_out=True,
                       ci=95, n_boot=100)
    f.set_titles("{col_name}-{row_name}")
    f.set_xlabels("Num. of Clusters (K)")
    return f.fig
Project: johnson-county-ddj-public    Author: dssg
def generate_metrics(self):
        """Given a model id and a set of thresholds, obtain the y values (true
        class and predicted probability) and calculate metrics for the
        model at each threshold.

        :param batch_timestamp: timestamps of model batches
        :type batch_timestamp: list
        :returns: None -- always returns None as default
        :rtype: None
        """
        # get the y-values
        y_values = self.get_y_values()

        # generate metrics at thresholds
        eval_metrics_pct = self.threshold_pct.apply(self.evaluate_model_at_threshold,
                                               args = (y_values['scores'],
                                                       y_values['y_true'],
                                                       True))
        eval_metrics_abs = self.threshold_abs.apply(self.evaluate_model_at_threshold,
                                               args = (y_values['scores'],
                                                       y_values['y_true'],
                                                       False))

        # build table of metrics
        eval_metrics = pd.concat([eval_metrics_pct, eval_metrics_abs])
        eval_metrics_long = pd.melt(eval_metrics, id_vars = ['parameter'],
                                    var_name = 'metric')
        eval_metrics_long['unique_timestamp'] = self.model_id
        auc = self.compute_AUC(y_values['y_true'], y_values['scores'])
        final_metrics = eval_metrics_long.append({'parameter': 'roc',
            'metric': 'auc',
            'value': auc,
            'unique_timestamp': self.model_id},
            ignore_index = True)
        metrics_cols = ['parameter', 'metric', 'value', 'unique_timestamp']
        final_metrics = final_metrics[metrics_cols]

        return(final_metrics)
Project: diamond    Author: stitchfix
def test_setUp(self, tol=0.02):
        # assumes working directory is diamond/
        folder = "diamond/integration_tests/logistic"

        simulated_data_loc = "%s/simulated_logistic_df.csv" % folder
        estimated_covariance_loc = "%s/simulated_logistic_covariance.csv" % folder
        resources_exist = os.path.exists(simulated_data_loc) and os.path.exists(estimated_covariance_loc)
        if not resources_exist:
            logging.info("Simulating data and estimating covariances in R")
            os.system("/usr/local/bin/Rscript %s/logistic_generate_and_fit.R" % folder)
        logging.info("Reading in training data and R::lme4-estimated covariance matrix")
        df_train = pd.read_csv(simulated_data_loc)
        df_estimated_covariance = pd.read_csv(estimated_covariance_loc)

        self.model = LogisticRegression(train_df=df_train,
                                        priors_df=df_estimated_covariance,
                                        copy=True,
                                        test_df=None)
        logging.info("Fitting model in diamond")
        self.formula = "y ~ 1 + x + (1 + x | level)"
        results = self.model.fit(self.formula, tol=1e-4, verbose=True)

        # the format of the coefficient vector is:
        # fixed effects, then [random intercept, random slope] for each level
        beta_hat = np.append(results["fixed_effects"].value.values,
                             pd.melt(results["level"], "level").sort_values(["level", "variable"]).value.values)

        beta_true = pd.read_csv("%s/simulated_logistic_true_parameters.csv" % folder)["x"].values
        rel_error = np.mean((beta_hat - beta_true) ** 2) / np.mean(abs(beta_true))
        if rel_error > tol:
            logging.warn("relative error = %f > tolerance = %f" % (rel_error, tol))
        else:
            logging.info("relative error = %f < tolerance = %f" % (rel_error, tol))
        # make sure the coefficients are very close
        self.assertTrue(rel_error < tol)
Project: statscraper    Author: jplusplus
def _clean_data(self, df, year, month):
        df = df.dropna(how='all', axis=1)
        df = df.dropna(how='all', axis=0)
        df = df.drop('Totalsumma', axis=1)
        df = df.rename(columns={'Unnamed: 1': 'vehicle_type'})
        df = df[df['vehicle_type'] != 'Totalsumma']
        df.loc[:, 'year'] = year
        df.loc[:, 'month'] = month
        df = pd.melt(df,
                     id_vars=['vehicle_type', 'month', 'year'],
                     value_vars=['AVREGISTRERAD', 'AVSTÄLLD', 'ITRAFIK'],
                     var_name='status')
        return df
Project: Comparative-Annotation-Toolkit    Author: ComparativeGenomicsToolkit
def tm_metrics_plot(tm_metrics, ordered_genomes, biotypes, transcript_biotype_map, tm_coverage_tgt, tm_identity_tgt):
    """plots for transMap coverage, identity"""
    tm_iter = zip(*[['transMap Coverage', 'transMap Identity'],
                    [tm_coverage_tgt, tm_identity_tgt]])
    for mode, tgt in tm_iter:
        df = dict_to_df_with_biotype(tm_metrics[mode], transcript_biotype_map)
        df = pd.melt(df, id_vars='biotype', value_vars=ordered_genomes).dropna()
        df.columns = ['biotype', 'genome', mode]
        cov_ident_plot(biotypes, ordered_genomes, mode, tgt, df, x=mode, y='genome')
Project: Comparative-Annotation-Toolkit    Author: ComparativeGenomicsToolkit
def consensus_support_plot(consensus_data, ordered_genomes, biotypes, modes, title, tgt):
    """grouped violin plots of original intron / intron annotation / exon annotation support"""
    def adjust_plot(g, this_title):
        g.set_xticklabels(rotation=90)
        g.fig.suptitle(this_title)
        g.fig.subplots_adjust(top=0.9)
        for ax in g.axes.flat:
            ax.set_ylabel('Percent supported')
            ax.set_ylim(-1, 101)

    dfs = []
    for i, mode in enumerate(modes):
        df = json_to_df_with_biotype(consensus_data, mode)
        if i > 0:
            df = df[mode]
        dfs.append(df)
    df = pd.concat(dfs, axis=1)
    df = pd.melt(df, value_vars=modes, id_vars=['genome', 'biotype'])
    with tgt.open('w') as outf, PdfPages(outf) as pdf:
        if len(ordered_genomes) > 1:
            g = sns.factorplot(data=df, y='value', x='genome', col='variable', col_wrap=2, kind='violin', sharex=True,
                               sharey=True, row_order=ordered_genomes, cut=0)
        else:
            g = sns.factorplot(data=df, y='value', x='variable', kind='violin', sharex=True,
                               sharey=True, row_order=ordered_genomes, cut=0)
        adjust_plot(g, title)
        multipage_close(pdf, tight_layout=False)
        title += ' for {}'
        for biotype in biotypes:
            this_title = title.format(biotype)
            biotype_df = biotype_filter(df, biotype)
            if biotype_df is not None:
                if len(ordered_genomes) > 1:
                    g = sns.factorplot(data=biotype_df, y='value', x='genome', col='variable', col_wrap=2,
                                       kind='violin', sharex=True, sharey=True, row_order=ordered_genomes, cut=0)
                else:
                    g = sns.factorplot(data=biotype_df, y='value', x='variable', kind='violin', sharex=True,
                                       sharey=True, row_order=ordered_genomes, cut=0)
                adjust_plot(g, this_title)
                multipage_close(pdf, tight_layout=False)
Project: Comparative-Annotation-Toolkit    Author: ComparativeGenomicsToolkit
def tx_modes_plot(consensus_data, ordered_genomes, tx_mode_plot_tgt):
    ordered_groups = ['transMap', 'transMap+TM', 'transMap+TMR', 'transMap+TM+TMR', 'TM', 'TMR', 'TM+TMR', 'CGP', 'PB',
                      'Other']
    ordered_groups = OrderedDict([[frozenset(x.split('+')), x] for x in ordered_groups])

    def split_fn(s):
        return ordered_groups.get(frozenset(s['Transcript Modes'].replace('aug', '').split(',')), 'Other')

    modes_df = json_biotype_counter_to_df(consensus_data, 'Transcript Modes')
    df = modes_df.pivot(index='genome', columns='Transcript Modes').transpose().reset_index()
    df['Modes'] = df.apply(split_fn, axis=1)
    df = df[['Modes'] + ordered_genomes]
    ordered_values = [x for x in ordered_groups.itervalues() if x in set(df['Modes'])]
    with tx_mode_plot_tgt.open('w') as outf, PdfPages(outf) as pdf:
        title_string = 'Transcript modes in protein coding consensus gene set'
        ylabel = 'Number of transcripts'
        if len(ordered_genomes) > 1:
            df['Ordered Modes'] = pd.Categorical(df['Modes'], ordered_values, ordered=True)
            df = df.sort_values('Ordered Modes')
            df = df[['Ordered Modes'] + ordered_genomes].set_index('Ordered Modes')
            df = df.fillna(0)
            generic_stacked_barplot(df, pdf, title_string, df.index, ylabel, ordered_genomes, 'Transcript mode(s)',
                                    bbox_to_anchor=(1.25, 0.7))

        else:
            generic_barplot(pd.melt(df, id_vars='Modes'), pdf, 'Transcript mode(s)', ylabel, title_string, x='Modes',
                            y='value', order=ordered_values)
Project: Comparative-Annotation-Toolkit    Author: ComparativeGenomicsToolkit
def indel_plot(consensus_data, ordered_genomes, indel_plot_tgt):
    with indel_plot_tgt.open('w') as outf, PdfPages(outf) as pdf:
        tm_df = pd.concat([pd.DataFrame.from_dict(consensus_data[genome]['transMap Indels'], orient='index').T
                           for genome in ordered_genomes])
        tm_df['genome'] = ordered_genomes
        tm_df['transcript set'] = ['transMap'] * len(tm_df)
        consensus_df = pd.concat([pd.DataFrame.from_dict(consensus_data[genome]['Consensus Indels'], orient='index').T
                                  for genome in ordered_genomes])
        consensus_df['genome'] = ordered_genomes
        consensus_df['transcript set'] = ['Consensus'] * len(consensus_df)
        df = pd.concat([consensus_df, tm_df])
        df = pd.melt(df, id_vars=['genome', 'transcript set'],
                     value_vars=['CodingDeletion', 'CodingInsertion', 'CodingMult3Indel'])
        df.columns = ['Genome', 'Transcript set', 'Type', 'Percent of transcripts']
        g = sns.factorplot(data=df, x='Genome', y='Percent of transcripts', col='Transcript set',
                           hue='Type', kind='bar', row_order=ordered_genomes,
                           col_order=['transMap', 'Consensus'])
        g.set_xticklabels(rotation=90)
        g.fig.subplots_adjust(top=.8)
        g.fig.suptitle('Coding indels')
        multipage_close(pdf, tight_layout=False)
Project: deepcpg    Author: cangermueller
def plot_lc(lc, metrics=None, outputs=False):
    lc = pd.melt(lc, id_vars=['split', 'epoch'], var_name='output')
    if metrics:
        if not isinstance(metrics, list):
            metrics = [metrics]
        tmp = '(%s)' % ('|'.join(metrics))
        lc = lc.loc[lc.output.str.contains(tmp)]
    metrics = lc.output[~lc.output.str.contains('_')].unique()
    lc['metric'] = ''

    for metric in metrics:
        lc.loc[lc.output.str.contains(metric), 'metric'] = metric
        lc.loc[lc.output == metric, 'output'] = 'mean'
        lc.output = lc.output.str.replace('_%s' % metric, '')
        lc.output = lc.output.str.replace('cpg_', '')

    if outputs:
        lc = lc.loc[lc.output != 'mean']
    else:
        lc = lc.loc[lc.output == 'mean']

    grid = sns.FacetGrid(lc, col='split', row='metric', hue='output',
                         sharey=False, size=3, aspect=1.2, legend_out=True)
    grid.map(mpl.pyplot.plot, 'epoch', 'value', linewidth=2)
    grid.set(ylabel='')
    grid.add_legend()
    return grid
Project: deepcpg    Author: cangermueller
def plot_stats(stats):
    stats = stats.sort_values('frac_obs', ascending=False)
    stats = pd.melt(stats, id_vars=['output'], var_name='metric')
    #  stats = stats.loc[stats.metric.isin(['frac_obs', 'frac_one'])]
    #  stats.metric = stats.metric.str.replace('frac_obs', 'cov')
    #  stats.metric = stats.metric.str.replace('frac_one', 'met')
    grid = sns.FacetGrid(data=stats, col='metric', sharex=False)
    grid.map(sns.barplot, 'value', 'output')
    for ax in grid.axes.ravel():
        ax.set(xlabel='', ylabel='')
    return grid
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_default_col_names(self):
        result = melt(self.df)
        self.assertEqual(result.columns.tolist(), ['variable', 'value'])

        result1 = melt(self.df, id_vars=['id1'])
        self.assertEqual(result1.columns.tolist(), ['id1', 'variable', 'value'
                                                    ])

        result2 = melt(self.df, id_vars=['id1', 'id2'])
        self.assertEqual(result2.columns.tolist(), ['id1', 'id2', 'variable',
                                                    'value'])
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_value_vars(self):
        result3 = melt(self.df, id_vars=['id1', 'id2'], value_vars='A')
        self.assertEqual(len(result3), 10)

        result4 = melt(self.df, id_vars=['id1', 'id2'], value_vars=['A', 'B'])
        expected4 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                               'id2': self.df['id2'].tolist() * 2,
                               'variable': ['A'] * 10 + ['B'] * 10,
                               'value': (self.df['A'].tolist() +
                                         self.df['B'].tolist())},
                              columns=['id1', 'id2', 'variable', 'value'])
        tm.assert_frame_equal(result4, expected4)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_custom_value_name(self):
        result10 = melt(self.df, value_name=self.value_name)
        self.assertEqual(result10.columns.tolist(), ['variable', 'val'])

        result11 = melt(self.df, id_vars=['id1'], value_name=self.value_name)
        self.assertEqual(result11.columns.tolist(), ['id1', 'variable', 'val'])

        result12 = melt(self.df, id_vars=['id1', 'id2'],
                        value_name=self.value_name)
        self.assertEqual(result12.columns.tolist(), ['id1', 'id2', 'variable',
                                                     'val'])

        result13 = melt(self.df, id_vars=['id1', 'id2'], value_vars='A',
                        value_name=self.value_name)
        self.assertEqual(result13.columns.tolist(), ['id1', 'id2', 'variable',
                                                     'val'])

        result14 = melt(self.df, id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                        value_name=self.value_name)
        expected14 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                                'id2': self.df['id2'].tolist() * 2,
                                'variable': ['A'] * 10 + ['B'] * 10,
                                self.value_name: (self.df['A'].tolist() +
                                                  self.df['B'].tolist())},
                               columns=['id1', 'id2', 'variable',
                                        self.value_name])
        tm.assert_frame_equal(result14, expected14)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_custom_var_and_value_name(self):

        result15 = melt(self.df, var_name=self.var_name,
                        value_name=self.value_name)
        self.assertEqual(result15.columns.tolist(), ['var', 'val'])

        result16 = melt(self.df, id_vars=['id1'], var_name=self.var_name,
                        value_name=self.value_name)
        self.assertEqual(result16.columns.tolist(), ['id1', 'var', 'val'])

        result17 = melt(self.df, id_vars=['id1', 'id2'],
                        var_name=self.var_name, value_name=self.value_name)
        self.assertEqual(result17.columns.tolist(), ['id1', 'id2', 'var', 'val'
                                                     ])

        result18 = melt(self.df, id_vars=['id1', 'id2'], value_vars='A',
                        var_name=self.var_name, value_name=self.value_name)
        self.assertEqual(result18.columns.tolist(), ['id1', 'id2', 'var', 'val'
                                                     ])

        result19 = melt(self.df, id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                        var_name=self.var_name, value_name=self.value_name)
        expected19 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                                'id2': self.df['id2'].tolist() * 2,
                                self.var_name: ['A'] * 10 + ['B'] * 10,
                                self.value_name: (self.df['A'].tolist() +
                                                  self.df['B'].tolist())},
                               columns=['id1', 'id2', self.var_name,
                                        self.value_name])
        tm.assert_frame_equal(result19, expected19)

        df20 = self.df.copy()
        df20.columns.name = 'foo'
        result20 = melt(df20)
        self.assertEqual(result20.columns.tolist(), ['foo', 'value'])
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_multiindex(self):
        res = pd.melt(self.df1)
        self.assertEqual(res.columns.tolist(), ['CAP', 'low', 'value'])
Project: microbiomeHD    Author: cduvallet
def pvals_to_long(pvals):
    """
    Given dataframe with signed p-values, convert to longform
    with columns: otu, study, direction, pval, sample_size.

    Parameters
    ----------
    pvals : pandas DataFrame
        Genera in rows, studies in columns, signed pvalues in values.
        Positive indicates higher in disease, negatives is higher in healthy.

    Returns
    -------
    longpvals : pandas DataFrame
        Tidy dataframe with columns otu, study, direction, and pval (for that
        direction)
    """
    pvals.index.name = 'otu'
    pvals = pvals.reset_index()
    longpvals = pd.melt(pvals, id_vars='otu', var_name='dataset',
                        value_name='signed_qvalue').dropna()

    # Convert all p-values to health-associated pvalue
    # Original p-values were calculated from KW test, making them two-sided.
    # If the pvalue is negative, then abs(p)/2 is the health-associated pval.
    # If the pvalue is positive, then 1 - abs(p)/2 is the health-associated
    # pvalue.
    p_to_healthy = lambda x: abs(x)/2.0 if x <= 0  else 1-abs(x)/2.0
    longpvals['q'] = longpvals['signed_qvalue'].map(p_to_healthy)
    longpvals['direction'] = 'healthy'

    # Now add the disease-associated qvalues
    disqs = copy.deepcopy(longpvals)
    disqs['direction'] = 'disease'
    disqs['q'] = 1 - disqs['q']

    longpvals = pd.concat((longpvals, disqs))

    return longpvals
Project: bubble_plot    Author: shirmeir
def plot_without_z(df, x, y, z, count_table, bins_x, bins_y, x_is_numeric, y_is_numeric, ordered_x_values, ordered_y_values, normalization_by_all=False, log=False, maximal_bubble_size=4000):
    if normalization_by_all:
        count_table /= count_table.sum().sum()
    else:
        count_table = count_table.transpose()
        for col in count_table.columns:
            count_table[col] /= count_table[col].sum()
        count_table = count_table.transpose()
    if log:
        count_table = np.log(count_table)
        maximal_bubble_size /= 2
    size_factor = maximal_bubble_size/count_table.max().max()
    count_table_long = pd.melt(count_table.reset_index(), id_vars=x)
    x_values_dict = {x:i for i, x in enumerate(ordered_x_values)} \
        if not x_is_numeric else {xx:get_point(xx) for xx in ordered_x_values}
    y_values_dict = {x:i for i, x in enumerate(ordered_y_values)} \
        if not y_is_numeric else {xx: get_point(xx) for xx in ordered_y_values}
    xticks = np.arange(count_table.shape[0]) if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticks = np.arange(count_table.shape[1]) if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    xticklabels = ordered_x_values if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticklabels = ordered_y_values if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    count_table_long[x] = count_table_long[x].map(x_values_dict)
    count_table_long[y] = count_table_long[y].map(y_values_dict) 
    plt.scatter(count_table_long[x], count_table_long[y], s=size_factor*count_table_long['value'],
                c=count_table_long['value'], cmap='cool')

    return count_table_long, xticks, yticks, xticklabels, yticklabels
Project: Waskom_PNAS_2017    Author: WagnerLabPapers
def plot_points(df, axes):

    for exp, ax in zip(["dots", "sticks", "rest"], axes):

        exp_df = pd.melt(df.query("exp == @exp"),
                         "subj", ["within", "between"], "test", "corr")

        sns.pointplot(x="test", y="corr", hue="test", data=exp_df,
                      dodge=.5, join=False, ci=95,
                      palette=[".15", ".5"], ax=ax)
        plt.setp(ax.lines, linewidth=2)

        sns.pointplot(x="test", y="corr", hue="subj", data=exp_df,
                      palette=[".75"], scale=.75, ax=ax)
        plt.setp(ax.collections[:], facecolor="w", zorder=20)

        ax.legend_ = None
        ax.set(ylabel="",
               xlabel="",
               xticks=[-.1, 1.1],
               xticklabels=["Same\ncontext", "Different\ncontext"])

    axes[0].set(ylim=(0, .105), ylabel="Timeseries correlation (r)")
    axes[1].set(ylim=(0, .0525))
    axes[2].set(ylim=(0, .0525))

    for ax in axes:
        sns.despine(ax=ax, trim=True)
Project: kotori    Author: daq-tools
def dataframe_wide_to_long_indexed(df, column):
    """
    Convert DataFrame from wide to long format using specified column as index column,
    followed by indexing the DataFrame on the very same column and finally sorting it.

    See also:

    - http://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-by-melt
    - http://stackoverflow.com/questions/17688155/complicated-for-me-reshaping-from-wide-to-long-in-pandas
    """
    df = pandas.melt(df, id_vars=column).dropna()
    df = dataframe_index_and_sort(df, column)
    return df
Project: sgrsea    Author: bchen4
def generatefinaltable(resultdic, totaldic, lib, dfile):
  '''
    merge df based on sublib  
  ''' 
  #Generate count table
  count_df = pd.DataFrame()
  for sublibname, c_df in resultdic.items():
    sublib = lib[sublibname]
    df = sublib.merge(c_df,on='Sequence',how='left')
    df = df.fillna(0)
    count_df = count_df.append(df)
  #Generate summary table
  count_columns = count_df.columns.tolist()
  count_columns.insert(0,count_columns.pop(count_columns.index('sgRNA')))
  count_columns.insert(1,count_columns.pop(count_columns.index('Gene')))
  count_columns.insert(2,count_columns.pop(count_columns.index('Sequence')))
  count_columns.insert(3,count_columns.pop(count_columns.index('sublib')))
  count_df = count_df.loc[:,count_columns]
  mapped_total = count_df.iloc[:,3:].groupby("sublib").sum().reset_index()
  mapped_total_df = pd.melt(mapped_total, id_vars=['sublib'],var_name=['sample'],value_name='mapped_reads')
  totalread_df = pd.DataFrame(totaldic.items(),columns=["filepath","total_reads"])
  if isinstance(dfile,pd.DataFrame):
    summary_df = dfile.merge(totalread_df,on="filepath")
    summary_df = summary_df.merge(mapped_total_df,on=['sublib','sample'])
  else:#single file
    summary_df = totalread_df
    summary_df = summary_df.join(mapped_total_df)
  summary_df['mapping_ratio'] = summary_df['mapped_reads']/summary_df['total_reads']
  summary_df = summary_df.loc[:,['filepath','sample','sublib','total_reads','mapped_reads','mapping_ratio']]
  return (count_df, summary_df)
Project: fitbit-analyzer    Author: 5agado
def _prepareYearAndMonthStats(stats, columns):
    # Group by month and change stats from columns to row attribute
    dataToPlot = stats.groupby(stats['date'].dt.to_period("M")).mean()
    dataToPlot = pd.melt(dataToPlot.reset_index(), id_vars=['date'], value_vars=columns,
                         var_name='stats', value_name='val')
    # Rename stats
    dataToPlot['stats'].replace(NAMES, inplace=True)
    return dataToPlot
Project: Black-Swan    Author: 12190143
def sarimax_predict():

    model, prediction = {}, {}
    data = ['T1D0'] #, 'T1D1', 'T2D0', 'T3D0', 'T3D1'
    train_data = pd.read_csv('sarimax_data.csv',index_col=0)
    in_model_pkl = 'SARIMAX_6_0_1_1_0_1_72_%s.pkl'
    in_model_path = '../../data/data_after_process/tmp_file'

    for i in data:
        model[i] = joblib.load(path.join(in_model_path, in_model_pkl % (i)))
        # print results[td].summary()
        print i + ' model start predicting!'
        prediction[i] = model[i].predict(0, len(train_data)-1)
        prediction[i] = prediction[i].map(lambda x: np.round(np.exp(x) - 1, 2))

    answer = pd.DataFrame(prediction)['2016-10-25':]
    answer = pd.concat([
        answer.between_time('17:00', '18:40'), answer.between_time('8:00', '9:40')
    ]).sort_index()
    answer['time_window'] = answer.index.map(lambda x:'['+str(x)+','+str(x+Minute(20))+')')
    answer = pd.melt(
        answer,
        var_name='tollgate_id',
        value_name='volume',
        id_vars=['time_window'])

    answer['direction'] = answer['tollgate_id'].map(lambda d: int(d[3]))
    answer['tollgate_id'] = answer['tollgate_id'].map(lambda d: int(d[1]))
    answer = answer[['tollgate_id','time_window','direction','volume']]

    # import time
    # version = time.strftime('%Y-%m-%d_%R', time.localtime(time.time()))
    # answer.to_csv('answer/prediction_'+version+'.csv',float_format='%.2f',header=True,index=False,encoding='utf-8')
    answer.to_csv('../../answer/prediction_sarimax.csv',float_format='%.2f',header=True,index=False,encoding='utf-8')
Project: dplython    Author: dodger487
def __call__(self, df):
    df_cols = df.columns.values.tolist()
    id_vals = [col._name for col in self.args[2]]
    id_vars = [col for col in df_cols if col not in id_vals]
    key = self.args[0]
    value = self.args[1]
    return pandas.melt(df, id_vars, id_vals, key, value)
Project: crop-seq    Author: epigen
def inspect_bulk(df, df_bulk, de_genes, de_genes_bulk):
    """
    """
    quant_types = [("bitseq", df_bulk)]

    for quant_type, exp_matrix in quant_types:
        print(quant_type)

        # Boxplots of expression
        fig, axis = plt.subplots(1)
        sns.boxplot(data=pd.melt(exp_matrix), x="grna", y="value", hue="condition", ax=axis)
        fig.savefig(os.path.join("results", "bulk", "bulk_samples.qc.{}.expression_boxplots.png".format(quant_type)), dpi=300, bbox_inches="tight")

        # Heatmap and correlation on signature genes
        # derived from bulk
        # derived from scRNA
        for geneset in ["de_genes", "de_genes_bulk"]:
            g = sns.clustermap(
                exp_matrix.ix[eval(geneset)].dropna(),
                z_score=0,
                row_cluster=True, col_cluster=True,
                xticklabels=True, yticklabels=True,
                figsize=(15, 15))
            for item in g.ax_heatmap.get_yticklabels():
                item.set_rotation(0)
            for item in g.ax_heatmap.get_xticklabels():
                item.set_rotation(90)
            g.fig.savefig(os.path.join("results", "bulk", "bulk_samples.qc.{}.{}.png".format(quant_type, geneset)), dpi=300, bbox_inches="tight")

            g = sns.clustermap(
                exp_matrix.ix[eval(geneset)].dropna().corr(),
                row_cluster=True, col_cluster=True,
                xticklabels=True, yticklabels=True,
                figsize=(15, 15))
            for item in g.ax_heatmap.get_yticklabels():
                item.set_rotation(0)
            for item in g.ax_heatmap.get_xticklabels():
                item.set_rotation(90)
            g.fig.savefig(os.path.join("results", "bulk", "bulk_samples.qc.{}.{}.correlation.png".format(quant_type, geneset)), dpi=300, bbox_inches="tight")
Project: diamond    Author: stitchfix
def _create_inter_design(self, g, **kwargs):
        r"""
        Create random effects design matrix for grouping factor g
        This is straightforward when you create the matrix using the training
            DataFrame
        But a new DataFrame can have new levels of g which did not exist in
            training DF
        For these levels, the random coefficients are set to zero
        But as a practical matter, it's easier to zero out the values of the
            predictors
        here than it is to modify the fitted coefficient vector
        Args:
            g (string): grouping factor to create design matrix
        Keyword Args:
            * *df* (``DataFrame``). specify a new dataframe to create
                design matrix from
        Returns:
            array_like : design matrix in sparse CSR format
        """
        idx = g + '_idx'

        df = kwargs.get('df', self.train_df)
        df.reset_index(drop=True, inplace=True)
        df['row_index'] = df.index
        if 'intercept' in self.groupings[g]:
            df['intercept'] = 1.0

        id_cols = [g, 'row_index']

        # level_maps has levels of g and an index for each level
        melted_inter = pd.melt(df[id_cols + self.groupings[g]], id_cols).merge(
            self.level_maps[g], how='left', on=g).merge(
            self.inter_maps[g], how='inner', on='variable')
        # inter_maps has variables in formula for this grouping factor,
        # plus indexes
        # because of the above left join, some idx values are NULL
        # but we need to keep the row indexes
        nrows = max(melted_inter.row_index) + 1
        # now drop the null column index
        melted_inter.dropna(inplace=True)

        melted_inter.sort_values(by=[g, idx], inplace=True)
        melted_inter['col_index'] = melted_inter['inter_idx'] + \
            melted_inter[idx] * len(self.groupings[g])

        row = melted_inter.row_index
        col = melted_inter.col_index
        data = melted_inter.value

        if g in self.grouping_designs.keys():
            # this means training matrix was already created for this group
            # reuse the same shape: indices are lined up,
            # everything else will be 0 b/c of sparse matrix
            ncols = self.grouping_designs[g].shape[1]
        else:
            ncols = max(col) + 1

        return sparse.coo_matrix((data, (row, col)),
                                 shape=(nrows, ncols)).tocsr()
Project: diamond    Author: stitchfix
def test_setUp(self, tol=0.02):
        # assumes working directory is diamond/
        folder = "diamond/integration_tests/clogistic"
        simulated_data_loc = "%s/simulated_clogistic_df.csv" % folder
        estimated_covariance_loc = "%s/simulated_clogistic_covariance.csv" % folder
        resources_exist = os.path.exists(simulated_data_loc) and os.path.exists(estimated_covariance_loc)
        if not resources_exist:
            logging.info("Simulating data and estimating covariances in R")
            os.system("/usr/local/bin/Rscript %s/clogistic_generate_and_fit.R" % folder)
        logging.info("Reading in training data and R::ordinal-estimated covariance matrix")

        df_train = pd.read_csv(simulated_data_loc)
        df_estimated_covariance = pd.read_csv(estimated_covariance_loc)

        self.formula = "y ~ x + (1 + x | level)"

        self.model = CumulativeLogisticRegression(train_df=df_train,
                                                  priors_df=df_estimated_covariance,
                                                  copy=True,
                                                  test_df=None)
        logging.info("Fitting model in diamond")
        results = self.model.fit(self.formula, tol=1e-3, max_its=5, verbose=True)

        # the format of the coefficient vector is:
        # fixed effects, then [random intercept, random slope] for each level
        beta_hat = np.append(results["main"]["main_value"].values,
                             pd.melt(results["level"], "level").sort_values(["level", "variable"]).value.values)

        # drop the 0 value at the head of beta_true
        # this is a placeholder, which reflects that there is no fixed intercept in this model
        beta_true = pd.read_csv("%s/simulated_clogistic_true_parameters.csv" % folder)["x"].values[1:]
        rel_error = np.mean((beta_hat - beta_true) ** 2) / np.mean(abs(beta_true))
        if rel_error > tol:
            logging.warn("relative error of coefs = %f > tolerance = %f" % (rel_error, tol))
        else:
            logging.info("relative error of coefs = %f < tolerance = %f" % (rel_error, tol))
        # make sure the coefficients are very close
        self.assertTrue(rel_error < tol)

        # check intercepts, too
        alpha_true = pd.read_csv("%s/simulated_clogistic_true_intercepts.csv" % folder).ix[1:3, "x"].values
        alpha_hat = results["intercepts"]
        rel_error_alpha = np.mean((alpha_hat - alpha_true) ** 2) / np.mean(abs(alpha_true))

        if rel_error_alpha > tol:
            logging.warn("relative error of intercepts = %f > tolerance = %f" % (rel_error_alpha, tol))
        else:
            logging.info("relative error of intercepts = %f < tolerance = %f" % (rel_error_alpha, tol))
        self.assertTrue(rel_error_alpha < tol)
Project: microbiomeHD    Author: cduvallet
def reproducibility_from_fisher(disdf, samplesizes, qthresh):
    """
    Returns the number of 'reproducible' OTUs based on weighted Fisher's method.
    Note: if I ever want to actually use these Fisher p-values, I could
    break up this function to return the `metap` dataframe

    Parameters
    ----------
    disdf : pandas dataframe
        genera in rows, datasets in columns, signed q-values in values
    samplesizes : pandas dataframe
        datasets in rows and at least column 'total' with total number
        of samples in each dataset, to use as weight for Stouffer's method
    qthresh : float
        threshold for calling a fisher meta-q value "significant"

    Returns
    -------
    n_sig : int
        total number of genera significant via fisher's method
    """

    ## Turn disdf into tidy dataframe
    longpvals = copy.deepcopy(disdf)
    longpvals['otu'] = longpvals.index
    longpvals = pd.melt(longpvals, id_vars='otu',
                        value_name='p', var_name='study')

    ## Convert two-tailed signed p-values into one-tailed pvalues
    longpvals = convert_to_one_tailed(longpvals).dropna()
    longpvals = pd.melt(longpvals, id_vars=['otu', 'study'],
                        value_vars=['p-dis', 'p-h'], var_name='pval_direction')

    ## Add sample size for each study
    longpvals['sample_size'] = \
        longpvals.apply(lambda row: samplesizes.loc[row['study'], 'total'],
                        axis=1)

    ## Get the combined p-value using weighted stouffer's method
    metap = []
    for grp, subdf in longpvals.groupby(['otu', 'pval_direction']):
        # Only consider genera which are in more than one study
        if subdf.shape[0] > 1:
            # grp is the tuple that defines the group: (otu, direction)
            direction = grp[1]
            otu = grp[0]
            numstudies = subdf.shape[0]
            # Stouffer's weight z-score test
            z, p = combine_pvalues(subdf['value'].astype(float),
                                   method='stouffer',
                                   weights=subdf['sample_size'].apply(np.sqrt))
            metap.append([otu, direction, z, p, numstudies])
    metap = pd.DataFrame(metap, columns=['otu', 'direction', 'z', 'p', 'num_studies'])

    ## Count number of significant healthy and disease bugs
    # Note that from manual inspection, it doesn't look like any genera
    # are returned as significant in both directions from this method...
    sig_h = metap.query('direction == "p-h"').query('p < @qthresh')
    sig_dis = metap.query('direction == "p-dis"').query('p < @qthresh')

    return sig_h.shape[0] + sig_dis.shape[0]
Project: microbiomeHD    Author: cduvallet
def count_sig(allresults, qthresh=0.05):
    """
    Count how often bacteria are significant in each disease.

    Parameters
    ---------
    allresults : pandas dataframe
        datasets in columns, genera in rows, signed q-values in matrix (signed according to effect direction)
    qthresh : float
        significance threshold

    Returns
    -------
    meta_counts : pandas DataFrame
        Dataframe indicating how often each genus is significant in each
        diseases. Has columns ['otu', 'disease', 'significant',
        'num_times_sig', 'genus'], where:
            'significant' is [-1, 1]
            'num_time_sigs' is the number of times each otu is significant
                in each disease/direction combination
            'otu' is the full OTU name, 'genus' is just the genus
    """
    def thresh_map(x):
        # Note: in upstream steps, pvalues of 0 were converted to 1e-20
        # if x is zero because the effect was zero, np.sign(x) returns 0. So we're good.
        if abs(x) <= qthresh:
            return np.sign(x)
        elif abs(x) > qthresh:
            return 0
        else:
            return x

    meta_results = allresults.applymap(thresh_map)
    meta_results['otu'] = meta_results.index
    meta_results = pd.melt(meta_results, id_vars='otu',
                           var_name='dataset', value_name='significant')

    # Replace edd_singh with cdi_singh
    meta_results = meta_results\
        .replace('edd_singh', 'cdi_singh')\
        .replace('noncdi_schubert', 'cdi_schubert2')
    meta_results['disease'] = meta_results['dataset']\
        .apply(lambda x: x.split('_')[0])
    # Drop rows with either nan or 0 in 'significant' column (i.e. not significant, no effect)
    meta_results = meta_results.dropna()
    meta_results = meta_results.loc[meta_results['significant'].isin([-1,1])]

    # Get number of times each OTU is significant in each disease
    # (for each direction)
    meta_counts = meta_results.groupby(['otu', 'disease', 'significant']).size()
    meta_counts.name = 'num_times_sig'
    meta_counts = meta_counts.reset_index()
    meta_counts['genus'] = meta_counts['otu'].apply(lambda x: x.split(';')[-1])

    return meta_counts
Project: powerplantmatching    Author: FRESNA
def IRENA_stats():
    """
    Reads the IRENA Capacity Statistics 2017 Database
    """
    # Read the raw dataset
    df = pd.read_csv(_data_in('IRENA_CapacityStatistics2017.csv'), encoding='utf-8')
    # "Unpivot"
    df = pd.melt(df, id_vars=['Indicator', 'Technology', 'Country'], var_name='Year',
                 value_vars=[unicode(i) for i in range(2000,2017,1)], value_name='Capacity')
    # Drop empty
    df.dropna(axis=0, subset=['Capacity'], inplace=True)
    # Drop generations
    df = df[df.Indicator=='Electricity capacity (MW)']
    df.drop('Indicator', axis=1, inplace=True)
    # Drop countries out of scope
    df.Country.replace({'Czechia':u'Czech Republic',
                        'UK':u'United Kingdom'}, inplace=True)
    df = df[df.Country.isin(europeancountries())]
    # Convert to numeric
    df.Year = df.Year.astype(int)
    df.Capacity = df.Capacity.str.strip().str.replace(' ','').astype(float)
    # Handle Fueltypes and Technologies
    d = {u'Bagasse':'Bioenergy',
         u'Biogas':'Bioenergy',
         u'Concentrated solar power':'Solar',
         u'Geothermal':'Geothermal',
         u'Hydro 1-10 MW':'Hydro',
         u'Hydro 10+ MW':'Hydro',
         u'Hydro <1 MW':'Hydro',
         u'Liquid biofuels':'Bioenergy',
         u'Marine':'Hydro',
         u'Mixed and pumped storage':'Hydro',
         u'Offshore wind energy':'Wind',
         u'Onshore wind energy':'Wind',
         u'Other solid biofuels':'Bioenergy',
         u'Renewable municipal waste':'Bioenergy',
         u'Solar photovoltaic':'Solar'}
    df.loc[:,'Fueltype'] = df.Technology.map(d)
    d = {u'Concentrated solar power':'CSP',
         u'Solar photovoltaic':'PV',
         u'Onshore wind energy':'Onshore',
         u'Offshore wind energy':'Offshore'}
    df.Technology.replace(d, inplace=True)
    df.loc[:,'Set'] = 'PP'
    return df.reset_index(drop=True)
Project: SimMod    Author: hausfath
def run_simmod(run_start_year, run_end_year, dt, rcp, c_sens = c_sens, add_start = 0, 
               add_end = 0, c_add = 0, ch4_add = 0, n2o_add = 0):
    """
    Run the various parts of SimMod and export images and CSV files.
    """
    run_years = (run_end_year - run_start_year + 1)
    emission_vals = emissions(run_start_year, run_end_year, dt, rcp, 
                              add_start, add_end, c_add, ch4_add, n2o_add)
    conc = pulse_decay_runner(run_years, dt, emission_vals)

    if carbon_model == 'BEAM':
        beam._initial_carbon = np.array([596., 713., 35625.])
        beam.intervals = SUBSTEPS
        beam.time_step = dt
        beam.emissions = emission_vals['co2_pg'] / C_TO_CO2
        beam_results = pd.melt(beam.run()[0:1])
        conc['co2_ppm'] = beam_results['value'] * PGC_TO_MOL * 1e6 / MOLES_IN_ATMOSPHERE

    if carbon_model == 'box diffusion':
        box_diffusion_results = box_diffusion_model(
            emission_vals, 
            dt, 
            DZ, 
            MIXING
        )
        conc['co2_ppm'] = box_diffusion_results['co2ppm']

    if normalize_2000_conc == True:
        conc['co2_ppm'] = (
            conc['co2_ppm'] - 
            conc.loc[conc['year'] == 2000, 'co2_ppm'].min() +
            emission_vals.loc[emission_vals['year'] == 2000, 'rcp_co2_ppm'].min()
        )
        conc['ch4_ppb'] = (
            conc['ch4_ppb'] - 
            conc.loc[conc['year'] == 2000, 'ch4_ppb'].min() +
            emission_vals.loc[emission_vals['year'] == 2000, 'rcp_ch4_ppb'].min()
        )
        conc['n2o_ppb'] = (
            conc['n2o_ppb'] - 
            conc.loc[conc['year'] == 2000, 'n2o_ppb'].min() +
            emission_vals.loc[emission_vals['year'] == 2000, 'rcp_n2o_ppb'].min()
        )

    forcing = calc_radiative_forcing(conc)
    warming = continuous_diffusion_model(forcing, run_years, dt, c_sens)
    return warming
Project: flexmatcher    Author: biggorilla-gh
def create_training_data(self, dataframes, mappings, sample_size):
        """Transform dataframes and mappings into training data.

        The method uses the names of columns as well as the data under each
        column as its training data. It also replaces missing values with 'NA'.

        Args:
            dataframes (list): List of dataframes to train on.
            mapping (list): List of dictionaries mapping columns of dataframes
                to columns in the mediated schema.
            sample_size (int): The number of rows sampled from each dataframe
                for training.
        """
        train_data_list = []
        col_train_data_list = []
        for (datafr, mapping) in zip(dataframes, mappings):
            sampled_rows = datafr.sample(min(sample_size, datafr.shape[0]))
            sampled_data = pd.melt(sampled_rows)
            sampled_data.columns = ['name', 'value']
            sampled_data['class'] = \
                sampled_data.apply(lambda row: mapping[row['name']], axis=1)
            train_data_list.append(sampled_data)
            col_data = pd.DataFrame(datafr.columns)
            col_data.columns = ['name']
            col_data['value'] = col_data['name']
            col_data['class'] = \
                col_data.apply(lambda row: mapping[row['name']], axis=1)
            col_train_data_list.append(col_data)
        train_data = pd.concat(train_data_list, ignore_index=True)
        self.train_data = train_data.fillna('NA')
        self.col_train_data = pd.concat(col_train_data_list, ignore_index=True)
        self.col_train_data = \
            self.col_train_data.drop_duplicates().reset_index(drop=True)
        self.data_src_num = len(dataframes)
        self.columns = \
            sorted(list(set.union(*[set(x.values()) for x in mappings])))
        # removing columns that are not present in the dataframe
        # TODO: this should change (It's not ideal to change problem definition
        # without notifying the user)
        available_columns = []
        for (datafr, mapping) in zip(dataframes, mappings):
            for c in datafr.columns:
                available_columns.append(mapping[c])
        self.columns = sorted(list(set(available_columns)))
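
The pd.melt(sampled_rows) call above is what turns every sampled cell into a (column name, cell value) training pair; with no id_vars, each cell of the sampled frame becomes its own row. A minimal sketch of that reshaping, using made-up column names:

import pandas as pd

sampled_rows = pd.DataFrame({'movie_name': ['Alien', 'Up'], 'year': [1979, 2009]})
sampled_data = pd.melt(sampled_rows)       # no id_vars: every cell becomes a row
sampled_data.columns = ['name', 'value']
print(sampled_data)
#          name  value
# 0  movie_name  Alien
# 1  movie_name     Up
# 2        year   1979
# 3        year   2009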
项目:pygcam    作者:JGCRI    | 项目源码 | 文件源码
def plotForcingSubplots(tsdata, filename=None, ci=95, show_figure=False, save_fig_kwargs=None):
    sns.set_context('paper')
    expList = tsdata['expName'].unique()

    nrows = 1
    ncols = len(expList)
    width  = 2 * ncols
    height = 2
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharey=True, figsize=(width, height))

    def dataForExp(expName):
        df = tsdata.query("expName == '%s'" % expName).copy()
        df.drop(['expName'], axis=1, inplace=True)
        df = pd.melt(df, id_vars=['runId'], var_name='year')
        return df

    for ax, expName in zip(axes, expList):
        df = dataForExp(expName)

        pos = expName.find('-')
        title = expName[:pos] if pos >= 0 else expName
        ax.set_title(title.capitalize())

        tsm.tsplot(df, time='year', unit='runId', value='value', ci=ci, ax=ax)

        ylabel = 'W m$^{-2}$' if ax == axes[0] else ''
        ax.set_ylabel(ylabel)
        ax.set_xlabel('') # no need to say "year"
        ax.axhline(0, color='navy', linewidth=0.5, linestyle='-')
        plt.setp(ax.get_xticklabels(), rotation=270)

    plt.tight_layout()

    # Save the file
    if filename:
        if isinstance(save_fig_kwargs, dict):
            fig.savefig(filename, **save_fig_kwargs)
        else:
            fig.savefig(filename)

    # Display the figure
    if show_figure:
        plt.show()

    return fig
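
The melt inside dataForExp is what lets the many per-year columns be plotted as one long time series keyed by runId. A minimal sketch of that wide-to-long step, with hypothetical run IDs and years:

import pandas as pd

wide = pd.DataFrame({'runId': [1, 2], '2020': [1.9, 2.1], '2030': [2.4, 2.6]})
long_df = pd.melt(wide, id_vars=['runId'], var_name='year')
# columns: runId, year, value -- one row per (run, year) combination
print(long_df)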
项目:fitbit-analyzer    作者:5agado    | 项目源码 | 文件源码
def _plotMonthlyStats(stats, columns, groupBy=True):
    """
    Plot aggregated (mean) stats by month
    :param stats: data to plot
    :param columns: columns from stats to plot
    :param groupBy: whether to first group (and average) the data by month
    """
    dataToPlot = stats.copy()
    # Group by month and rename date column
    if groupBy:
        dataToPlot = dataToPlot.groupby(stats['date'].dt.month).mean()
        dataToPlot = dataToPlot.reset_index().rename(columns={'date': 'month'})

    # change stats from columns to row attribute
    dataToPlot = pd.melt(dataToPlot, id_vars=['month'], value_vars=columns,
                         var_name='stats', value_name='val')
    # Rename stats and weekdays
    dataToPlot['stats'].replace(NAMES, inplace=True)
    dataToPlot['month'].replace(months, inplace=True)
    order = [m for m in monthsOrder if m in dataToPlot['month'].unique()]
    # Plot
    g = sns.factorplot(data=dataToPlot, x="month", y="val", col="stats", order=order, kind="bar", sharey=False)
    g.set_xticklabels(rotation=45)
    g.set(xlabel='')
    return g
    #sns.plt.show()

# def _plotMonthlyStats(stats, columns):
#     """
#     Plot aggregated (mean) stats by month
#     :param stats: data to plot
#     :param columns: columns from stats to plot
#     """
#     MEASURE_NAME = 'month'
#     months={1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug',
#             9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
#     order = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
#     stats[MEASURE_NAME] = stats[MEASURE_NAME].map(months)
#
#     order = [m for m in order if m in stats[MEASURE_NAME].unique()]
#
#     f, axes = getAxes(2,2)
#     for i, c in enumerate(columns):
#         if c in NAMES:
#             c = NAMES[c]
#         g = sns.barplot(x=MEASURE_NAME, y=c, data=stats, order=order, ax=axes[i])
#         g.set_xlabel('')
#     sns.plt.show()
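
In the active _plotMonthlyStats above, pd.melt moves the selected stat columns into a single stats/val pair so seaborn can draw one bar panel per stat. A minimal sketch of that reshape, with made-up stat columns:

import pandas as pd

monthly = pd.DataFrame({'month': ['Jan', 'Feb'],
                        'sleep_hours': [7.1, 6.8],
                        'steps': [8200, 9100]})
dataToPlot = pd.melt(monthly, id_vars=['month'], value_vars=['sleep_hours', 'steps'],
                     var_name='stats', value_name='val')
# rows: (Jan, sleep_hours, 7.1), (Feb, sleep_hours, 6.8), (Jan, steps, 8200), (Feb, steps, 9100)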
项目:crop-seq    作者:epigen    | 项目源码 | 文件源码
def gRNA_swarmplot(s1, s2, prefix=""):
    # Rank of gRNA change
    fig, axis = plt.subplots(3, 2, sharex=True, sharey=True, figsize=(8, 8))
    axis = axis.flatten()

    for i, screen in enumerate(s2.columns[::-1]):
        s = s1.join(s2)  # .fillna(0)
        s = s.iloc[np.random.permutation(len(s))]

        if ("TCR" in screen) or ("Jurkat" in screen) or ("stimulated" in screen) or ("unstimulated" in screen):
            s = s.ix[s.index[~s.index.str.contains("Wnt")]]
            if prefix.startswith("mid_screen-"):
                b = s["gDNA_Jurkat"]
            else:
                b = s["plasmid_pool_TCR"]
            x = s.ix[s.index[s.index.str.contains("Tcr")]]
            y = s.ix[s.index[s.index.str.contains("Essential")]]
            z = s.ix[s.index[s.index.str.contains("CTRL")]]
            b_x = b.ix[s.index[s.index.str.contains("Tcr")]]
            b_y = b.ix[s.index[s.index.str.contains("Essential")]]
            b_z = b.ix[s.index[s.index.str.contains("CTRL")]]
        elif ("WNT" in screen) or ("HEK" in screen):
            s = s.ix[s.index[~s.index.str.contains("Tcr")]]
            if prefix.startswith("mid_screen-"):
                if "_4_" in prefix:
                    b = s["gDNA_HEKclone4"]
                else:
                    b = s["gDNA_HEKclone6"]
            else:
                b = s["plasmid_pool_WNT"]
            x = s.ix[s.index[s.index.str.contains("Wnt")]]
            y = s.ix[s.index[s.index.str.contains("Essential")]]
            z = s.ix[s.index[s.index.str.contains("CTRL")]]
            b_x = b.ix[s.index[s.index.str.contains("Wnt")]]
            b_y = b.ix[s.index[s.index.str.contains("Essential")]]
            b_z = b.ix[s.index[s.index.str.contains("CTRL")]]

        fc_x = np.log2(1 + x[screen]) - np.log2(1 + b_x)
        fc_y = np.log2(1 + y[screen]) - np.log2(1 + b_y)
        fc_z = np.log2(1 + z[screen]) - np.log2(1 + b_z)

        fc_x.name = screen
        fc_y.name = "Essential"
        fc_z.name = "CTRL"

        sns.violinplot(x="variable", y="value", alpha=0.1, inner="box", data=pd.melt(pd.DataFrame([fc_x, fc_y, fc_z]).T), ax=axis[i])
        sns.swarmplot(x="variable", y="value", alpha=0.5, data=pd.melt(pd.DataFrame([fc_x, fc_y, fc_z]).T), ax=axis[i])
        axis[i].axhline(y=0, color='black', linestyle='--', lw=0.5)

        axis[i].set_title(screen)
    sns.despine(fig)
    fig.savefig(os.path.join(results_dir, "gRNA_counts.norm.{}.violin_swarmplot.svg".format(prefix)), bbox_inches="tight")
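
The melt at the plotting step above only serves to get the three fold-change Series into the long x='variable', y='value' layout that violinplot and swarmplot expect. A minimal sketch of that pattern, with random numbers standing in for the fold changes:

import numpy as np
import pandas as pd
import seaborn as sns

fc_x = pd.Series(np.random.randn(50), name='screen')
fc_y = pd.Series(np.random.randn(50), name='Essential')
fc_z = pd.Series(np.random.randn(50), name='CTRL')
long_df = pd.melt(pd.DataFrame([fc_x, fc_y, fc_z]).T)   # columns: variable, value
ax = sns.violinplot(x='variable', y='value', data=long_df)
sns.swarmplot(x='variable', y='value', data=long_df, color='k', size=2, ax=ax)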