Python pandas module: cut() example source code

We extracted the following code examples from open-source Python projects to illustrate how to use pandas.cut().
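Before the project examples, a minimal sketch of the basic call (the values and bin edges below are illustrative):

import pandas as pd

ages = pd.Series([5, 17, 25, 42, 63, 79])
# three half-open bins: (0, 18], (18, 65], (65, 100]
binned = pd.cut(ages, bins=[0, 18, 65, 100], labels=['child', 'adult', 'senior'])
print(binned.value_counts())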

Project: ScoreCardModel    Author: data-science-tools
def transform(self, x):
        """
        Parameters:

            x (Sequence): - the sequence of values to be discretized

        Returns:

            np.array: - the bin label of each value, as a numpy array of strings

        """
        s = pd.cut(x, bins=self.bins)
        d = pd.get_dummies(s)
        z = d.T.to_dict()
        re = []
        for i, v in z.items():
            for j, u in v.items():
                if u == 1:
                    re.append(str(j))
        return np.array(re)
Project: plotnine    Author: has2k1
def test_facet_wrap_expression():
    p = g + facet_wrap('pd.cut(var1, (0, 2, 4), include_lowest=True)')
    assert p == 'facet_wrap_expression'
Project: visualizations    Author: ContentMine
def update(attrname, old, new):
    new_selected, new_x_factors, new_y_factors = get_subset(dictionary_selector.value, dictionary_selector.value)
    bins = np.linspace(new_selected.counts.min(), new_selected.counts.max(), 10) # the number of bin edges must be one more than the number of colors in the palette
    new_selected["color"] = pd.cut(new_selected.counts, bins, labels = list(reversed(palettes.Blues9)), include_lowest=True)
    new_selected["wikidataID"] = new_selected["x"].map(lambda x: wikidataIDs.get(x))

    fig.xaxis.axis_label = dictionary_selector.value
    fig.yaxis.axis_label = dictionary_selector.value
    fig.title.text = "Top %d fact co-occurrences selected" % top_n.value

    src = ColumnDataSource(dict(
        x=new_selected["x"].astype(object),
        y=new_selected["y"].astype(object),
        color=new_selected["color"].astype(object),
        wikidataID=new_selected["wikidataID"],
        counts=new_selected["counts"].astype(int),
        raw=new_selected["raw"].astype(int)))
    source.data.update(src.data)

    fig.x_range.update(factors=new_x_factors[:top_n.value])
    fig.y_range.update(factors=new_y_factors[:top_n.value])
Project: SFBIStats    Author: royludo
def plot_tendencies(word_list, pos_dic, bin_size, output_dir, file_name):
    plt.figure()
    dataframe_list = list()
    for word in word_list:
        if word not in pos_dic:
            raise Exception('Word ' + word + ' not found')
        df = pd.DataFrame(pos_dic[word], columns=['pos'])
        df['bins'] = pd.cut(df['pos'], bins=range(0, 100 + bin_size, bin_size), labels=range(0, 100, bin_size))
        df = df.groupby(['bins'])['bins'].count()
        dataframe_list.append(df)

    df_final = pd.DataFrame(pd.concat(dataframe_list, axis=1)).fillna(0)
    df_final.columns = word_list
    ax = df_final.plot()
    ax.set_xlabel("Position (en % de la longueur de la description)")
    ax.set_ylabel("Nombre d'occurrences")
    plt.title('Position des mots dans les descriptions des offres', y=1.08)
    plt.savefig(os.path.join(output_dir, file_name), bbox_inches='tight')
Project: clchoropleth    Author: slarrain
def discretize(data, bins=5, quantile=False):
    '''
    Creates 'bins' number of bins and discretizes the data.
    Uses cut function by default. qcut function otherwise.
    '''
    if quantile:
        new_data = pd.qcut(data, bins, labels=list(range(bins)))
    else:
        new_data = pd.cut(data, bins, labels=list(range(bins)))
    return new_data
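For intuition, a quick sketch of the two branches on skewed data (the values below are made up):

s = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 100])
print(discretize(s).value_counts())                 # equal-width bins: the outlier drags most rows into bin 0
print(discretize(s, quantile=True).value_counts())  # equal-frequency bins: two rows per bin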
Project: bubble_plot    Author: shirmeir
def plot_with_z(df, x, y, z_boolean, bins_x, bins_y, x_is_numeric, y_is_numeric, ordered_x_values, ordered_y_values, maximal_bubble_size=4000, normalization_by_all=False):
    count_table = pd.concat([pd.cut(df[x], bins=bins_x) if x_is_numeric else df[x],
                         pd.cut(df[y], bins=bins_y) if y_is_numeric else df[y], df[z_boolean]], axis=1)
    count_table = count_table.groupby([x,z_boolean])[y].value_counts().unstack().fillna(0)
    count_table = count_table.unstack()
    count_table_long = pd.melt(count_table.reset_index(), id_vars=x)
    z_boolean_values = count_table_long[z_boolean].unique()
    ratio = pd.DataFrame({'ratio': count_table_long.set_index([x, y, z_boolean]).unstack()['value'][z_boolean_values[1]] /
                                   count_table_long.set_index([x, y, z_boolean]).unstack()['value'].sum(axis=1)})
    count_table_long = count_table_long.set_index([x, y ])[['value']].merge(ratio, left_index=True, right_index=True).reset_index()
    size_factor = maximal_bubble_size/count_table_long['value'].max()
    x_values_dict = {x:i for i, x in enumerate(ordered_x_values)} \
        if not x_is_numeric else {xx:get_point(xx) for xx in ordered_x_values}
    y_values_dict = {x:i for i, x in enumerate(ordered_y_values)} \
        if not y_is_numeric else {xx: get_point(xx) for xx in ordered_y_values}
    xticks = np.arange(len(ordered_x_values)) if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticks = np.arange(len(ordered_y_values)) if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    xticklabels = ordered_x_values if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticklabels = ordered_y_values if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    count_table_long[x] = count_table_long[x].map(x_values_dict)
    count_table_long[y] = count_table_long[y].map(y_values_dict)
    plt.scatter(count_table_long[x], count_table_long[y], s=size_factor*count_table_long['value'],
                c=count_table_long['ratio'],  alpha=0.5,
                cmap='cool')
    return count_table_long, xticks, yticks, xticklabels, yticklabels
Project: fake_news    Author: bmassman
def test_probabilities(model: ClassifierMixin, X: np.array, y: pd.Series,
                       bins: int = 10, threshold: float = 0.5):
    """Print confusion matrix based on class probability."""
    probs = [p[1] for p in model.predict_proba(X)]
    print('\tProbabilities')
    df = pd.DataFrame({'prob': probs, 'label': y})
    step = 1 / bins
    cut_labels = [round(step * f, 1) for f in range(bins)]
    by_prob = (df.groupby(pd.cut(df['prob'], bins, labels=cut_labels))
                 .agg(['sum', 'count'])['label'])
    print('\t\tprobs\t1\t0\tacc')
    for index, row in by_prob.iloc[::-1].iterrows():
        ones = row['sum']
        if math.isnan(ones):
            ones = 0
        else:
            ones = int(ones)
        count = row['count']
        zeros = int(count) - ones
        if count > 0:
            acc = zeros / count if index < threshold else ones / count
        else:
            acc = 0.0
        print(f'\t\t{index}\t{ones}\t{zeros}\t{acc:.3f}')
Project: dsbox-cleaning    Author: usc-isi-i2
def _discretize_by_width(col, num_bins, labels):
    maxvalue = col.max()
    minvalue = col.min()
    width = float((maxvalue-minvalue))/num_bins
    bins = [minvalue + x*width for x in range(num_bins)]+[maxvalue]
    if labels:
        if len(labels)!=num_bins:
            raise ValueError('Length of assigned labels not consistent with num_bins!')
        else:
            group_names = labels
    else:
        group_names = range(num_bins)
    return pd.cut(col, bins,labels=group_names, include_lowest=True)
Project: dsbox-cleaning    Author: usc-isi-i2
def _discretize_by_frequency(col, num_bins, labels):
    percent = 1.0/num_bins
    bins = sorted(list(set(col.quantile([x*percent for x in range(num_bins+1)]))))
    if len(bins)-1 < num_bins:
        num_bins = len(bins)-1
        print('...Only %d bins (unbalanced) generated due to overlapping percentile boundaries.'%num_bins)
    if labels:
        if len(labels)!=num_bins:
            raise ValueError('Length of assigned labels not consistent with num_bins!')
        else:
            group_names = labels
    else:
        group_names = range(num_bins)
    return pd.cut(col, bins,labels=group_names, include_lowest=True)
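For intuition, the two helpers diverge on skewed columns; an illustrative sketch (the column values are invented):

col = pd.Series([1, 1, 1, 2, 2, 3, 50, 100])
print(_discretize_by_width(col, 4, None).value_counts())      # equal-width spans: most rows land in the first bin
print(_discretize_by_frequency(col, 4, None).value_counts())  # roughly equal counts per bin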
Project: plotnine    Author: has2k1
def compute_group(cls, data, scales, **params):
        bins = params['bins']
        breaks = params['breaks']
        binwidth = params['binwidth']
        boundary = params['boundary']

        func = make_summary_fun(params['fun_data'], params['fun_y'],
                                params['fun_ymin'], params['fun_ymax'],
                                params['fun_args'])

        breaks = fuzzybreaks(scales.x, breaks, boundary, binwidth, bins)
        data['bin'] = pd.cut(data['x'], bins=breaks, labels=False,
                             include_lowest=True)

        def func_wrapper(data):
            """
            Add `bin` column to each summary result.
            """
            result = func(data)
            result['bin'] = data['bin'].iloc[0]
            return result

        # This is a plyr::ddply
        out = groupby_apply(data, 'bin', func_wrapper)
        centers = (breaks[:-1] + breaks[1:]) * 0.5
        bin_centers = centers[out['bin'].values]
        out['x'] = bin_centers
        out['bin'] += 1
        if isinstance(scales.x, scale_discrete):
            out['width'] = 0.9
        else:
            out['width'] = np.diff(breaks)[bins-1]

        return out
Project: plotnine    Author: has2k1
def test_facet_grid_expression():
    p = g + facet_grid(
        ['var2', 'pd.cut(var1, (0, 2, 4), include_lowest=True)'])
    assert p == 'facet_grid_expression'
Project: SFBIStats    Author: royludo
def plot_tendency(word, pos_dic, bin_size, output_dir, file_name):
    plt.figure()
    if word not in pos_dic:
        raise Exception('Word ' + word + ' not found')

    df = pd.DataFrame(pos_dic[word], columns=['pos'])  # .groupby(['pos'])['pos'].count()
    df['bins'] = pd.cut(df['pos'], bins=range(0, 100 + bin_size, bin_size), labels=range(0, 100, bin_size))
    df = df.groupby(['bins'])['bins'].count()
    ax = df.plot(title="Position du mot '" + word + "' dans les descriptions des offres")
    ax.set_xlabel("Position (en % de la longueur de la description)")
    ax.set_ylabel("Nombre d'occurrences")
    plt.savefig(os.path.join(output_dir, file_name), bbox_inches='tight')
Project: motif-classify    Author: macks22
def symbolize(self, xs):
        """
        Symbolize a PAA (piecewise aggregate approximation) series
        """
        alphabet_sz = len(self.alphabet)
        cutpoints = self.cutpoints[alphabet_sz]
        return pd.cut(xs, bins = cutpoints, labels = self.alphabet)
Project: polara    Author: Evfro
def is_not_uniform(idx, nbins=10, allowed_gap=0.75):
        idx_bins = pd.cut(idx, bins=nbins, labels=False)
        idx_bin_size = np.bincount(idx_bins)

        diff = idx_bin_size[:-1] - idx_bin_size[1:]
        monotonic = (diff < 0).all() or (diff > 0).all()
        huge_gap = (idx_bin_size.min()*1.0 / idx_bin_size.max()) < allowed_gap
        return monotonic or huge_gap
Project: berrl    Author: murphy214
def make_object_map(data,field,**kwargs):
    linear = False
    for key,value in kwargs.iteritems():
        if key == 'linear':
            linear = value
    print linear
    if linear == False:
        colors,rangelist = make_distributed_range(data,field)
    else:
        colors = get_heatmap51()
        colors2 = colors 
        maxvalue = data[field].max()
        if maxvalue < 51:
            totallist = range(maxvalue)
            colors = reduce_color_list_size(totallist,colors)
            colors,rangelist = make_gradient_range(data[field].min(),maxvalue,colors)
        else:
            colors = reduce_color_list_size(range(len(data)),colors)
            colors,rangelist = make_gradient_range(data[field].min(),maxvalue,colors)
            if not rangelist[0] == 0:
                rangelist = [0] + rangelist[1:]
            data['COLORKEY'] = pd.cut(data[field],bins=rangelist+[1000000000],labels=colors)

            return data
    colors2 = get_heatmap51()
    if not rangelist[0] == 0:
        rangelist = [0] + rangelist[1:]
    data['COLORKEY'] = pd.cut(data[field],bins=rangelist,labels=colors[1:])

    return data

# for a given dataframe and field, returns an unused grouped object on which to perform multiple operations
Project: ReducedVarianceReparamGradients    Author: andymiller
def process_dataset():
    data_dir = os.path.dirname(__file__)
    df = pd.read_csv(os.path.join(data_dir, 'data/frisk/frisk_with_noise.dat'), skiprows=6, delim_whitespace=True)

    # compute proportion black in precinct, black = 1
    # first aggregate by precinct/ethnicity, and sum over populations
    popdf = df[['pop', 'precinct', 'eth']]. \
                groupby(['precinct', 'eth'])['pop'].apply(sum)
    percent_black = np.array([ popdf[i][1] / float(popdf[i].sum())
                               for i in xrange(1, 76)] )
    precinct_type = pd.cut(percent_black, [0, .1, .4, 1.])
    df['precinct_type'] = precinct_type.codes[df.precinct.values-1]
    return df
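Because pd.cut is applied to a NumPy array here, it returns a Categorical, and .codes gives the integer bin index of each element; a minimal illustration:

import numpy as np
import pandas as pd
print(pd.cut(np.array([0.05, 0.2, 0.9]), [0, .1, .4, 1.]).codes)  # [0 1 2]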
Project: PyPSA    Author: PyPSA
def busmap_by_rectangular_grid(buses, divisions=10):
    busmap = pd.Series(0, index=buses.index)
    if isinstance(divisions, tuple):
        divisions_x, divisions_y = divisions
    else:
        divisions_x = divisions_y = divisions
    gb = buses.groupby([pd.cut(buses.x, divisions_x), pd.cut(buses.y, divisions_y)])
    for nk, oks in enumerate(itervalues(gb.groups)):
        busmap.loc[oks] = nk
    return busmap
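Grouping on two pd.cut results partitions the plane into a rectangular grid of cells; a small sketch of the idea (the coordinates are invented):

import numpy as np
import pandas as pd
buses = pd.DataFrame({'x': np.random.uniform(0, 10, 100),
                      'y': np.random.uniform(0, 10, 100)})
print(buses.groupby([pd.cut(buses.x, 4), pd.cut(buses.y, 4)]).size())  # bus count per grid cell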
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_groupby_categorical_unequal_len(self):
        # GH3011
        series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
        # the ValueError is only raised with a Categorical,
        # not with a Series of dtype category
        bins = pd.cut(series.dropna().values, 4)

        # len(bins) != len(series) here
        self.assertRaises(ValueError, lambda: series.groupby(bins).mean())
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def setUp(self):
        self.factor = Categorical.from_array(['a', 'b', 'b', 'a', 'a', 'c',
                                              'c', 'c'])

        df = DataFrame({'value': np.random.randint(0, 10000, 100)})
        labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]

        df = df.sort_values(by=['value'], ascending=True)
        df['value_group'] = pd.cut(df.value, range(0, 10500, 500), right=False,
                                   labels=labels)
        self.cat = df
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_series_functions_no_warnings(self):
        df = pd.DataFrame({'value': np.random.randint(0, 100, 20)})
        labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]
        with tm.assert_produces_warning(False):
            df['group'] = pd.cut(df.value, range(0, 105, 10), right=False,
                                 labels=labels)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_assignment_to_dataframe(self):
        # assignment
        df = DataFrame({'value': np.array(
            np.random.randint(0, 10000, 100), dtype='int32')})
        labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]

        df = df.sort_values(by=['value'], ascending=True)
        s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels)
        d = s.values
        df['D'] = d
        str(df)

        result = df.dtypes
        expected = Series(
            [np.dtype('int32'), com.CategoricalDtype()], index=['value', 'D'])
        tm.assert_series_equal(result, expected)

        df['E'] = s
        str(df)

        result = df.dtypes
        expected = Series([np.dtype('int32'), com.CategoricalDtype(),
                           com.CategoricalDtype()],
                          index=['value', 'D', 'E'])
        tm.assert_series_equal(result, expected)

        result1 = df['D']
        result2 = df['E']
        self.assertTrue(result1._data._block.values.equals(d))

        # sorting
        s.name = 'E'
        self.assertTrue(result2.sort_index().equals(s.sort_index()))

        cat = pd.Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10])
        df = pd.DataFrame(pd.Series(cat))
Project: bubble_plot    Author: shirmeir
def bubble_plot(df, x, y, z_boolean=None, ordered_x_values=None, ordered_y_values=None, bins_x=10,
                bins_y=10, fontsize=16, figsize=(10,5), maximal_bubble_size=4000,
                normalization_by_all = False, log=False):
    """
    :param df: dataframe
    :param x:  name of first numerical/categorical field (string) (for x-axis)
    :param y: name of second numerical/categorical field (string) (for y-axis)
    :param z_boolean: name of categorical field with two categories / boolean field (for coloring)
    :param ordered_x_values: the values we would like to map from x categorical variable 
    according to the order we would like to present them
    :param ordered_y_values: the values we would like to map from the y categorical variable 
    according to the order we would like to present them
    :param bins_x: the bins for x values if x is numeric
    :param bins_y: the bins for y values if y is numeric
    :param normalization_by_all: True - shows joint distribution p(x,y), False - shows conditional distribution p(y|x)
    :param maximal_bubble_size: if the bubbles are too big or too small this is the parameter you should change!
    :param log: whether to apply log on the count (influence the size of the bubbles)
    :return: a bubble plot in which bubble size is proportional to the frequency of the bucket :)
    """
    plt.figure(figsize=figsize)
    x_is_numeric = df[x].dtype in (float, int) and ordered_x_values is None
    y_is_numeric = df[y].dtype in (float, int) and ordered_y_values is None 
    count_table = pd.concat([pd.cut(df[x], bins=bins_x) if x_is_numeric else df[x],
                             pd.cut(df[y], bins=bins_y) if y_is_numeric else df[y]], axis=1)
    count_table = count_table.groupby(x)[y].value_counts().unstack().fillna(0)
    ordered_x_values = count_table.index.values if ordered_x_values is None else ordered_x_values
    ordered_y_values = count_table.columns if ordered_y_values is None else ordered_y_values
    if z_boolean is not None:
        count_table_long, xticks, yticks, xticklabels, yticklabels = plot_with_z(df, x, y, z_boolean, bins_x, bins_y, x_is_numeric, y_is_numeric, ordered_x_values, ordered_y_values, maximal_bubble_size, 
                                                                                 normalization_by_all=normalization_by_all)
    else:
        count_table_long, xticks, yticks, xticklabels, yticklabels = plot_without_z(df, x, y, z_boolean, count_table, bins_x, bins_y, x_is_numeric, y_is_numeric, ordered_x_values, ordered_y_values, 
                                                                                    normalization_by_all=normalization_by_all, log=log, maximal_bubble_size=maximal_bubble_size )
    plt.xticks(xticks, xticklabels,fontsize=fontsize)
    plt.yticks(yticks, yticklabels,fontsize=fontsize)
    plt.xlabel(x, fontsize=fontsize)
    plt.ylabel(y, fontsize=fontsize)
    if z_boolean is None:
        plt.title("{} vs {} ".format(y,x),fontsize=fontsize+4);
    else:
        plt.title("{} vs {} and {} (in colors)".format(y,x, z_boolean),fontsize=fontsize+4);
Project: Mmodel    Author: gxrtbf
def transform_with_woe(model_data):

    cut_point = model_config.logistic_cut
    for key in cut_point.keys():
        cutss = cut_point[key]['cut_point']
        wwoe = cut_point[key]['woe']
        model_data[key] = pd.cut(model_data[key],bins=cutss,labels=range(len(cutss) - 1)).map(lambda x:wwoe[x])

    return model_data
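The pattern here bins a score variable and replaces each bin with its precomputed weight-of-evidence value; schematically (the cut points and WoE values below are invented):

s = pd.Series([250, 400, 580, 700])
cut_points = [0, 300, 600, 850]
woe = [-0.8, 0.1, 0.9]
print(pd.cut(s, bins=cut_points, labels=range(len(cut_points) - 1)).map(lambda i: woe[i]))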
Project: snape    Author: mbernico
def create_categorical_features(df, label_list, random_state=None):
    """
    Creates random categorical variables

    :param df: data frame we're operation on
    :param label_list: A list of lists, each list is the labels for one categorical variable
    :param random_state: the numpy RandomState
    :return: A modified dataframe

    Example:

    create_categorical_features(df, [['a','b'], ['red','blue']])

    """
    random_state = get_random_state(random_state)

    df = df.copy()
    n_categorical = len(label_list)

    # get numeric columns ONCE so we don't have to do it every time we loop:
    numer_cols = [col for col in df.select_dtypes(include=['number']).columns if col != 'y']

    for i in range(0, n_categorical):
        # we might be out of numerical columns!
        if not numer_cols:
            break

        # chose a random numeric column that isn't y
        chosen_col = random_state.choice(numer_cols)
        # pop the chosen_col out of the numer_cols
        numer_cols.pop(numer_cols.index(chosen_col))

        # use cut to convert that column to categorical
        df[chosen_col] = pd.cut(df[chosen_col], bins=len(label_list[i]), labels=label_list[i])

    return df
Project: pygcam    Author: JGCRI
def binColumns(inputDF, bins=DEFAULT_BIN_COUNT):
    columns = inputDF.columns
    binned = pd.DataFrame(columns=columns)
    for col in columns:
        s = inputDF[col]
        binned[col] = pd.cut(s, bins, labels=False)

    return binned

# TBD: Finish refactoring this
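With labels=False, pd.cut returns the integer index of the bin each value falls into rather than Interval categories, so every column of the binned frame stays numeric:

print(pd.cut(pd.Series([1.0, 5.0, 9.0]), bins=3, labels=False))  # 0, 1, 2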
Project: fx    Author: TaRyu
def one2two(file_in=PATH_FILE_OUT, file_out=PATH_FILE_FINAL):
    data = pd.read_pickle(file_in)['close']
    data = data.reshape(-1, 24)
    data = np.array([data[i:i + 24] for i in range(data.shape[0] - 24 + 1)])
    data_s = {
        'open_price': np.array([data[i][0][0]
                                for i in range(data.shape[0] - 1)]),
        'close_price': np.array([data[i][int(NUM_PIX / 24) - 1][23]
                                 for i in range(data.shape[0] - 1)]),
        'max_price': np.array([data[i].max()
                               for i in range(data.shape[0] - 1)]),
        'min_price': np.array([data[i].min()
                               for i in range(data.shape[0] - 1)]),
        'mean_price': np.array([data[i].mean()
                                for i in range(data.shape[0] - 1)]),
        'median_price': np.array([np.median(data[i])
                                  for i in range(data.shape[0] - 1)]),
        'buy_or_sell': np.array(
            [int(data[i + 1][int(NUM_PIX / 24) - 1][23] > data[i + 1][0][0])
             for i in range(data.shape[0] - 1)]),
        'change': np.array(
            [(data[i + 1][int(NUM_PIX / 24) - 1][23] - data[i + 1][0][0]) /
             data[i + 1][int(NUM_PIX / 24) - 1][23] * 100
             for i in range(data.shape[0] - 1)])}
    data_s = pd.DataFrame(data_s)
    bins = [-100, -5, -4, -3, -2, -1.5, -1, -0.5,
            0, 0.5, 1, 1.5, 2, 3, 4, 5, 100]
    labels = [-8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7, 8]
    data_s['change_D_16'] = pd.cut(data_s['change'], bins, labels=labels)
    bins = [-100, -5, -2, 0, 2, 5, 100]
    labels = [-3, -2, -1, 1, 2, 3]
    data_s['change_D'] = pd.cut(data_s['change'], bins, labels=labels)
    data = data.reshape(len(data), NUM_PIX)
    np.save(file_out[0], data[:len(data) - 1])
    data_s.to_pickle(file_out[1])
Project: fx    Author: TaRyu
def one2two(file_in=PATH_FILE_OUT, file_out=PATH_FILE_FINAL):
    data = pd.read_pickle(file_in)['close']
    data = np.array([data[i:i + 576] for i in range(data.shape[0] - 576 + 1)])
    data = data.reshape(-1, 576)
    data_s = {
        'open_price': np.array([data[i][0]
                                for i in range(data.shape[0] - 576)]),
        'close_price': np.array([data[i][575]
                                 for i in range(data.shape[0] - 576)]),
        'max_price': np.array([data[i].max()
                               for i in range(data.shape[0] - 576)]),
        'min_price': np.array([data[i].min()
                               for i in range(data.shape[0] - 576)]),
        'mean_price': np.array([data[i].mean()
                                for i in range(data.shape[0] - 576)]),
        'median_price': np.array([np.median(data[i])
                                  for i in range(data.shape[0] - 576)]),
        'buy_or_sell': np.array(
            [int(data[i + 576][575] > data[i + 576][0])
             for i in range(data.shape[0] - 576)]),
        'change': np.array(
            [(data[i + 576][575] - data[i + 576][0]) /
             data[i + 576][575] * 100
             for i in range(data.shape[0] - 576)])}
    data_s = pd.DataFrame(data_s)
    bins = [-100, -5, -4, -3, -2, -1.5, -1, -0.5,
            0, 0.5, 1, 1.5, 2, 3, 4, 5, 100]
    bins = [0.01 * x for x in bins]
    labels = [-8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7, 8]
    data_s['change_D_16'] = pd.cut(data_s['change'], bins, labels=labels)
    bins = [-100, -5, -2, 0, 2, 5, 100]
    bins = [0.01 * x for x in bins]
    labels = [-3, -2, -1, 1, 2, 3]
    data_s['change_D'] = pd.cut(data_s['change'], bins, labels=labels)
    np.save(file_out[0], data[:len(data) - 576])
    data_s.to_pickle(file_out[1])
Project: anonymisation    Author: SGMAP-AGD
def period_by_hours(x, separation):
    ''' aggrege le x par intervale d'heure.
        Le calcul pourrait être simple si on interdisait
        le chevauchement de jour.
    '''
    print(separation)
    assert isinstance(separation, list)
    assert all([sep < 24 for sep in separation])
    separation.sort()

    if 0 in separation:
        separation.append(24)
        hour_categ = pd.cut(x.dt.hour, separation, right=False)
        date_categ = x.dt.date
        return date_categ.astype(str) + ' ' + hour_categ.astype(str)
    else:
        hour = x.dt.hour
        hour_categ = pd.cut(hour, separation, right=False).astype(str)
        night_categ = '[' + str(separation[-1]) + ', ' + str(separation[0]) + ')'
        hour_categ[(hour < separation[0]) | (hour >= separation[-1])] = night_categ
        assert hour_categ.nunique(dropna=False) == len(separation)
        date_categ = x.dt.date.astype(str)
    # shift the earliest hours back by one day
        decale = x.dt.date[x.dt.hour < separation[1]] + pd.DateOffset(days=-1)
        date_categ[x.dt.hour < separation[1]] = decale.astype(str)
        assert all(date_categ.str.len() == 10)
        return date_categ + ' ' + hour_categ


### 4 - special
Project: guacml    Author: guacml
def predictions_vs_actual_classification(model_results, model_name, n_bins, figsize=(7, 3)):
    holdout = model_results.holdout_data
    target = model_results.target
    bins = np.arange(0, 1.001, 1 / n_bins)
    bin_mids = (bins[:-1] + bins[1:]) / 2
    binned = pd.cut(holdout['prediction'], bins=bins)
    bin_counts = holdout.groupby(binned)[target].count()
    bin_means = holdout.groupby(binned)[target].mean()

    fig = plt.figure(figsize=figsize)
    plt.suptitle('{0}: Predictions vs Actual'.format(model_name), fontsize=14)
    ax1 = plt.gca()
    ax1.grid(False)
    ax1.bar(bin_mids, bin_counts, width=1/n_bins, color=sns.light_palette('green')[1],
            label='row count', edgecolor='black')
    ax1.set_xlabel('predicted probability')
    ax1.set_ylabel('row count')

    ax2 = ax1.twinx()
    ax2.plot(bin_mids, bin_means, linewidth=3,
             marker='.', markersize=16, label='actual rate')
    ax2.plot(bins, bins, color=sns.color_palette()[2], label='main diagonal')

    ax2.set_ylabel('actual rate')

    handles, labels = ax1.get_legend_handles_labels()
    handles2, labels2 = ax2.get_legend_handles_labels()
    legend = plt.legend(handles + handles2, labels + labels2,
                        loc='best',
                        frameon=True,
                        framealpha=0.7)
    frame = legend.get_frame()
    frame.set_facecolor('white')
    return fig
Project: tflearn    Author: tflearn
def prepare_input_data(self, input_data, name="", category_map=None):
        '''
        Prepare input data dicts
        '''
        print ("-"*40 + " Preparing %s" % name)
        X = input_data[self.continuous_columns].values.astype(np.float32)
        Y = input_data[self.label_column].values.astype(np.float32)
        Y = Y.reshape([-1, 1])
        if self.verbose:
            print ("  Y shape=%s, X shape=%s" % (Y.shape, X.shape))

        X_dict = {"wide_X": X}

        if 'deep' in self.model_type:
            # map categorical value strings to integers
            td = input_data
            if category_map is None:
                category_map = {}
                for cc in self.categorical_columns:
                    if not cc in td.columns:
                        continue
                    cc_values = sorted(td[cc].unique())
                    cc_max = 1+len(cc_values)
                    cc_map = dict(zip(cc_values, range(1, cc_max))) # start from 1 to avoid 0:0 mapping (save 0 for missing)
                    if self.verbose:
                        print ("  category %s max=%s,  map=%s" % (cc, cc_max, cc_map))
                    category_map[cc] = cc_map

            td = td.replace(category_map)

            # bin ages (cuts off extreme values)
            age_bins = [ 0, 12, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 80, 65535 ]
            td['age_binned'] = pd.cut(td['age'], age_bins, labels=False)
            td = td.replace({'age_binned': {np.nan: 0}})
            print ("  %d age bins: age bins = %s" % (len(age_bins), age_bins))

            X_dict.update({ ("%s_in" % cc): td[cc].values.astype(np.int32).reshape([-1, 1]) for cc in self.categorical_columns})

        Y_dict = {"Y": Y}
        if self.verbose:
            print ("-"*40)
        return X_dict, Y_dict, category_map
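Note that pd.cut returns NaN for values outside the outermost edges, which the replace call above then maps into the reserved 0 bucket; for instance:

ages = pd.Series([-3, 20, 200000])
print(pd.cut(ages, [0, 12, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 80, 65535], labels=False))
# -3 and 200000 fall outside the edges and come back as NaN; 20 lands in bin 2.0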
Project: CustomerSim    Author: sisl
def discretize(data, vars_to_discretize, n_bins):

    '''
    Accepts data, a dictionary containing the discretization type for selected variables, and
    a dictionary containing the number of bins for selected variables.

    Returns data after selected variables have been discretized, 
    together with binning definition for each variable.
    '''

    data_subset = ps.DataFrame(data).copy()
    bins = {}
    for i in vars_to_discretize:

        out = None
        binning = None

        # discretize by splitting into equal intervals
        if vars_to_discretize[i] == 'Equal': 
            out, binning = ps.cut(data_subset.ix[:,i],bins=n_bins[i],labels=False,retbins=True)

        # discretize by frequency
        elif vars_to_discretize[i] == 'Freq':
            nb = n_bins[i]
            while True:
                try:
                    out, binning = ps.qcut(data_subset.ix[:,i],q=nb,labels=False,retbins=True)
                    break
                except:
                    nb -= 1

        # discretize based on provided bin margins
        elif vars_to_discretize[i] == 'Bins':
            out = np.digitize(data_subset.ix[:,i], n_bins[i], right=True) - 1
            binning = n_bins[i]

        data_subset.ix[:,i] = out

        # replace NA values with a special index (1 + max) -
        # if this has not already been done automatically, as in np.digitize
        data_subset.ix[:,i][data_subset.ix[:,i].isnull()] = data_subset.ix[:,i].max() + 1
        bins[i] = binning

    return data_subset, bins
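A hedged usage sketch (the snippet imports pandas as ps and relies on the long-deprecated .ix indexer, so it targets older pandas; the data below is invented):

data = ps.DataFrame({'age': [22, 35, 58, 41, 29],
                     'score': [0.1, 0.4, 0.35, 0.9, 0.2]})
binned, bin_defs = discretize(data, {'age': 'Equal', 'score': 'Freq'},
                              {'age': 3, 'score': 2})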
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_groupby_categorical_two_columns(self):

        # https://github.com/pydata/pandas/issues/8138
        d = {'cat':
             pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"],
                            ordered=True),
             'ints': [1, 1, 2, 2],
             'val': [10, 20, 30, 40]}
        test = pd.DataFrame(d)

        # Grouping on a single column
        groups_single_key = test.groupby("cat")
        res = groups_single_key.agg('mean')
        exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]},
                        index=pd.CategoricalIndex(["a", "b", "c"], name="cat"))
        tm.assert_frame_equal(res, exp)

        # Grouping on two columns
        groups_double_key = test.groupby(["cat", "ints"])
        res = groups_double_key.agg('mean')
        exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan],
                         "cat": ["a", "a", "b", "b", "c", "c"],
                         "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints"
                                                                 ])
        tm.assert_frame_equal(res, exp)

        # GH 10132
        for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
            c, i = key
            result = groups_double_key.get_group(key)
            expected = test[(test.cat == c) & (test.ints == i)]
            assert_frame_equal(result, expected)

        d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
        test = pd.DataFrame(d)
        values = pd.cut(test['C1'], [1, 2, 3, 6])
        values.name = "cat"
        groups_double_key = test.groupby([values, 'C2'])

        res = groups_double_key.agg('mean')
        nan = np.nan
        idx = MultiIndex.from_product([["(1, 2]", "(2, 3]", "(3, 6]"],
                                       [1, 2, 3, 4]],
                                      names=["cat", "C2"])
        exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3,
                                nan, nan, nan, nan, 4, 5],
                         "C3": [nan, nan, nan, nan, 10, 100,
                                nan, nan, nan, nan, 200, 34]}, index=idx)
        tm.assert_frame_equal(res, exp)
Project: TensorFlowFlask    Author: PythonWorkshop
def train(self, training_df, learning_rate=0.001, batch_size=126, model_name="softmax_model"):
        column_list = training_df.columns.tolist()
        threshold = 5

        red_wine_cleaned = training_df.copy()
        red_wine_cleaned = _outliers(red_wine_cleaned, threshold, column_list[0:-1])

        # Bin the data
        bins = [3, 5, 6, 8]
        red_wine_cleaned['category'] = pd.cut(red_wine_cleaned.quality, bins, labels=['Bad', 'Average', 'Good'],
                                              include_lowest=True)

        # Only include 'Bad' and 'Good' categories
        red_wine_newcats = red_wine_cleaned[red_wine_cleaned['category'].isin(['Bad', 'Good'])].copy()

        bins = [3, 5, 8]
        red_wine_newcats['category'] = pd.cut(red_wine_newcats.quality,
                                              bins, labels=['Bad', 'Good'], include_lowest=True)

        y_red_wine = red_wine_newcats[['category']].get_values()

        # Removing fixed_acidity and quality
        X_red_wine = red_wine_newcats.iloc[:, 1:-2].get_values()

        y_red_wine_raveled = y_red_wine.ravel()
        y_red_wine_integers = [y.replace('Bad', '1') for y in y_red_wine_raveled]
        y_red_wine_integers = [y.replace('Good', '0') for y in y_red_wine_integers]
        y_red_wine_integers = [np.int(y) for y in y_red_wine_integers]

        y_one_hot = _dense_to_one_hot(y_red_wine_integers, num_classes=2)

        X_train, X_test, y_train, y_test = train_test_split(X_red_wine, y_one_hot, test_size=0.2, random_state=42)
        # model

        with tf.variable_scope("softmax_regression"):
            X = tf.placeholder("float", [None, 10])
            y, variables = softmax_regression(X)

        # train
        y_ = tf.placeholder("float", [None, 2])
        cost = -tf.reduce_mean(y_ * tf.log(y))
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

        init = tf.initialize_all_variables()
        self.sess.run(init)
        for i in range(100):
            average_cost = 0
            number_of_batches = int(len(X_train) / batch_size)
            for start, end in zip(range(0, len(X_train), batch_size), range(batch_size, len(X_train), batch_size)):
                self.sess.run(optimizer, feed_dict={X: X_train[start:end], y_: y_train[start:end]})
                # Compute average loss
                average_cost += self.sess.run(cost, feed_dict={X: X_train[start:end],
                                              y_: y_train[start:end]}) / number_of_batches
            print(self.sess.run(accuracy, feed_dict={X: X_test, y_: y_test}))

        filename = "data/softmax_regression.ckpt"
        path = self.save_locally(filename)
        self.save_to_s3(path, model_name)
        print("Saved:", path)
Project: poreduck    Author: alexiswl
def plot_yield_by_quality():
    # Close any previous plots
    plt.close('all')
    # Read in seqlength and time from ALL_READS dataframe
    new_yield_data = ALL_READS[['time', "seq_length", "av_qual"]]
    # Bin qualities
    qual_bins = [0] + QUALITY_BINS + [new_yield_data["av_qual"].max()]
    # Cut yield data into quality bins
    new_yield_data["descriptive_quality"] = pd.cut(new_yield_data["av_qual"], qual_bins,
                                                   labels=[description
                                                           for description in reversed(QUALITY_DESCRIPTIONS)])
    # Time as index and drop av_qual column
    new_yield_data.set_index(pd.DatetimeIndex(new_yield_data['time']), inplace=True)
    new_yield_data.drop('av_qual', axis=1, inplace=True)
    # Obtain cumulative sum by quality bin in each minute.
    yield_data_grouped = new_yield_data.groupby("descriptive_quality").apply(lambda d: d.resample("1T").sum().fillna(0))['seq_length']
    # Create a dict of dataframes based on groups.
    yield_data_by_quality = {description: yield_data_grouped[description].to_frame().reset_index()
                             for description in
                             QUALITY_DESCRIPTIONS}

    for description, yield_df in yield_data_by_quality.items():
        yield_df.reset_index(inplace=True)
        yield_df.set_index("time", inplace=True)
        yield_df = yield_df.reindex(index=YIELD_DATA.time, fill_value=0)
        yield_df.reset_index(inplace=True)
        # Generate a cumulative sum of sequence data
        yield_df['cumsum_bp'] = yield_df['seq_length'].cumsum()
        # Convert time to timedelta format and then to float format, in hours.
        yield_df['duration_tdelta'] = yield_df['time'].apply(lambda t: t - yield_df['time'].min())
        yield_df['duration_float'] = yield_df['duration_tdelta'].apply(lambda t: t.total_seconds() / 3600)
        yield_data_by_quality[description] = yield_df

    # Set subplots.
    fig, ax = plt.subplots(1)
    # Create ticks using numpy linspace. Ideally will create 6 points between 0 and 48 hours.
    num_points = 7  # Need to include zero point
    x_ticks = np.linspace(YIELD_DATA['duration_float'].min(), YIELD_DATA['duration_float'].max(), num_points)
    ax.set_xticks(x_ticks)
    # Define axis formatters
    ax.yaxis.set_major_formatter(FuncFormatter(y_yield_to_human_readable))
    ax.xaxis.set_major_formatter(FuncFormatter(x_yield_to_human_readable))
    # Set x and y labels and title.
    ax.set_xlabel("Duration (HH:MM)")
    ax.set_ylabel("Yield")
    ax.set_title(f"Yield for {SAMPLE_NAME} over time by quality")
    ax.stackplot(YIELD_DATA['duration_float'],
                 [yield_data_by_quality[description]['cumsum_bp']
                  for description in QUALITY_DESCRIPTIONS],
                 colors=QUALITY_COLOURS)
    # Limits must be set after the plot is created
    ax.set_xlim(YIELD_DATA['duration_float'].min(), YIELD_DATA['duration_float'].max())
    ax.set_ylim(ymin=0)

    # Add legend to plot.
    ax.legend([mpatches.Patch(color=colour)
               for colour in QUALITY_COLOURS],
              QUALITY_DESCRIPTIONS, loc=2)
    # Ensure labels are not missed.
    fig.tight_layout()
    savefig(os.path.join(PLOTS_DIR, f"{SAMPLE_NAME.replace(' ', '_')}_yield_plot_by_quality.png"))
Project: python_utils    Author: Jayhello
def titanic_1():
    titanic = sns.load_dataset('titanic')
    print titanic.head()
    #    survived  pclass     sex   age  ......
    # 0         0       3    male  22.0
    # 1         1       1  female  38.0
    # 2         1       3  female  26.0
    # 3         1       1  female  35.0
    # 4         0       3    male  35.0

    print titanic.groupby('sex')[['survived']].mean()
    #         survived
    # sex
    # female  0.742038
    # male    0.188908

    print titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack()
    # class      First    Second     Third
    # sex
    # female  0.968085  0.921053  0.500000
    # male    0.368852  0.157407  0.135447

    print titanic.pivot_table('survived', index='sex', columns='class')
    # class      First    Second     Third
    # sex
    # female  0.968085  0.921053  0.500000
    # male    0.368852  0.157407  0.135447

    age = pd.cut(titanic['age'], [0, 18, 80])
    print titanic.pivot_table('survived', ['sex', age], 'class')
    # class               First    Second     Third
    # sex    age
    # female (0, 18]   0.909091  1.000000  0.511628
    #        (18, 80]  0.972973  0.900000  0.423729
    # male   (0, 18]   0.800000  0.600000  0.215686
    #        (18, 80]  0.375000  0.071429  0.133663

    print titanic.pivot_table(index='sex', columns='class',
                              aggfunc={'survived': sum, 'fare': 'mean'})

    print titanic.pivot_table('survived', index='sex', columns='class', margins=True)
    # class      First    Second     Third       All
    # sex
    # female  0.968085  0.921053  0.500000  0.742038
    # male    0.368852  0.157407  0.135447  0.188908
    # All     0.629630  0.472826  0.242363  0.383838
Project: tflearn_wide_and_deep    Author: ichuang
def prepare_input_data(self, input_data, name="", category_map=None):
        '''
        Prepare input data dicts
        '''
        print ("-"*40 + " Preparing %s" % name)
        X = input_data[self.continuous_columns].values.astype(np.float32)
        Y = input_data[self.label_column].values.astype(np.float32)
        Y = Y.reshape([-1, 1])
        if self.verbose:
            print ("  Y shape=%s, X shape=%s" % (Y.shape, X.shape))

        X_dict = {"wide_X": X}

        if 'deep' in self.model_type:
            # map categorical value strings to integers
            td = input_data
            if category_map is None:
                category_map = {}
                for cc in self.categorical_columns:
                    if not cc in td.columns:
                        continue
                    cc_values = sorted(td[cc].unique())
                    cc_max = 1+len(cc_values)
                    cc_map = dict(zip(cc_values, range(1, cc_max))) # start from 1 to avoid 0:0 mapping (save 0 for missing)
                    if self.verbose:
                        print ("  category %s max=%s,  map=%s" % (cc, cc_max, cc_map))
                    category_map[cc] = cc_map

            td = td.replace(category_map)

            # bin ages (cuts off extreme values)
            age_bins = [ 0, 12, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 80, 65535 ]
            td['age_binned'] = pd.cut(td['age'], age_bins, labels=False)
            td = td.replace({'age_binned': {np.nan: 0}})
            print ("  %d age bins: age bins = %s" % (len(age_bins), age_bins))

            X_dict.update({ ("%s_in" % cc): td[cc].values.astype(np.int32).reshape([-1, 1]) for cc in self.categorical_columns})

        Y_dict = {"Y": Y}
        if self.verbose:
            print ("-"*40)
        return X_dict, Y_dict, category_map