Python pandas module: DataFrame() example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use pandas.DataFrame(). Unless a snippet shows its own imports, assume the conventional import numpy as np and import pandas as pd.

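Before the project snippets, here is a minimal, self-contained sketch of the constructor itself (the column names and values are illustrative only):

import numpy as np
import pandas as pd

# Build a DataFrame from a dict of equal-length columns.
df = pd.DataFrame({
    'price': [100.0, 101.5, 99.8],
    'volume': np.array([1000, 1200, 900]),
})
print(df.dtypes)  # price: float64, volume: int64
print(df.shape)   # (3, 2)
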
Project: zipline-chinese | Author: zhanghan1990
def aggregate_ohlcv_panel(self,
                              fields,
                              ohlcv_panel,
                              items=None,
                              minor_axis=None):
        """
        Convert an OHLCV Panel into a DataFrame by aggregating each field's
        frame into a Series.
        """
        vals = ohlcv_panel
        if isinstance(ohlcv_panel, pd.Panel):
            vals = ohlcv_panel.values
            items = ohlcv_panel.items
            minor_axis = ohlcv_panel.minor_axis

        data = [
            self.frame_to_series(
                field,
                vals[items.get_loc(field)],
                minor_axis
            )
            for field in fields
        ]
        return np.array(data)
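
Note: pd.Panel, used above, was deprecated in pandas 0.20 and removed in 0.25. A rough sketch of the same field-major aggregation over a plain dict of DataFrames (frame_to_series is project-specific, so a column mean stands in for it here):

import numpy as np
import pandas as pd

# A dict of per-field DataFrames replaces the removed pd.Panel;
# the dict keys play the role of Panel.items.
ohlcv = {
    'open':  pd.DataFrame(np.random.rand(5, 3)),
    'close': pd.DataFrame(np.random.rand(5, 3)),
}
data = np.array([ohlcv[field].values.mean(axis=0) for field in ('open', 'close')])
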
Project: sensu_drive | Author: ilavender
def y_sum_by_time(x_arr, y_arr, top=None):
    df = pd.DataFrame({'Timestamp': pd.to_datetime(x_arr, unit='s'), 'Status': y_arr})
    df['Date'] = df['Timestamp'].apply(lambda x: "%d/%d/%d" % (x.day, x.month, x.year))
    df['Hour'] = df['Timestamp'].apply(lambda x: "%d" % (x.hour))
    df['Weekday'] = df['Timestamp'].apply(lambda x: "%s" % (x.day_name()))  # weekday_name was removed in pandas 1.0

    times = ['Hour', 'Weekday', 'Date']

    result = {}

    for groupby in times:

        df_group = df.groupby(groupby, as_index=False).agg({'Status': np.sum})

        if top is not None and top > 0:
            # keep the rows with the `top` largest Status sums
            idx = df_group.index.isin(df_group.nlargest(top, 'Status').index)
        else:
            idx = df_group['Status'] == df_group['Status'].max()

        result[groupby] = {k: g['Status'].replace(np.nan, 'None').tolist() for k,g in df_group[idx].groupby(groupby)}

    return result
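
A hypothetical invocation, assuming Unix-epoch timestamps and 0/1 status values:

import time

now = int(time.time())
x_arr = [now - i * 3600 for i in range(48)]  # one event per hour over two days
y_arr = [i % 2 for i in range(48)]           # alternating 0/1 status flags
summary = y_sum_by_time(x_arr, y_arr, top=3)
print(summary['Hour'])                       # the top-3 hours by summed status
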
Project: manubot | Author: greenelab
def get_citation_df(args, text):
    """
    Generate citation_df and save it to 'citations.tsv'.
    """
    citation_df = pandas.DataFrame(
        {'string': get_citation_strings(text)}
    )
    if args.citation_tags_path.is_file():
        tag_df = pandas.read_table(args.citation_tags_path)
        tag_df['string'] = '@tag:' + tag_df.tag
        for citation in tag_df.citation:
            is_valid_citation_string('@' + citation)
        citation_df = citation_df.merge(tag_df[['string', 'citation']], how='left')
    else:
        citation_df['citation'] = None
        logging.info(f'missing {args.citation_tags_path} file: no citation tags set')
    citation_df.citation.fillna(citation_df.string.astype(str).str.lstrip('@'), inplace=True)
    citation_df['standard_citation'] = citation_df.citation.map(standardize_citation)
    citation_df['citation_id'] = citation_df.standard_citation.map(get_citation_id)
    citation_df = citation_df.sort_values(['standard_citation', 'citation'])
    citation_df.to_csv(args.citations_path, sep='\t', index=False)
    check_collisions(citation_df)
    check_multiple_citation_strings(citation_df)
    return citation_df
Project: pylspm | Author: lseman
def do_work_pso(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
    output = pd.DataFrame(population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))

        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))

            resid = results[i].residuals()[3]
            f1.append(resid)
        except Exception:
            f1.append(10000)  # penalize splits whose PLS model fails to fit
    return (1 / np.sum(f1))
Project: pylspm | Author: lseman
def do_work_ga(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
    output = pd.DataFrame(population[item].genes)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))

        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))

            resid = results[i].residuals()[3]
            f1.append(resid)
        except Exception:
            f1.append(10000)
    return (1 / np.sum(f1))

# Main
Project: pylspm | Author: lseman
def rhoA(self):
        # rhoA
        rhoA = pd.DataFrame(0, index=np.arange(1), columns=self.latent)

        for i in range(self.lenlatent):
            weights = pd.DataFrame(self.outer_weights[self.latent[i]])
            weights = weights[(weights.T != 0).any()]
            result = pd.DataFrame.dot(weights.T, weights)
            result_ = pd.DataFrame.dot(weights, weights.T)

            S = self.data_[self.Variables['measurement'][
                self.Variables['latent'] == self.latent[i]]]
            S = pd.DataFrame.dot(S.T, S) / S.shape[0]
            numerador = (
                np.dot(np.dot(weights.T, (S - np.diag(np.diag(S)))), weights))
            denominador = (
                (np.dot(np.dot(weights.T, (result_ - np.diag(np.diag(result_)))), weights)))
            rhoA_ = ((result)**2) * (numerador / denominador)
            if(np.isnan(rhoA_.values)):
                rhoA[self.latent[i]] = 1
            else:
                rhoA[self.latent[i]] = rhoA_.values

        return rhoA.T
Project: pylspm | Author: lseman
def xloads(self):
        # Xloadings
        A = self.data_.transpose().values
        B = self.fscores.transpose().values
        A_mA = A - A.mean(1)[:, None]
        B_mB = B - B.mean(1)[:, None]

        ssA = (A_mA**2).sum(1)
        ssB = (B_mB**2).sum(1)

        xloads_ = (np.dot(A_mA, B_mB.T) /
                   np.sqrt(np.dot(ssA[:, None], ssB[None])))
        xloads = pd.DataFrame(
            xloads_, index=self.manifests, columns=self.latent)

        return xloads
Project: pylspm | Author: lseman
def alpha(self):
        # Cronbach Alpha
        alpha = pd.DataFrame(0, index=np.arange(1), columns=self.latent)

        for i in range(self.lenlatent):
            block = self.data_[self.Variables['measurement']
                               [self.Variables['latent'] == self.latent[i]]]
            p = len(block.columns)

            if(p != 1):
                p_ = len(block)
                correction = np.sqrt((p_ - 1) / p_)
                soma = np.var(np.sum(block, axis=1))
                cor_ = pd.DataFrame.corr(block)

                denominador = soma * correction**2
                numerador = 2 * np.sum(np.tril(cor_) - np.diag(np.diag(cor_)))

                alpha_ = (numerador / denominador) * (p / (p - 1))
                alpha[self.latent[i]] = alpha_
            else:
                alpha[self.latent[i]] = 1

        return alpha.T
Project: pylspm | Author: lseman
def do_work_pso(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
    output = pd.DataFrame(population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))

        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))

            resid = results[i].residuals()[3]
            f1.append(resid)
        except Exception:
            f1.append(10000)
    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
Project: pylspm | Author: lseman
def do_work_ga(self, item):
        output = pd.DataFrame(self.population[item].genes)
        output.columns = ['Split']
        dataSplit = pd.concat([self.data, output], axis=1)
        f1 = []
        results = []
        for i in range(self.nclusters):
            dataSplited = (dataSplit.loc[dataSplit['Split']
                                         == i]).drop('Split', axis=1)
            dataSplited.index = range(len(dataSplited))

            try:
                results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
                                      self.reg, 0, 50, HOC='true'))

                resid = results[i].residuals()[3]
                f1.append(resid)
            except Exception:
                f1.append(10000)
        print((1 / np.sum(f1)))
        return (1 / np.sum(f1))
Project: pylspm | Author: lseman
def do_work_pso(self, item):
        output = pd.DataFrame(self.population[item].position)
        output.columns = ['Split']
        dataSplit = pd.concat([self.data, output], axis=1)
        f1 = []
        results = []
        for i in range(self.nclusters):
            dataSplited = (dataSplit.loc[dataSplit['Split']
                                         == i]).drop('Split', axis=1)
            dataSplited.index = range(len(dataSplited))

            try:
                results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
                                      self.reg, 0, 50, HOC='true'))

                resid = results[i].residuals()[3]
                f1.append(resid)
            except Exception:
                f1.append(10000)
        print((1 / np.sum(f1)))
        return (1 / np.sum(f1))
Project: pylspm | Author: lseman
def do_work_tabu(self, item):
        output = pd.DataFrame(self.population[item])
        output.columns = ['Split']
        dataSplit = pd.concat([self.data, output], axis=1)
        f1 = []
        results = []
        for i in range(self.nclusters):
            dataSplited = (dataSplit.loc[dataSplit['Split']
                                         == i]).drop('Split', axis=1)
            dataSplited.index = range(len(dataSplited))

            try:
                results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
                                      self.reg, 0, 50, HOC='true'))

                resid = results[i].residuals()[3]
                f1.append(resid)
            except Exception:
                f1.append(10000)

        cost = (np.sum(f1))
        print(1 / cost)
        return [self.population[item], cost]
Project: rca-evaluation | Author: sieve-microservices
def apply(path):
    data = metadata.load(path)
    for service in data["services"]:
        filename = os.path.join(path, service["filename"])
        df = load_timeseries(filename, service)
        print(service)
        df2 = interpolate_missing(df[service["fields"]])
        classes = classify_series(df2)
        preprocessed_series = {}
        for k in classes["other_fields"]:
            # one value shorter, because the diff below drops the first value of the other series
            preprocessed_series[k] = df2[k][1:]
        for k in classes["monotonic_fields"]:
            preprocessed_series[k + "-diff"] = df2[k].diff()[1:]
        newname = service["name"] + "-preprocessed.tsv.gz"
        df3 = pd.DataFrame(preprocessed_series)
        df3.to_csv(os.path.join(path, newname), sep="\t", compression='gzip')
        service["preprocessed_filename"] = newname
        service["preprocessed_fields"] = list(df3.columns)
        service.update(classes)
    metadata.save(path, data)
Project: rca-evaluation | Author: sieve-microservices
def centroids(path):
    metadata = load_metadata(path)
    d = {}
    for srv in metadata["services"]:
        name = "%s/%s-cluster-1_1.tsv" % (path, srv["name"])
        df = pd.read_csv(name, sep="\t", index_col='time', parse_dates=True)
        d[srv["name"]] = df.centroid
    df2 = pd.DataFrame(d)
    df2 = df2.fillna(method="bfill", limit=1e9)
    df2 = df2.fillna(method="ffill", limit=1e9)
    fig = df2.plot()
    handles, labels = fig.get_legend_handles_labels()
    fig.grid('on')
    lgd = fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5,-0.1))
    plt.savefig("graph.png", bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.close("all")
Project: treecat | Author: posterior
def test_pd_outer_join():
    dfs = [
        pd.DataFrame({
            'id': [0, 1, 2, 3],
            'a': ['foo', 'bar', 'baz', np.nan],
            'b': ['panda', 'zebra', np.nan, np.nan],
        }),
        pd.DataFrame({
            'id': [1, 2, 3, 4],
            'b': ['mouse', np.nan, 'tiger', 'egret'],
            'c': ['toe', 'finger', 'nose', np.nan],
        }),
    ]
    expected = pd.DataFrame({
        'id': [0, 1, 2, 3, 4],
        'a': ['foo', 'bar', 'baz', np.nan, np.nan],
        'b': ['panda', 'zebra', np.nan, 'tiger', 'egret'],
        'c': [np.nan, 'toe', 'finger', 'nose', np.nan],
    }).set_index('id')
    actual = pd_outer_join(dfs, on='id')
    print(expected)
    print(actual)
    assert expected.equals(actual)
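
pd_outer_join itself is defined elsewhere in treecat; a minimal sketch consistent with this test (not necessarily the project's actual implementation):

def pd_outer_join(dfs, on):
    """Outer-join DataFrames on a key column, filling shared columns instead of suffixing."""
    result = dfs[0].set_index(on)
    for df in dfs[1:]:
        # combine_first fills gaps rather than creating _x/_y suffix columns
        result = result.combine_first(df.set_index(on))
    return result
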
Project: GeoInfoMiner | Author: jingge326
def read_image(imagery_path):
    # Read image
    dataset = gdal.Open(imagery_path)
    dsmatrix = dataset.ReadAsArray(xoff=0, yoff=0, xsize=dataset.RasterXSize, ysize=dataset.RasterYSize)

    # Get Geographic meta data
    geo_trans_list = dataset.GetGeoTransform()
    proj_str = dataset.GetProjection()
    num_bands = dataset.RasterCount

    # Handle single-band and multi-band rasters
    if num_bands > 1:
        # Unfold array into pandas DataFrame
        rows = dsmatrix.shape[1]
        cols = dsmatrix.shape[2]
        data_array = dsmatrix[:,0,:]
        for irow in range(1, rows):
            tempmatrix = dsmatrix[:, irow, :]
            data_array = np.hstack((data_array, tempmatrix))
    else:
        # Unfold array into pandas DataFrame
        rows = dsmatrix.shape[0]
        cols = dsmatrix.shape[1]
        data_array = dsmatrix[0,:]
        for irow in range(1, rows):
            tempmatrix = dsmatrix[irow, :]
            data_array = np.hstack((data_array, tempmatrix))

    data_frame = pd.DataFrame(data_array.T)

    return data_frame, rows, cols, geo_trans_list, proj_str, num_bands
Project: xpandas | Author: alan-turing-institute
def __init__(self, *args, **kwargs):
        '''
        Accepts the same arguments as pandas.DataFrame:
        https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html

        The data argument should be a list of XSeries objects or a dict of
        XSeries objects. If a dict is passed, each key must be a string naming
        the corresponding column. For example, to create an XDataFrame, data
        should look like
        data = {'col_1': s_1, 'col_2': s_2, ..., 'col_n': s_n} where each s_i is an XSeries.
        '''
        data = kwargs.get('data')
        if data is None:
            data = args[0]

        data_to_check = []
        if isinstance(data, list):
            data_to_check = data
        elif isinstance(data, dict):
            data_to_check = data.values()

        for d in data_to_check:
            if not isinstance(d, XSeries):
                raise ValueError('All data must be XSeries instances')
        super(XDataFrame, self).__init__(*args, **kwargs)
Project: xpandas | Author: alan-turing-institute
def to_pandas_dataframe(self):
        '''
        Convert self to pandas.DataFrame if all columns hold primitive types.
        See XSeries.to_pandas_series for details.
        :return: self, re-classed as pandas.DataFrame
        '''
        data_types = self.get_data_types()
        is_all_columns_are_primitive = all(
            _is_class_a_primitive(dt)
            for dt in data_types
        )
        if is_all_columns_are_primitive:
            self.__class__ = pd.DataFrame
        else:
            raise ValueError('Unable to cast to pd.DataFrame. {} is not all primitives.'.format(data_types))
        return self
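
A hypothetical round trip, assuming xpandas exposes XSeries and XDataFrame from xpandas.data_container:

from xpandas.data_container import XSeries, XDataFrame

xdf = XDataFrame({'a': XSeries([1, 2, 3]), 'b': XSeries([0.1, 0.2, 0.3])})
pdf = xdf.to_pandas_dataframe()  # succeeds because every column holds primitives
print(type(pdf))                 # <class 'pandas.core.frame.DataFrame'>
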
Project: zipline-chinese | Author: zhanghan1990
def _create_daily_stats(self, perfs):
        # create daily and cumulative stats dataframe
        daily_perfs = []
        # TODO: the loop here could overwrite expected properties
        # of daily_perf. Could potentially raise or log a
        # warning.
        for perf in perfs:
            if 'daily_perf' in perf:

                perf['daily_perf'].update(
                    perf['daily_perf'].pop('recorded_vars')
                )
                perf['daily_perf'].update(perf['cumulative_risk_metrics'])
                daily_perfs.append(perf['daily_perf'])
            else:
                self.risk_report = perf

        daily_dts = [np.datetime64(perf['period_close'], utc=True)
                     for perf in daily_perfs]
        daily_stats = pd.DataFrame(daily_perfs, index=daily_dts)

        return daily_stats
Project: zipline-chinese | Author: zhanghan1990
def _pipeline_output(self, pipeline, chunks):
        """
        Internal implementation of `pipeline_output`.
        """
        today = normalize_date(self.get_datetime())
        try:
            data = self._pipeline_cache.unwrap(today)
        except Expired:
            data, valid_until = self._run_pipeline(
                pipeline, today, next(chunks),
            )
            self._pipeline_cache = CachedObject(data, valid_until)

        # Now that we have a cached result, try to return the data for today.
        try:
            return data.loc[today]
        except KeyError:
            # This happens if no assets passed the pipeline screen on a given
            # day.
            return pd.DataFrame(index=[], columns=data.columns)
Project: zipline-chinese | Author: zhanghan1990
def frame_from_bardata(self, data, algo_dt):
        """
        Create a DataFrame from the given BarData and algo dt.
        """
        data = data._data
        frame_data = np.empty((len(self.fields), len(self.sids))) * np.nan

        for j, sid in enumerate(self.sids):
            sid_data = data.get(sid)
            if not sid_data:
                continue
            if algo_dt != sid_data['dt']:
                continue
            for i, field in enumerate(self.fields):
                frame_data[i, j] = sid_data.get(field, np.nan)

        return pd.DataFrame(
            frame_data,
            index=self.fields.copy(),
            columns=self.sids.copy(),
        )
Project: zipline-chinese | Author: zhanghan1990
def update_dividends(self, new_dividends):
        """
        Update our dividend frame with new dividends.  @new_dividends should be
        a DataFrame with columns containing at least the entries in
        zipline.protocol.DIVIDEND_FIELDS.
        """

        # Mark each new dividend with a unique integer id.  This ensures that
        # we can differentiate dividends whose date/sid fields are otherwise
        # identical.
        new_dividends['id'] = np.arange(
            self._dividend_count,
            self._dividend_count + len(new_dividends),
        )
        self._dividend_count += len(new_dividends)

        self.dividend_frame = sort_values(pd.concat(
            [self.dividend_frame, new_dividends]
        ), ['pay_date', 'ex_date']).set_index('id', drop=False)
Project: zipline-chinese | Author: zhanghan1990
def create_test_panel_ohlc_source(sim_params, env):
    start = sim_params.first_open \
        if sim_params else pd.datetime(1990, 1, 3, 0, 0, 0, 0, pytz.utc)

    end = sim_params.last_close \
        if sim_params else pd.datetime(1990, 1, 8, 0, 0, 0, 0, pytz.utc)

    index = env.days_in_range(start, end)
    price = np.arange(0, len(index)) + 100
    high = price * 1.05
    low = price * 0.95
    open_ = price + .1 * (price % 2 - .5)
    volume = np.ones(len(index)) * 1000
    arbitrary = np.ones(len(index))

    df = pd.DataFrame({'price': price,
                       'high': high,
                       'low': low,
                       'open': open_,
                       'volume': volume,
                       'arbitrary': arbitrary},
                      index=index)
    panel = pd.Panel.from_dict({0: df})

    return DataPanelSource(panel), panel
Project: zipline-chinese | Author: zhanghan1990
def add_frame(self, tick, frame, minor_axis=None, items=None):
        """
        """
        if self._pos == self.cap:
            self._roll_data()

        if isinstance(frame, pd.DataFrame):
            minor_axis = frame.columns
            items = frame.index

        if set(minor_axis).difference(set(self.minor_axis)) or \
                set(items).difference(set(self.items)):
            self._update_buffer(frame)

        vals = frame.T.astype(self.dtype)
        self.buffer.loc[:, self._pos, :] = vals
        self.date_buf[self._pos] = tick

        self._pos += 1
Project: zipline-chinese | Author: zhanghan1990
    def getindexdaily(self, code, start, end):
        startdate = datetime.datetime.strptime(start, "%Y-%m-%d")
        enddate = datetime.datetime.strptime(end, "%Y-%m-%d")
        series = {"date": [], "open": [], "close": [], "high": [], "low": [], "volume": []}

        for stockdaily in self.index[code].find({"date": {"$gte": startdate,"$lt":enddate}}).sort("date"):
            series["date"].append(stockdaily["date"])
            series["open"].append(stockdaily["open"])
            series["close"].append(stockdaily["close"])
            series["high"].append(stockdaily["high"])
            series["low"].append(stockdaily["low"])
            series["volume"].append(stockdaily["volume"])

        totaldata = zip(series['date'], series['open'], series['close'], series['high'], series['low'], series['volume'])
        # Name the columns explicitly; otherwise they default to 0..5 and df.date fails.
        df = pd.DataFrame(list(totaldata), columns=['date', 'open', 'close', 'high', 'low', 'volume'])
        df.index = df.date
        return df
Project: zipline-chinese | Author: zhanghan1990
def read_treasure_from_mongodb(self,start,end):

        startdate=start
        enddate=end
        series={"Time Period":[],"1month":[],"3month":[],"6month":[],"1year":[],"2year":[],"3year":[],"5year":[],"7year":[],"10year":[],"20year":[],"30year":[]}
        if type(start) is types.StringType:
            startdate = datetime.datetime.strptime(start, "%Y-%m-%d")
        if type(end) is types.StringType:
            enddate=datetime.datetime.strptime(end, "%Y-%m-%d")
        for treasuredaily in self.treasure['treasure'].find({"Time Period": {"$gte": startdate,"$lt":enddate}}).sort("date"):
            series["Time Period"].append(treasuredaily["Time Period"])
            series["1month"].append(treasuredaily["1month"])
            series["3month"].append(treasuredaily["3month"])
            series["6month"].append(treasuredaily["6month"])
            series["1year"].append(treasuredaily["1year"])
            series["2year"].append(treasuredaily["2year"])
            series["3year"].append(treasuredaily["3year"])
            series["5year"].append(treasuredaily["5year"])
            series["7year"].append(treasuredaily["7year"])
            series["10year"].append(treasuredaily["10year"])
            series["20year"].append(treasuredaily["20year"])
            series["30year"].append(treasuredaily["30year"])
        totaldata=zip(series["1month"],series["3month"],series["6month"],series["1year"],series["2year"],series["3year"],series["5year"],series["7year"],series["10year"],series["20year"],series["30year"])
        df = pd.DataFrame(data=list(totaldata),index=series["Time Period"],columns = ['1month', '3month','6month', '1year', '2year', '3year', '5year', '7year', '10year', '20year', '30year'])
        return df.sort_index().tz_localize('UTC')
Project: zipline-chinese | Author: zhanghan1990
def __init__(self, constants, dates, sids):
        loaders = {}
        for column, const in iteritems(constants):
            frame = DataFrame(
                const,
                index=dates,
                columns=sids,
                dtype=column.dtype,
            )
            loaders[column] = DataFrameLoader(
                column=column,
                baseline=frame,
                adjustments=None,
            )

        self._loaders = loaders
Project: zipline-chinese | Author: zhanghan1990
def test_consume_metadata(self):

        # Test dict consumption
        dict_to_consume = {0: {'symbol': 'PLAY'},
                           1: {'symbol': 'MSFT'}}
        self.env.write_data(equities_data=dict_to_consume)
        finder = self.asset_finder_type(self.env.engine)

        equity = finder.retrieve_asset(0)
        self.assertIsInstance(equity, Equity)
        self.assertEqual('PLAY', equity.symbol)

        # Test dataframe consumption
        df = pd.DataFrame(columns=['asset_name', 'exchange'], index=[0, 1])
        df['asset_name'][0] = "Dave'N'Busters"
        df['exchange'][0] = "NASDAQ"
        df['asset_name'][1] = "Microsoft"
        df['exchange'][1] = "NYSE"
        self.env = TradingEnvironment(load=noop_load)
        self.env.write_data(equities_df=df)
        finder = self.asset_finder_type(self.env.engine)
        self.assertEqual('NASDAQ', finder.retrieve_asset(0).exchange)
        self.assertEqual('Microsoft', finder.retrieve_asset(1).asset_name)
Project: zipline-chinese | Author: zhanghan1990
def setUp(self):
        self.env = TradingEnvironment()
        self.days = self.env.trading_days[:5]
        self.panel = pd.Panel({1: pd.DataFrame({
            'price': [1, 1, 2, 4, 8], 'volume': [1e9, 1e9, 1e9, 1e9, 0],
            'type': [DATASOURCE_TYPE.TRADE,
                     DATASOURCE_TYPE.TRADE,
                     DATASOURCE_TYPE.TRADE,
                     DATASOURCE_TYPE.TRADE,
                     DATASOURCE_TYPE.CLOSE_POSITION]},
            index=self.days)
        })
        self.no_close_panel = pd.Panel({1: pd.DataFrame({
            'price': [1, 1, 2, 4, 8], 'volume': [1e9, 1e9, 1e9, 1e9, 1e9],
            'type': [DATASOURCE_TYPE.TRADE,
                     DATASOURCE_TYPE.TRADE,
                     DATASOURCE_TYPE.TRADE,
                     DATASOURCE_TYPE.TRADE,
                     DATASOURCE_TYPE.TRADE]},
            index=self.days)
        })
Project: zipline-chinese | Author: zhanghan1990
def test_bfill(self):
        # test ndim=1
        N = 100
        s = pd.Series(np.random.randn(N))
        mask = random.sample(range(N), 10)
        s.iloc[mask] = np.nan

        correct = s.bfill().values
        test = bfill(s.values)
        assert_almost_equal(correct, test)

        # test ndim=2
        df = pd.DataFrame(np.random.randn(N, N))
        df.iloc[mask] = np.nan
        correct = df.bfill().values
        test = bfill(df.values)
        assert_almost_equal(correct, test)
Project: zipline-chinese | Author: zhanghan1990
def test_ffill(self):
        # test ndim=1
        N = 100
        s = pd.Series(np.random.randn(N))
        mask = random.sample(range(N), 10)
        s.iloc[mask] = np.nan

        correct = s.ffill().values
        test = ffill(s.values)
        assert_almost_equal(correct, test)

        # test ndim=2
        df = pd.DataFrame(np.random.randn(N, N))
        df.iloc[mask] = np.nan
        correct = df.ffill().values
        test = ffill(df.values)
        assert_almost_equal(correct, test)
Project: zipline-chinese | Author: zhanghan1990
def get_expected_next_event_dates(self, dates):
        return pd.DataFrame({
            0: get_values_for_date_ranges(zip_with_dates,
                                          next_dates[0],
                                          next_date_intervals[0],
                                          dates),
            1: get_values_for_date_ranges(zip_with_dates,
                                          next_dates[1],
                                          next_date_intervals[1],
                                          dates),
            2: get_values_for_date_ranges(zip_with_dates,
                                          next_dates[2],
                                          next_date_intervals[2],
                                          dates),
            3: get_values_for_date_ranges(zip_with_dates,
                                          next_dates[3],
                                          next_date_intervals[3],
                                          dates),
            4: zip_with_dates(dates, ['NaT'] * len(dates)),
        }, index=dates)
Project: zipline-chinese | Author: zhanghan1990
def get_expected_previous_event_dates(self, dates):
        return pd.DataFrame({
            0: get_values_for_date_ranges(zip_with_dates,
                                          prev_dates[0],
                                          prev_date_intervals[0],
                                          dates),
            1: get_values_for_date_ranges(zip_with_dates,
                                          prev_dates[1],
                                          prev_date_intervals[1],
                                          dates),
            2: get_values_for_date_ranges(zip_with_dates,
                                          prev_dates[2],
                                          prev_date_intervals[2],
                                          dates),
            3: get_values_for_date_ranges(zip_with_dates,
                                          prev_dates[3],
                                          prev_date_intervals[3],
                                          dates),
            4: zip_with_dates(dates, ['NaT'] * len(dates)),
        }, index=dates)
Project: zipline-chinese | Author: zhanghan1990
def get_vals_for_dates(zip_date_index_with_vals,
                       vals,
                       date_intervals,
                       dates):
    return pd.DataFrame({
        0: get_values_for_date_ranges(zip_date_index_with_vals,
                                      vals[0],
                                      date_intervals[0],
                                      dates),
        1: get_values_for_date_ranges(zip_date_index_with_vals,
                                      vals[1],
                                      date_intervals[1],
                                      dates),
        2: get_values_for_date_ranges(zip_date_index_with_vals,
                                      vals[2],
                                      date_intervals[2],
                                      dates),
        # Assume the latest of 2 cash values is used if we find out about 2
        # announcements that happened on the same day for the same sid.
        3: get_values_for_date_ranges(zip_date_index_with_vals,
                                      vals[3],
                                      date_intervals[3],
                                      dates),
        4: zip_date_index_with_vals(dates, ['NaN'] * len(dates)),
    }, index=dates)
Project: zipline-chinese | Author: zhanghan1990
def test_auto_deltas(self):
        expr = bz.data(
            {'ds': self.df,
             'ds_deltas': pd.DataFrame(columns=self.df.columns)},
            dshape=var * Record((
                ('ds', self.dshape.measure),
                ('ds_deltas', self.dshape.measure),
            )),
        )
        loader = BlazeLoader()
        ds = from_blaze(
            expr.ds,
            loader=loader,
            missing_values=self.missing_values,
        )
        self.assertEqual(len(loader), 1)
        exprdata = loader[ds]
        self.assertTrue(exprdata.expr.isidentical(expr.ds))
        self.assertTrue(exprdata.deltas.isidentical(expr.ds_deltas))
Project: zipline-chinese | Author: zhanghan1990
def pipeline_event_loader_args(self, dates):
        _, mapping = super(
            BlazeCashBuybackAuthLoaderTestCase,
            self,
        ).pipeline_event_loader_args(dates)
        return (bz.data(pd.concat(
            pd.DataFrame({
                BUYBACK_ANNOUNCEMENT_FIELD_NAME:
                    frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME],
                CASH_FIELD_NAME:
                    frame[CASH_FIELD_NAME],
                TS_FIELD_NAME:
                    frame[TS_FIELD_NAME],
                SID_FIELD_NAME: sid,
            })
            for sid, frame in iteritems(mapping)
        ).reset_index(drop=True)),)
Project: cellranger | Author: 10XGenomics
def file_get_iem_data_frame(path):
    """
    Return the IEM samplesheet data as a Pandas DataFrame,
    to perform better slicing operations.
    """
    rows = read_csv_rows(path)
    if not rows_are_iem_samplesheet(rows):
        raise ValueError("Invalid IEM samplesheet format: %s" % path)
    section_gen = rows_iem_section_generator(rows)
    for section in section_gen:
        if section_is_valid_data(section):
            # TODO this appears to be a problem if you have data columns
            # with trailing all-blank entries (see CSI-215 fix)
            df = pd.DataFrame(data=section.rows[1:], columns=section.rows[0])
            # skip trailing rows
            return df[df['Sample_ID'].notnull()]
    raise ValueError("Invalid IEM samplesheet format, no data found: %s" % path)
Project: dcan-tensorflow | Author: lisjin
def gen_csv_paths(data_dir, pref):
    """
    Generate CSV file from image, contour, and segment file paths.
    Args:
        data_dir: BBBC006 data directory path.
        pref: Prefix (either 'train' or 'test')
    """
    filenames = get_png_files(os.path.join(data_dir, 'BBBC006_v1_' + pref))
    contours = get_png_files(os.path.join(data_dir, 'BBBC006_v1_contours_'
                                                     + pref))
    segments = get_png_files(os.path.join(data_dir, 'BBBC006_v1_segments_'
                                                     + pref))

    all_files = [filenames, contours, segments]
    pd_arr = pd.DataFrame(all_files).transpose()
    pd_arr.to_csv(pref + '.csv', index=False, header=False)
Project: bitrader | Author: jr-minnaar
def kraken_order_book(book_type: str, currency_code: str = 'EUR', coin_code: str = 'XBT'):
    """Kraken specific orderbook retrieval

    """
    import krakenex

    kraken_api = krakenex.API(key=KRAKEN_API_KEY, secret=KRAKEN_PRIVATE_KEY, conn=krakenex.Connection())

    pair = f'X{coin_code}Z{currency_code}'
    orders = kraken_api.query_public('Depth', {'pair': pair})

    df = pd.DataFrame(
        orders['result'][pair][book_type],
        columns=['price', 'volume', 'timestamp'])

    return df
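
Kraken's public Depth endpoint returns prices and volumes as strings, so a numeric conversion usually follows; a hedged post-processing sketch for the df above:

df = df.apply(pd.to_numeric)                                 # string columns -> numbers
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')  # epoch seconds -> datetimes
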
Project: recom-system | Author: tizot
def get_irrelevant_cited_papers(bad_papers, db_cursor, papers_table='papers'):
    """Retrieves the papers cited by the irrelevant papers given in input, from a SQL database.

    Args:
        bad_papers (list of dicts): the list of irrelevant papers, formatted as the output of :func:`data_retrieval.list2paper`
        db_cursor (:class:`MySQLdb.cursors.Cursor`): cursor of a SQL database in which there is a papers table
        papers_table (string): name of the papers table in the SQL database

    Returns:
        tuple of tuples: the results of the SQL query
    """
    citations = []
    for p in bad_papers:
        for c in p['citations']:
            citations.append([p['index'], c])

    citations_df = pd.DataFrame(citations, columns=['citing', 'cited'])
    cited = citations_df['cited'].unique()

    db_cursor.execute("SELECT id, title, abstract FROM papers p WHERE p.abstract != '' AND p.id IN (" + ','.join(["%s"] * len(cited)) + ")", tuple(cited))

    return db_cursor.fetchall()
Project: BioNanoAnalyst | Author: AppliedBioinformatics
def parse_fasta(self):
        self.ref_id=dict()
        self.ref_inf=dict()
        i=1
        N = 0
        ref_inf=np.empty(shape=[0,3])
        for seqs in SeqIO.parse(self.ref,'fasta'):
            seq_id = seqs.id
            self.ref_id[i] = seq_id
            seq = str(seqs.seq.upper())
            seq_len = len(seq)
            self.ref_inf[seq_id]=seq_len
            N+=seq.count('N')
            ref_inf = np.append(ref_inf,[[i,seq_id,seq_len]],axis=0)
            i+=1
        self.ref_detail = pd.DataFrame(ref_inf,columns=['Index','Contig','Length(bp)'])
        self.N = N
Project: BioNanoAnalyst | Author: AppliedBioinformatics
def qualification_filter(self):
        """
        Providing information of those unqualified and qualified contigs from the orginal fasta file
        with the criterion: >20Kb & >=5 restriction sites inside.
        """
        unqualified = np.empty(shape=[0,3])
        qualified = np.empty(shape=[0,4])
        rm_dup = self.RcmapTable[['CMapId','ContigLength','NumSites']].drop_duplicates()
        for i in self.ref_id.keys():
            index = i
            name = self.ref_id[i]
            length = self.ref_inf[name]
            if i not in self.RcmapTable['CMapId'].unique():
                unqualified = np.append(unqualified,[[index,name, length]],axis=0)
            else:
                Id = rm_dup[rm_dup['CMapId']==i].index[0]
                sites = rm_dup['NumSites'][Id]
                qualified = np.append(qualified,[[index,name,length,sites]],axis=0)
        self.unqualified = pd.DataFrame(unqualified, columns=['index','contig','length(bp)'])
        self.qualified = pd.DataFrame(qualified, columns=['index','contig','length(bp)','numSites'])
Project: dask_gdf | Author: gpuopenanalytics
def test_append():
    np.random.seed(0)

    n = 1000
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=n),
                       'y': np.random.normal(size=n)})

    gdf = gd.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)

    # Combine with .append
    head = frags[0]
    tail = frags[1:]

    appended = dgd.from_pygdf(head, npartitions=1)
    for each in tail:
        appended = appended.append(each)

    assert_frame_equal(df, appended.compute().to_pandas())
Project: dask_gdf | Author: gpuopenanalytics
def test_series_append():
    np.random.seed(0)

    n = 1000
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=n),
                       'y': np.random.normal(size=n)})

    gdf = gd.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)

    frags = [df.x for df in frags]

    appending = dgd.from_pygdf(frags[0], npartitions=1)
    for frag in frags[1:]:
        appending = appending.append(frag)

    appended = appending.compute().to_pandas()
    assert isinstance(appended, pd.Series)
    np.testing.assert_array_equal(appended, df.x)
Project: dask_gdf | Author: gpuopenanalytics
def test_set_index(nelem):
    np.random.seed(0)
    # Use unique index range as the sort may not be stable-ordering
    x = np.arange(nelem)
    np.random.shuffle(x)
    df = pd.DataFrame({'x': x,
                       'y': np.random.randint(0, nelem, size=nelem)})
    ddf = dd.from_pandas(df, npartitions=2)
    dgdf = dgd.from_dask_dataframe(ddf)

    expect = ddf.set_index('x').compute()
    got = dgdf.set_index('x').compute().to_pandas()

    np.testing.assert_array_equal(got.index.values, expect.index.values)
    np.testing.assert_array_equal(got.y.values, expect.y.values)
    assert got.columns == expect.columns
Project: dask_gdf | Author: gpuopenanalytics
def test_groupby_single_key(keygen):
    np.random.seed(0)

    nelem = 500
    npartitions = 10

    # Generate the keys
    xs = keygen(nelem)

    assert xs.size == nelem
    df = pd.DataFrame({'x': xs,
                       'z': np.random.normal(size=nelem) + 1})
    gdf = gd.DataFrame.from_pandas(df)
    dgf = dgd.from_pygdf(gdf, npartitions=npartitions)

    groups = dgf.groupby(by=['x']).count()
    got = groups.compute().to_pandas()

    # Check against expectation
    expect = df.groupby(by=['x'], as_index=False).count()
    # Check keys
    np.testing.assert_array_equal(got.x, expect.x)
    # Check values
    np.testing.assert_array_equal(got.z, expect.z)
Project: deep-summarization | Author: harpribot
def store_test_predictions(self, prediction_id='_final'):
        """
        Stores the test predictions in a CSV file

        :param prediction_id: A simple id appended to the name of the summary for uniqueness
        :return: None
        """
        # prediction id is usually the step count
        print 'Storing predictions on Test Data...'
        review = []
        true_summary = []
        generated_summary = []
        for i in range(self.test_size):
            if not self.checkpointer.is_output_file_present():
                review.append(self._index2sentence(self.test_review[i]))
                true_summary.append(self._index2sentence(self.true_summary[i]))
            if i < (self.test_batch_size * (self.test_size // self.test_batch_size)):
                generated_summary.append(self._index2sentence(self.predicted_test_summary[i]))
            else:
                generated_summary.append('')

        prediction_nm = 'generated_summary' + prediction_id
        if self.checkpointer.is_output_file_present():
            df = pd.read_csv(self.checkpointer.get_result_location(), header=0)
            df[prediction_nm] = np.array(generated_summary)
        else:
            df = pd.DataFrame()
            df['review'] = np.array(review)
            df['true_summary'] = np.array(true_summary)
            df[prediction_nm] = np.array(generated_summary)
        df.to_csv(self.checkpointer.get_result_location(), index=False)
        print 'Stored the predictions. Moving Forward'
        if prediction_id == '_final':
            print 'All done. Exiting..'
            print 'Exited'
Project: deep-summarization | Author: harpribot
def crawl_for_reviews_and_summary(self, input_file):
        """
        Crawl the input dataset

        :param input_file: The location of the file containing the txt file dataset
        :return: None
        """
        self.raw_data_file = input_file
        self.df = pd.DataFrame()
        self.df['Review'] = self.__crawl_review()
        self.df['Summary'] = self.__crawl_summary()
Project: numerai | Author: gansanay
from scipy.stats import pearsonr

def pearson(X, y):
    r = []
    p = []
    for c in X.columns:
        r_, p_ = pearsonr(X[c], y)
        r.append(r_)
        p.append(p_)
    dfr = pd.DataFrame(index=range(1, 1+len(X.columns)))
    dfr['pearson'] = r
    dfr['pearson_p'] = p
    return dfr
Project: numerai | Author: gansanay
from scipy.stats import ks_2samp

def kolmogorov_smirnov(x_train, x_test):
    r = []
    p = []
    for c in x_train.columns:
        r_, p_ = ks_2samp(x_train[c], x_test[c])
        r.append(r_)
        p.append(p_)
    dfks = pd.DataFrame(index=range(1, 1 + len(x_train.columns)))
    dfks['KS'] = r
    dfks['KS_p'] = p
    return dfks
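
A hypothetical end-to-end check with random data, assuming both helpers above are in scope:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X_train = pd.DataFrame(rng.randn(200, 3), columns=['f1', 'f2', 'f3'])
X_test = pd.DataFrame(rng.randn(100, 3), columns=['f1', 'f2', 'f3'])
y = pd.Series(rng.randn(200))

print(pearson(X_train, y))                  # per-feature correlation with the target
print(kolmogorov_smirnov(X_train, X_test))  # per-feature train/test drift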