Python pandas module: concat() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use pandas.concat().
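Before the project snippets, here is a minimal self-contained sketch (not taken from any of the projects below; the frames a and b are made up for illustration) of the two concat patterns that recur throughout this page: stacking frames row-wise and aligning them column-wise.

import pandas as pd

a = pd.DataFrame({'x': [1, 2], 'y': [3, 4]})
b = pd.DataFrame({'y': [5, 6], 'z': [7, 8]})

# stack row-wise (axis=0 is the default); columns are aligned by name,
# labels missing from one frame become NaN, ignore_index renumbers the rows
rows = pd.concat([a, b], ignore_index=True)

# place the frames side by side (axis=1), aligning on the index
cols = pd.concat([a, b], axis=1)

print(rows)
print(cols)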

Project: HousePricePredictionKaggle    Author: Nuwantha    | project source | file source
def data_preprocess(train,test):
    outlier_idx = [4,11,13,20,46,66,70,167,178,185,199, 224,261, 309,313,318, 349,412,423,440,454,477,478, 523,540, 581,588,595,654,688, 691, 774, 798, 875, 898,926,970,987,1027,1109, 1169,1182,1239, 1256,1298,1324,1353,1359,1405,1442,1447]
    train.drop(train.index[outlier_idx],inplace=True)
    all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                          test.loc[:,'MSSubClass':'SaleCondition']))

    to_delete = ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
    all_data = all_data.drop(to_delete,axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    #log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice

    return X_train,X_test,y
Project: HousePricePredictionKaggle    Author: Nuwantha    | project source | file source
def data_preprocess(train,test):
    outlier_idx = [4,11,13,20,46,66,70,167,178,185,199, 224,261, 309,313,318, 349,412,423,440,454,477,478, 523,540, 581,588,595,654,688, 691, 774, 798, 875, 898,926,970,987,1027,1109, 1169,1182,1239, 1256,1298,1324,1353,1359,1405,1442,1447]
    train.drop(train.index[outlier_idx],inplace=True)
    all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                          test.loc[:,'MSSubClass':'SaleCondition']))

    to_delete = ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
    all_data = all_data.drop(to_delete,axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    #log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice

    return X_train,X_test,y
Project: HousePricePredictionKaggle    Author: Nuwantha    | project source | file source
def data_preprocess(train, test):
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318, 349, 412, 423, 440, 454, 477,
                   478, 523, 540, 581, 588, 595, 654, 688, 691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169,
                   1182, 1239, 1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx], inplace=True)
    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))

    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    # log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(method='ffill')
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice

    return X_train, X_test, y
Project: pylspm    Author: lseman    | project source | file source
def do_work_pso(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
    output = pd.DataFrame(population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))

        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))

            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
#    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
Project: pylspm    Author: lseman    | project source | file source
def do_work_ga(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
    output = pd.DataFrame(population[item].genes)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))

        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))

            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    return (1 / np.sum(f1))

# Main
Project: pylspm    Author: lseman    | project source | file source
def do_work_pso(data, LVcsv, Mcsv, scheme, reg, h, maximo):
    # note: item, nclusters and population are not parameters of this variant;
    # it is assumed they are available from the enclosing (module) scope
    output = pd.DataFrame(population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))

        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))

            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
Project: pylspm    Author: lseman    | project source | file source
def do_work_ga(self, item):
        output = pd.DataFrame(self.population[item].genes)
        output.columns = ['Split']
        dataSplit = pd.concat([self.data, output], axis=1)
        f1 = []
        results = []
        for i in range(self.nclusters):
            dataSplited = (dataSplit.loc[dataSplit['Split']
                                         == i]).drop('Split', axis=1)
            dataSplited.index = range(len(dataSplited))

            try:
                results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
                                      self.reg, 0, 50, HOC='true'))

                resid = results[i].residuals()[3]
                f1.append(resid)
            except:
                f1.append(10000)
        print((1 / np.sum(f1)))
        return (1 / np.sum(f1))
Project: pylspm    Author: lseman    | project source | file source
def do_work_pso(self, item):
        output = pd.DataFrame(self.population[item].position)
        output.columns = ['Split']
        dataSplit = pd.concat([self.data, output], axis=1)
        f1 = []
        results = []
        for i in range(self.nclusters):
            dataSplited = (dataSplit.loc[dataSplit['Split']
                                         == i]).drop('Split', axis=1)
            dataSplited.index = range(len(dataSplited))

            try:
                results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
                                      self.reg, 0, 50, HOC='true'))

                resid = results[i].residuals()[3]
                f1.append(resid)
            except:
                f1.append(10000)
        print((1 / np.sum(f1)))
        return (1 / np.sum(f1))
Project: IgDiscover    Author: NBISweden    | project source | file source
def merged(self, s, t):
        chars = []
        for c1, c2 in zip_longest(s.sequence, t.sequence):
            if c1 is None:
                c = c2
            elif c2 is None:
                c = c1
            elif c1 == 'N':
                c = c2
            elif c2 == 'N':
                c = c1
            elif c1 != c2:
                return None
            else:
                assert c1 == c2
                c = c1
            chars.append(c)
        seq = ''.join(chars)
        requested = s.requested or t.requested
        name = s.name + ';' + t.name
        # take union of groups
        group = pd.concat([s.group, t.group]).groupby(level=0).last()
        return SiblingInfo(seq, requested, name, group)
Project: zipline-chinese    Author: zhanghan1990    | project source | file source
def update_dividends(self, new_dividends):
        """
        Update our dividend frame with new dividends.  @new_dividends should be
        a DataFrame with columns containing at least the entries in
        zipline.protocol.DIVIDEND_FIELDS.
        """

        # Mark each new dividend with a unique integer id.  This ensures that
        # we can differentiate dividends whose date/sid fields are otherwise
        # identical.
        new_dividends['id'] = np.arange(
            self._dividend_count,
            self._dividend_count + len(new_dividends),
        )
        self._dividend_count += len(new_dividends)

        self.dividend_frame = sort_values(pd.concat(
            [self.dividend_frame, new_dividends]
        ), ['pay_date', 'ex_date']).set_index('id', drop=False)
Project: zipline-chinese    Author: zhanghan1990    | project source | file source
def pipeline_event_loader_args(self, dates):
        _, mapping = super(
            BlazeCashBuybackAuthLoaderTestCase,
            self,
        ).pipeline_event_loader_args(dates)
        return (bz.data(pd.concat(
            pd.DataFrame({
                BUYBACK_ANNOUNCEMENT_FIELD_NAME:
                    frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME],
                CASH_FIELD_NAME:
                    frame[CASH_FIELD_NAME],
                TS_FIELD_NAME:
                    frame[TS_FIELD_NAME],
                SID_FIELD_NAME: sid,
            })
            for sid, frame in iteritems(mapping)
        ).reset_index(drop=True)),)
Project: zipline-chinese    Author: zhanghan1990    | project source | file source
def pipeline_event_loader_args(self, dates):
        _, mapping = super(
            BlazeShareBuybackAuthLoaderTestCase,
            self,
        ).pipeline_event_loader_args(dates)
        return (bz.data(pd.concat(
            pd.DataFrame({
                BUYBACK_ANNOUNCEMENT_FIELD_NAME:
                    frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME],
                SHARE_COUNT_FIELD_NAME:
                    frame[SHARE_COUNT_FIELD_NAME],
                TS_FIELD_NAME:
                    frame[TS_FIELD_NAME],
                SID_FIELD_NAME: sid,
            })
            for sid, frame in iteritems(mapping)
        ).reset_index(drop=True)),)
Project: table-compositor    Author: InvestmentSystems    | project source | file source
def load_names_data():
    fp = os.path.join(tempfile.gettempdir(), ZIP_NAME)
    if not os.path.exists(fp):
        r = requests.get(URL_NAMES)
        with open(fp, 'wb') as f:
            f.write(r.content)

    post = collections.OrderedDict()
    with zipfile.ZipFile(fp) as zf:
        # get ZipInfo instances
        for zi in sorted(zf.infolist(), key=lambda zi: zi.filename):
            fn = zi.filename
            if fn.startswith('yob'):
                year = int(fn[3:7])
                df = pd.read_csv(
                    zf.open(zi),
                    header=None,
                    names=('name', 'gender', 'count'))
                df['year'] = year
                post[year] = df

        df = pd.concat(post.values())
        df.set_index('name', inplace=True, drop=True)
        return df
Project: table-compositor    Author: InvestmentSystems    | project source | file source
def load_names_data():
    fp = os.path.join(tempfile.gettempdir(), ZIP_NAME)
    if not os.path.exists(fp):
        r = requests.get(URL_NAMES)
        with open(fp, 'wb') as f:
            f.write(r.content)

    post = collections.OrderedDict()
    with zipfile.ZipFile(fp) as zf:
        # get ZipInfo instances
        for zi in sorted(zf.infolist(), key=lambda zi: zi.filename):
            fn = zi.filename
            if fn.startswith('yob'):
                year = int(fn[3:7])
                df = pd.read_csv(
                    zf.open(zi),
                    header=None,
                    names=('name', 'gender', 'count'))
                df['year'] = year
                post[year] = df

        df = pd.concat(post.values())
        df.set_index('name', inplace=True, drop=True)
        return df
Project: soccerstan    Author: Torvaney    | project source | file source
def read_data(fname):
    """ Read football-data.co.uk csv """
    data = (
        pd.read_csv(fname)
        .rename(columns={
                'HomeTeam': 'home_team',
                'AwayTeam': 'away_team',
                'FTHG': 'home_goals',
                'FTAG': 'away_goals'
            })
        .loc[lambda df: ~pd.isnull(df['home_goals'])]  # Remove future games
    )

    team_map = stan_map(pd.concat([data['home_team'], data['away_team']]))
    data['home_team_id'] = data['home_team'].replace(team_map)
    data['away_team_id'] = data['away_team'].replace(team_map)


    for col in ('home_goals', 'away_goals'):
        data[col] = [int(c) for c in data[col]]

    return data, team_map
Project: QUANTAXIS    Author: yutiansut    | project source | file source
def QA_fetch_get_security_bars(code, _type, lens, ip=best_ip['stock'], port=7709):
    api = TdxHq_API()
    with api.connect(ip, port):
        data = pd.concat([api.to_df(api.get_security_bars(_select_type(_type), _select_market_code(
            code), code, (i - 1) * 800, 800)) for i in range(1, int(lens / 800) + 2)], axis=0)
        data = data\
            .assign(datetime=pd.to_datetime(data['datetime']), code=str(code))\
            .drop(['year', 'month', 'day', 'hour', 'minute'], axis=1, inplace=False)\
            .assign(date=data['datetime'].apply(lambda x: str(x)[0:10]))\
            .assign(date_stamp=data['datetime'].apply(lambda x: QA_util_date_stamp(x)))\
            .assign(time_stamp=data['datetime'].apply(lambda x: QA_util_time_stamp(x)))\
            .assign(type=_type).set_index('datetime', drop=False, inplace=False).tail(lens)
        if data is not None:
            return data
        else:
            return None
Project: QUANTAXIS    Author: yutiansut    | project source | file source
def QA_fetch_get_stock_block(ip=best_ip['stock'], port=7709):
    'stock sector/block data'
    api = TdxHq_API()
    with api.connect(ip, port):

        data = pd.concat([api.to_df(api.get_and_parse_block_info("block_gn.dat")).assign(type='gn'),
                          api.to_df(api.get_and_parse_block_info(
                              "block.dat")).assign(type='yb'),
                          api.to_df(api.get_and_parse_block_info(
                              "block_zs.dat")).assign(type='zs'),
                          api.to_df(api.get_and_parse_block_info("block_fg.dat")).assign(type='fg')])

        if len(data) > 10:
            return data.assign(source='tdx').drop(['block_type', 'code_index'], axis=1).set_index('code', drop=False, inplace=False).drop_duplicates()
        else:
            QA_util_log_info('Wrong with fetch block ')
Project: QUANTAXIS    Author: yutiansut    | project source | file source
def QA_fetch_get_future_day(code, start_date, end_date, level='day', ip=best_ip['future'], port=7727):
    'futures data, daily bars'

    apix = TdxExHq_API()
    start_date = str(start_date)[0:10]
    today_ = datetime.date.today()
    lens = QA_util_get_trade_gap(start_date, today_)
    global extension_market_info
    extension_market_info=QA_fetch_get_future_list() if extension_market_info is None else extension_market_info

    with apix.connect(ip, port):
        code_market = extension_market_info.query('code=="{}"'.format(code))

        data = pd.concat([apix.to_df(apix.get_instrument_bars(_select_type(
            level), int(code_market.market), str(code),(int(lens / 700) - i) * 700, 700))for i in range(int(lens / 700) + 1)], axis=0)
        data = data.assign(date=data['datetime'].apply(lambda x: str(x[0:10]))).assign(code=str(code))\
            .assign(date_stamp=data['datetime'].apply(lambda x: QA_util_date_stamp(str(x)[0:10]))).set_index('date', drop=False, inplace=False)

        return data.drop(['year', 'month', 'day', 'hour', 'minute', 'datetime'], axis=1)[start_date:end_date].assign(date=data['date'].apply(lambda x: str(x)[0:10]))
Project: QUANTAXIS    Author: yutiansut    | project source | file source
def QA_data_make_qfq(bfq_data, xdxr_data):
    'forward-adjust (qfq) prices using xdxr data'
    info = xdxr_data[xdxr_data['category'] == 1]
    bfq_data['if_trade'] = 1
    data = pd.concat([bfq_data, info[['category']]
                      [bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
    data['if_trade'].fillna(value=0, inplace=True)
    data = data.fillna(method='ffill')
    data = pd.concat([data, info[['fenhong', 'peigu', 'peigujia',
                                  'songzhuangu']][bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
    data = data.fillna(0)
    data['preclose'] = (data['close'].shift(1) * 10 - data['fenhong'] + data['peigu']
                        * data['peigujia']) / (10 + data['peigu'] + data['songzhuangu'])
    data['adj'] = (data['preclose'].shift(-1) /
                   data['close']).fillna(1)[::-1].cumprod()
    data['open'] = data['open'] * data['adj']
    data['high'] = data['high'] * data['adj']
    data['low'] = data['low'] * data['adj']
    data['close'] = data['close'] * data['adj']
    data['preclose'] = data['preclose'] * data['adj']

    return data.query('if_trade==1').drop(['fenhong', 'peigu', 'peigujia', 'songzhuangu',
                                           'if_trade', 'category'], axis=1).query("open != 0")
Project: QUANTAXIS    Author: yutiansut    | project source | file source
def QA_data_make_hfq(bfq_data, xdxr_data):
    'backward-adjust (hfq) prices using xdxr data'
    info = xdxr_data[xdxr_data['category'] == 1]
    bfq_data['if_trade'] = 1
    data = pd.concat([bfq_data, info[['category']]
                      [bfq_data.index[0]:bfq_data.index[-1]]], axis=1)

    data['if_trade'].fillna(value=0, inplace=True)
    data = data.fillna(method='ffill')

    data = pd.concat([data, info[['fenhong', 'peigu', 'peigujia',
                                  'songzhuangu']][bfq_data.index[0]:bfq_data.index[-1]]], axis=1)

    data = data.fillna(0)
    data['preclose'] = (data['close'].shift(1) * 10 - data['fenhong'] + data['peigu']
                        * data['peigujia']) / (10 + data['peigu'] + data['songzhuangu'])
    data['adj'] = (data['preclose'].shift(-1) /
                   data['close']).fillna(1).cumprod()
    data['open'] = data['open'] / data['adj']
    data['high'] = data['high'] / data['adj']
    data['low'] = data['low'] / data['adj']
    data['close'] = data['close'] / data['adj']
    data['preclose'] = data['preclose'] / data['adj']
    return data.query('if_trade==1').drop(['fenhong', 'peigu', 'peigujia', 'songzhuangu'], axis=1).query("open != 0")
Project: kaggle-review    Author: daxiongshu    | project source | file source
def get_text_len(DB, tr, te):
    if tr is None:
        if te=='stage1':
            Data = [DB.data['training_text'],DB.data['test_text_filter']]
        else:
            Data = [pd.concat([DB.data['training_text'],DB.data['test_text_filter']],axis=0),DB.data['stage2_test_text']]
    else:
        Data = [DB.data['training_text']]
    for data in Data:
        data['tl'] = data['Text'].apply(lambda x:len(x))
        data['tl2'] = data['Text'].apply(lambda x:len(x.split()))
    if tr is None:
        X,Xt = Data
        return X[['tl','tl2']].values, Xt[['tl','tl2']].values
    else:
        X = Data[0][['tl','tl2']].values
        return X[tr],X[te]
Project: kaggle-review    Author: daxiongshu    | project source | file source
def get_pattern(DB,tr,te,patterns):
    cols = ['p%d'%c for c,p in enumerate(patterns)]
    if tr is None:
        test = DB.data['test_variants_filter'] if te=='stage1' else DB.data['stage2_test_variants']
        if te=='stage1':
            train = DB.data['training_variants']
        else:
            train = pd.concat([DB.data['training_variants'],DB.data["test_variants_filter"]],axis=0)
        Data =[train,test]
    else:
        Data = [DB.data['training_variants']]

    for data in Data:
        for c,p in enumerate(patterns):
            data['p%d'%c] = data['Variation'].apply(lambda x: len(re.findall(p,str(x).lower())))

    if tr is None:
        return train[cols].values,test[cols].values
    else:
        X = data[cols].values
        return X[tr],X[te]
Project: kaggle-review    Author: daxiongshu    | project source | file source
def onehot_gene(DB, tr, te):
    from utils.np_utils.encoder import onehot_encode
    if tr is None:
        train = DB.data['training_variants']
        if te=="stage1":
            test = DB.data['test_variants_filter']
        else:
            train = pd.concat([train,DB.data['test_variants_filter']],axis=0)
            test = DB.data['stage2_test_variants']
        lbl_encode(train,test)
        n = max(train['Gene'].max(),test['Gene'].max())
        gtr = onehot_encode(train['Gene'].values,n=n+1)
        gte = onehot_encode(test['Gene'].values)
        return gtr,gte
    else:
        data = DB.data['training_variants']
        lbl_encode(data,cols=['Gene'])
        gene = data['Gene'].values
        gene = onehot_encode(gene)
        return gene[tr],gene[te]
Project: kaggle-review    Author: daxiongshu    | project source | file source
def post_cv(flags):
    import re
    import os
    path = flags.data_path
    files = [i for i in os.listdir(path) if len(re.findall('cv_[0-9].csv',i))]
    s = []
    for name in files:
        s.append(pd.read_csv("%s/%s"%(path,name)))

    s = pd.concat(s,axis=0)
    print(s.head())
    classes = len([i for i in s.columns.values if 'class' in i])
    from utils.np_utils.utils import cross_entropy
    yp = s[['class%d'%i for i in range(1,classes+1)]].values
    y=s['real'].values
    print(cross_entropy(y,yp))
    s.to_csv("%s/cv.csv"%path,index=False)
Project: GOS    Author: crcresearch    | project source | file source
def create_agents(self, generator):
        """
        Given information on a set of countries and a generator function,
        generate the agents and assign the results to ``self.agents``.

        :type generator: DataFrame, str, int
        :param generator: A function which generates the agents.
        """
        self.generator = generator
        country_array = pd.concat([pd.Series([c] * k["Population"]) for c, k in self.df.iterrows()])
        country_array.index = range(len(country_array))
        # Garbage collect before creating new processes.
        gc.collect()
        self.agents = pd.concat(
            self.pool.imap(self._gen_agents,
                           np.array_split(country_array, self.processes * self.splits))
        )
        self.agents.index = range(len(self.agents))
Project: GOS    Author: crcresearch    | project source | file source
def create_agents(self, generator):
        """
        Given information on a set of countries and a generator function,
        generate the agents and assign the results to ``self.agents``.

        :type generator: DataFrame, str, int
        :param generator: A function which generates the agents.
        """
        self.generator = generator
        country_array = pd.concat([pd.Series([c] * k["Population"]) for c, k in self.df.iterrows()])
        country_array.index = range(len(country_array))
        # Garbage collect before creating new processes.
        gc.collect()
        self.agents = pd.concat(
            self.pool.imap(self._gen_agents,
                           np.array_split(country_array, self.processes * self.splits))
        )
        self.agents.index = range(len(self.agents))
Project: kaggle    Author: RankingAI    | project source | file source
def OHETr(self, tr):
        """"""
        OHEDict = {}
        for col in tr.columns:
            ValueCounts = [str(int(v)) for v in tr[col].value_counts().index.values]
            ValueCounts.append('missing')
            SelectedValues = dict((k, v) for (v, k) in enumerate(ValueCounts, start=0))
            OHTr = self.__ApplyOH(tr[col].values, SelectedValues)

            headers = dict((('%s_%s' % (col, k)), SelectedValues[k]) for k in SelectedValues)
            tmp = [v[0] for v in sorted(headers.items(), key=lambda x: x[1])]
            OHDFTr = pd.DataFrame(OHTr, index=tr.index, columns=tmp)

            tr = pd.concat([tr, OHDFTr], axis=1)

            tr.drop(col, axis=1, inplace=True)
            OHEDict[col] = SelectedValues
            #print('Column %s was encoded.' % col)

        return tr, OHEDict
Project: PersonalizedMultitaskLearning    Author: mitmedialab    | project source | file source
def combineFilesIntoDf(file_path, filenames, reset_index=False, drop_cols=None):
    df = None
    for filename in filenames:
        fdf = pd.DataFrame.from_csv(file_path + filename)

        if reset_index:
            fdf = fdf.reset_index()

        if df is None:
            df = fdf.copy(deep=True)
        else:
            df = pd.concat([df,fdf])

    if drop_cols is not None:
        for feat in drop_cols:
            df = df.drop(feat, 1)

    return df
Project: Flavor-Network    Author: lingcheng99    | project source | file source
def tsne_cluster_cuisine(df,sublist):
    lenlist=[0]
    df_sub = df[df['cuisine']==sublist[0]]
    lenlist.append(df_sub.shape[0])
    for cuisine in sublist[1:]:
        temp = df[df['cuisine']==cuisine]
        df_sub = pd.concat([df_sub, temp],axis=0,ignore_index=True)
        lenlist.append(df_sub.shape[0])
    df_X = df_sub.drop(['cuisine','recipeName'],axis=1)
    print df_X.shape, lenlist

    dist = squareform(pdist(df_X, metric='cosine'))
    tsne = TSNE(metric='precomputed').fit_transform(dist)

    palette = sns.color_palette("hls", len(sublist))
    plt.figure(figsize=(10,10))
    for i,cuisine in enumerate(sublist):
        plt.scatter(tsne[lenlist[i]:lenlist[i+1],0],\
        tsne[lenlist[i]:lenlist[i+1],1],c=palette[i],label=sublist[i])
    plt.legend()

#interactive plot with bokeh; set up for four categories, with color palette; pass in df for either ingredient or flavor
Project: sanergy-public    Author: dssg    | project source | file source
def create_future(fold, features_old, cfg_parameters):
    """
    Just for testing purposes.
    Sets up a replicate of the last day(s) data to create new data for testing. But in reality,
    we should be able to create features for the upcoming days from past data, so this would not be needed???
    """
    last_day = fold['window_end']
    next_days = [last_day + timedelta(days=i) for i in xrange(1,(cfg_parameters['prediction_horizon'] +1 ))]
    old_features_unique = features_old.drop_duplicates(subset='ToiletID')
    l_future_features = []
    for day in  next_days:
        next_day_features = old_features_unique.copy()
        next_day_features["Collection_Date"] = day
        l_future_features.append(next_day_features)
    future_features = pd.concat(l_future_features, ignore_index=True)
    return(future_features)
Project: Eskapade    Author: KaveIO    | project source | file source
def __init__(self, **kwargs):
        """
        Store the configuration of link DfConcatenator

        :param str name: name of link
        :param str storeKey: key of data to store in data store
        :param list readKeys: keys of pandas dataframes in the data store
        :param bool ignore_missing_input: Skip missing input datasets. If all missing, store empty dataset. Default is false.
        :param kwargs: all other keyword arguments are passed on to the pandas concat function.
        """

        Link.__init__(self, kwargs.pop('name', 'DfConcatenator'))

        # process and register all relevant kwargs. kwargs are added as attributes of the link.
        # second arg is default value for an attribute. key is popped from kwargs.
        self._process_kwargs(kwargs, readKeys=[])
        self._process_kwargs(kwargs, storeKey=None)
        self._process_kwargs(kwargs, ignore_missing_input=False)

        # pass on remaining kwargs to pandas reader 
        self.kwargs = copy.deepcopy(kwargs)

        return
Project: loving-ai    Author: opencog    | project source | file source
def collect_history_data(history_dir, days):
    today = dt.datetime.now()
    dfs = []
    for d in glob.glob('{}/*'.format(history_dir)):
        if os.path.isdir(d):
            dirname = os.path.basename(d)
            dirdate = None
            try:
                dirdate = dt.datetime.strptime(dirname, '%Y%m%d')
            except Exception as ex:
                logger.error(ex)
            if dirdate and (days == -1 or (today - dirdate).days < days):
                for fname in glob.glob('{}/{}/*.csv'.format(history_dir, dirname)):
                    try:
                        dfs.append(pd.read_csv(fname))
                    except Exception as ex:
                        logger.warn("Reading {} error: {}".format(fname, ex))
    if not dfs:
        return None
    df = pd.concat(dfs, ignore_index=True)
    df = df[df.Datetime != 'Datetime'].sort(
        ['User', 'Datetime']).drop_duplicates()
    return df
Project: empyrical    Author: quantopian    | project source | file source
def _aligned_series(*many_series):
    """
    Return a new list of series containing the data in the input series, but
    with their indices aligned. NaNs will be filled in for missing values.

    Parameters
    ----------
    many_series : list[pd.Series]

    Returns
    -------
    aligned_series : list[pd.Series]

        A new list of series containing the data in the input series, but
        with their indices aligned. NaNs will be filled in for missing values.

    """
    return [series
            for col, series in iteritems(pd.concat(many_series, axis=1))]
Project: plotnine    Author: has2k1    | project source | file source
def compute_panel(cls, data, scales, **params):
        func = make_summary_fun(params['fun_data'], params['fun_y'],
                                params['fun_ymin'], params['fun_ymax'],
                                params['fun_args'])

        # break a dataframe into pieces, summarise each piece,
        # and join the pieces back together, retaining original
        # columns unaffected by the summary.
        summaries = []
        for (group, x), df in data.groupby(['group', 'x']):
            summary = func(df)
            summary['x'] = x
            summary['group'] = group
            unique = uniquecols(df)
            if 'y' in unique:
                unique = unique.drop('y', axis=1)
            merged = summary.merge(unique, on=['group', 'x'])
            summaries.append(merged)

        new_data = pd.concat(summaries, axis=0, ignore_index=True)
        return new_data
Project: plotnine    Author: has2k1    | project source | file source
def compute_panel(cls, data, scales, params):
        if not params['var']:
            return data

        negative = data['ymax'] < 0
        neg = data.loc[negative]
        pos = data.loc[~negative]
        neg.is_copy = None
        pos.is_copy = None

        if len(neg):
            neg = cls.collide(neg, params=params)

        if len(pos):
            pos = cls.collide(pos, params=params)

        data = pd.concat([neg, pos], axis=0, ignore_index=True)
        return data
Project: plotnine    Author: has2k1    | project source | file source
def add_missing_facets(data, layout, vars, facet_vals):
    # When in a dataframe some layer does not have all
    # the facet variables, add the missing facet variables
    # and create new data where the points(duplicates) are
    # present in all the facets
    missing_facets = set(vars) - set(facet_vals)
    if missing_facets:
        to_add = layout.loc[:, missing_facets].drop_duplicates()
        to_add.reset_index(drop=True, inplace=True)

        # a point for each facet, [0, 1, ..., n-1, 0, 1, ..., n-1, ...]
        data_rep = np.tile(np.arange(len(data)), len(to_add))
        # a facet for each point, [0, 0, 0, 1, 1, 1, ... n-1, n-1, n-1]
        facet_rep = np.repeat(np.arange(len(to_add)), len(data))

        data = data.iloc[data_rep, :].reset_index(drop=True)
        facet_vals = facet_vals.iloc[data_rep, :].reset_index(drop=True)
        to_add = to_add.iloc[facet_rep, :].reset_index(drop=True)
        facet_vals = pd.concat([facet_vals, to_add],
                               axis=1, ignore_index=False)

    return data, facet_vals
Project: Supply-demand-forecasting    Author: LevinJ    | project source | file source
def disp_gap_byweather(self):
        df = self.gapdf
        data_dir = g_singletonDataFilePath.getTrainDir()
        dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_prevweather.df.pickle'
        dumpload = DumpLoad(dumpfile_path)
        if dumpload.isExisiting():
            temp_df = dumpload.load()
        else:
            weather_dict = self.get_weather_dict(data_dir)

            temp_df = self.X_y_Df['time_slotid'].apply(self.find_prev_weather_mode, weather_dict=weather_dict)     
            dumpload.dump(temp_df)

        df = pd.concat([df, temp_df],  axis=1)

        gaps_mean = df.groupby('preweather')['gap'].mean()
        gaps_mean.plot(kind='bar')
        plt.ylabel('Mean of gap')
        plt.xlabel('Weather')
        plt.title('Weather/Gap Correlation')
        return
Project: Supply-demand-forecasting    Author: LevinJ    | project source | file source
def disp_gap_bytraffic(self):
        df = self.gapdf
        data_dir = g_singletonDataFilePath.getTrainDir()
        dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_prevtraffic.df.pickle'
        dumpload = DumpLoad(dumpfile_path)
        if dumpload.isExisiting():
            temp_df = dumpload.load()
        else:
            traffic_dict = self.get_traffic_dict(data_dir)

            temp_df = self.X_y_Df[['start_district_id', 'time_slotid']].apply(self.find_prev_traffic,axis = 1, traffic_dict=traffic_dict, pre_num = 3)   
            dumpload.dump(temp_df)

        df = pd.concat([df, temp_df],  axis=1)


        by_traffic = df.groupby('traffic1')
        x=[]
        y=[]
        for name, group in by_traffic:
            x.append(name)
            y.append(group['gap'].mean())
        plt.scatter(x,y)

        return
Project: Supply-demand-forecasting    Author: LevinJ    | project source | file source
def __do_one_hot_encodings(self):
        df_train, cv = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
        df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
        df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
        enc = OneHotEncoder(sparse=False)
        cross_feature_dict = self.__get_label_encode_dict()
        to_be_encoded = []
        for _, new_feature_name in cross_feature_dict.iteritems():
            to_be_encoded.append(new_feature_name)
        #fix all data source
        to_be_stacked_df = pd.concat([df_train[to_be_encoded], df_testset1[to_be_encoded], df_testset2[to_be_encoded]], axis = 0)
        enc.fit(to_be_stacked_df)

        enc, to_be_encoded = self.__filter_too_big_onehot_encoding(enc, to_be_encoded, df_train, df_testset1, df_testset2)
        # transform on seprate data source
        self.res_data_dict[g_singletonDataFilePath.getTrainDir()] = self.__do_one_hot_encoding(df_train, enc, to_be_encoded),cv
        self.res_data_dict[g_singletonDataFilePath.getTest1Dir()] = self.__do_one_hot_encoding(df_testset1,enc, to_be_encoded)
        self.res_data_dict[g_singletonDataFilePath.getTest2Dir()] = self.__do_one_hot_encoding(df_testset2, enc, to_be_encoded)
        return
Project: finance-ml    Author: Omarkhursheed    | project source | file source
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    agg = concat(cols, axis=1)
    agg.columns = names
    if dropnan:
        agg.dropna(inplace=True)
    return agg
Project: tianchi_power    Author: lvniqi    | project source | file source
def predict_tf_all(path = None):
    result_list = []
    p = m_Pool(31)
    result_list = p.map(predict_tf_once,range(1,32))
    p.close()
    p.join()
    print 'writing...'
    result_df = pd.DataFrame(index = range(1))
    for day,result in result_list:
        day_s = str(day)
        if len(day_s)<=1:
            day_s = '0'+day_s
        result_df['201610'+day_s] = result
    result_df = result_df.T
    result_df.columns = ['predict_power_consumption']
    # compute the timestamp unconditionally: it is also used for the
    # per-part dump below even when an explicit path is given
    date = str(pd.Timestamp(time.ctime())).replace(' ','_').replace(':','_')
    if path is None:
        path = './result/'+date+'.csv'
    result_df.to_csv(path,index_label='predict_date')

    l = map(lambda day:pd.DataFrame.from_csv('./result/predict_part/%d.csv'%day),range(1,32))
    t = pd.concat(l)
    t.to_csv('./result/predict_part/'+date+'.csv')
Project: toll_road    Author: idosekely    | project source | file source
def rolling_mean(self, window=10):
        means = self.df.rolling(window=window).mean()
        ewm_means = self.df.ewm(halflife=window).mean()
        means.columns = ['mean-%s' % col for col in means.columns]
        ewm_means.columns = ['ewm-%s' % col for col in ewm_means.columns]
        ts = pd.concat([means, ewm_means], axis=1)
        return ts
Project: toll_road    Author: idosekely    | project source | file source
def filter(self, lamb=1e5):
        cycle, trend = sm.tsa.filters.hpfilter(self.df, lamb=lamb)
        trend.columns = ['%s-trend' % col for col in trend.columns]
        # cycle.columns = ['%s-cycle' % col for col in cycle.columns]
        # ts = pd.concat([cycle, trend], axis=1)
        # return ts
        return trend
Project: pylspm    Author: lseman    | project source | file source
def HOCcat(data_, mvmodel, seed):
    response = data_.ix[:, 10:25]
    preditors = []
    preditors.append(data_.ix[:, 10:15])
    preditors.append(data_.ix[:, 15:20])
    preditors.append(data_.ix[:, 20:25])

    plsr_ = None
    for i in range(3):
        res_ = plsr2(preditors[i], response, seed=seed)[0]
        plsr_ = res_ if plsr_ is None else np.hstack((plsr_, res_))

    plsr_ = pd.DataFrame(plsr_)
    plsr_.index = range(len(plsr_))

    cols = list(plsr_.columns)
    for s in range(len(cols)):
        cols[cols.index(s)] = 'T' + str(s)
    plsr_.columns = cols

    data_ = pd.concat([data_, plsr_], axis=1)

    Variables = pd.read_csv(mvmodel)
    Variables = Variables[
        Variables.latent.str.contains("Humanização") == False]

    for i in range(len(cols)):
        df_ = pd.DataFrame([['Humanização', cols[i], 'A']],
                           columns=Variables.columns)
        Variables = Variables.append(df_)

    Variables.index = range(len(Variables))
    mvmodel = Variables

    return[data_, mvmodel]
Project: rca-evaluation    Author: sieve-microservices    | project source | file source
def _compare_services(srv_a, srv_b, path, prev_cluster_metadata):
    df_a = read_service(srv_a, path, prev_cluster_metadata)
    df_b = read_service(srv_b, path, prev_cluster_metadata)
    p_values = defaultdict(list)
    df = pd.concat([df_a, df_b]).resample("500ms").mean()
    df.interpolate(method="time", limit_direction="both", inplace=True)
    df.fillna(method="bfill", inplace=True)

    for c1, c2 in combine(df_a.columns, df_b.columns):
        if c1 == c2:
            continue
        grangercausality(df[[c1, c2]], p_values, 5)
        grangercausality(df[[c2, c1]], p_values, 5)
    return pd.DataFrame(p_values)
Project: xpandas    Author: alan-turing-institute    | project source | file source
def apply(self, *args, **kwargs):
        '''
        Overwrite the standard pandas.Series method.
        Apply the transform function to all elements of self.
        *If the transform function returns a dict-like object,
        the XSeries is transformed into an XDataFrame (see the XDataFrame constructor)*

        :param func: function to apply
        :param prefix: prefix for columns if an XDataFrame is returned
        :return: XSeries or XDataFrame, depending on the transformation
        '''
        func = kwargs.get('func')
        if func is None:
            func = args[0]

        # TODO
        # Possibly change to handle NaN
        mapped_series = self.dropna()
        mapped_series = mapped_series.map(func, na_action='ignore')
        mapped_data_type = mapped_series.data_type

        custom_prefix = kwargs.get('prefix')
        if custom_prefix is None:
            custom_prefix = self.name
        else:
            custom_prefix = '{}_{}'.format(self.name, custom_prefix)

        if mapped_series.__is_data_type_dict_like():
            custom_df = XDataFrame.from_records(mapped_series.values)

            if custom_prefix is not None:
                custom_df.columns = custom_df.columns.map(lambda x: '{}_{}'.format(custom_prefix, x))
            return custom_df
        elif mapped_data_type == pd.DataFrame:
            return pd.concat(mapped_series.values, ignore_index=True)
        else:
            mapped_series.name = custom_prefix

        return mapped_series
Project: xpandas    Author: alan-turing-institute    | project source | file source
def concat_dataframes(cls, data_frames):
        '''
        Concatenate XDataFrame using pandas.concat method
        https://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html
        over columns
        :param data_frames: list of XDataFrame instances
        :return: XDataFrame — concatenated list of data_frames
        '''
        return pd.concat(data_frames, axis=1)
Project: zipline-chinese    Author: zhanghan1990    | project source | file source
def earn_dividends(self, dividend_frame):
        """
        Given a frame of dividends whose ex_dates are all the next trading day,
        calculate and store the cash and/or stock payments to be paid on each
        dividend's pay date.
        """
        earned = dividend_frame.apply(self._maybe_earn_dividend, axis=1)\
                               .dropna(how='all')
        if len(earned) > 0:
            # Store the earned dividends so that they can be paid on the
            # dividends' pay_dates.
            self._unpaid_dividends = pd.concat(
                [self._unpaid_dividends, earned],
            )
Project: zipline-chinese    Author: zhanghan1990    | project source | file source
def load_prices_from_csv_folder(folderpath, identifier_col, tz='UTC'):
    data = None
    for file in os.listdir(folderpath):
        if '.csv' not in file:
            continue
        raw = load_prices_from_csv(os.path.join(folderpath, file),
                                   identifier_col, tz)
        if data is None:
            data = raw
        else:
            data = pd.concat([data, raw], axis=1)
    return data
Project: zipline-chinese    Author: zhanghan1990    | project source | file source
def get_data(self):

        in_package_data = range(2002, 2017)
        cur_year = datetime.datetime.now().year
        last_in_package_data = max(in_package_data)


        # download new data
        to_downloads = range(last_in_package_data + 1, cur_year + 1)

        # first, get the ycDefIds params
        response = requests.get(self.YIELD_MAIN_URL)

        matchs = re.search(r'\?ycDefIds=(.*?)\&', response.text)
        ycdefids = matchs.group(1)
        assert (ycdefids is not None)

        fetched_data = []
        for year in to_downloads:
            print('Downloading from ' + self.DONWLOAD_URL % (year, ycdefids))
            response = requests.get(self.DONWLOAD_URL % (year, ycdefids))
            fetched_data.append(BytesIO(response.content))

        # combine all data

        dfs = []

        basedir = os.path.join(os.path.dirname(__file__), "xlsx")

        for i in in_package_data:
            dfs.append(pd.read_excel(os.path.join(basedir, "%d.xlsx" % i)))

        for memfile in fetched_data:
            dfs.append(pd.read_excel(memfile))

        df = pd.concat(dfs)

        return df