Python pandas module: read_hdf() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use pandas.read_hdf().
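Before the examples, here is a minimal, self-contained sketch of the round trip most of them build on: write a DataFrame with DataFrame.to_hdf() and read it back with pandas.read_hdf(). The file name example.h5 and the key 'df' are illustrative only, and the snippet assumes the PyTables package (the HDF5 backend pandas uses) is installed.

import pandas as pd

# Write a small DataFrame to an HDF5 file under the key 'df'.
# format='table' creates a queryable table rather than a fixed-format store.
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
df.to_hdf('example.h5', key='df', format='table')

# Read the whole table back, then a filtered subset;
# 'where' clauses are only supported for format='table'.
same_df = pd.read_hdf('example.h5', 'df')
subset = pd.read_hdf('example.h5', 'df', where='index < 2')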

Project: dataset    Author: analysiscenter
def _load_table(self, src, fmt, components=None, *args, **kwargs):
        """ Load a data frame from table formats: csv, hdf5, feather """
        if fmt == 'csv':
            _data = pd.read_csv(src, *args, **kwargs)
        elif fmt == 'feather':
            _data = feather.read_dataframe(src, *args, **kwargs)  # pylint: disable=redefined-variable-type
        elif fmt == 'hdf5':
            _data = pd.read_hdf(src, *args, **kwargs)         # pylint: disable=redefined-variable-type

        # Put into this batch only the part of the data selected by this batch's index
        if isinstance(_data, pd.DataFrame):
            _data = _data.loc[self.indices]
        elif isinstance(_data, dd.DataFrame):
            # dask.DataFrame.loc supports advanced indexing only with lists
            _data = _data.loc[list(self.indices)].compute()

        components = tuple(components or self.components)
        for i, comp in enumerate(components):
            setattr(self, comp, _data.iloc[:, i].values)
Project: interactive_mpl_tutorial    Author: tacaswell
def load_data(dataset):
    """Load data from a given dataset

    Parameters
    ----------
    dataset : str
       Searches for {dataset}.h5 in the 'data' directory next to this file

    Returns
    -------
    DataFrame
       Hourly temperature data
    """
    p = Path(os.path.dirname(os.path.realpath(__file__))) / 'data'
    fname = p / f'{dataset}.h5'

    try:
        return pd.read_hdf(str(fname))
    except FileNotFoundError:
        sources = {f.stem for f in p.iterdir() if
                   f.is_file() and f.name.endswith('h5')}
        raise RuntimeError(f"Could not not find {dataset!r}.  Existing "
                           f"datasets are {sources}")
Project: PyTrader    Author: didw
def get_data_opt10081(self, code, date='20161231'):
        try:
            data = pd.read_hdf("../data/hdf/%s.hdf" % code, 'day').sort_index()
            start = str(data.index[-2])
        except (FileNotFoundError, IndexError) as e:
            start = "20010101"
        print("get 81 data from %s" % start)
        self.kiwoom.start_date = datetime.strptime(start, "%Y%m%d")
        self.kiwoom.data_opt10081 = [] * 15
        self.kiwoom.set_input_value("종목코드", code)
        self.kiwoom.set_input_value("기준일자", date)
        self.kiwoom.set_input_value("수정주가구분", 255)
        self.kiwoom.comm_rq_data("주식일봉차트조회요청", "opt10081", 0, "0101")
        while self.kiwoom.inquiry == '2':
            time.sleep(TR_REQ_TIME_INTERVAL)
            self.kiwoom.set_input_value("종목코드", code)
            self.kiwoom.set_input_value("기준일자", date)
            self.kiwoom.set_input_value("수정주가구분", 255)
            self.kiwoom.comm_rq_data("주식일봉차트조회요청", "opt10081", 2, "0101")
        self.kiwoom.data_opt10081.index = self.kiwoom.data_opt10081.loc[:, '일자']
        return self.kiwoom.data_opt10081.loc[:, ['현재가', '거래량', '거래대금', '시가', '고가', '저가']]
Project: PyTrader    Author: didw
def get_data_opt10086(self, code, date):
        try:
            data = pd.read_hdf("../data/hdf/%s.hdf" % code, 'day').sort_index()
            start = str(data.index[-2])
        except (FileNotFoundError, IndexError) as e:
            start = "20010101"
        print("get 86 data from %s" % start)
        self.kiwoom.start_date = datetime.strptime(start, "%Y%m%d")
        self.kiwoom.data_opt10086 = [] * 23
        self.kiwoom.set_input_value("종목코드", code)
        self.kiwoom.set_input_value("조회일자", date)
        self.kiwoom.set_input_value("표시구분", 1)
        self.kiwoom.comm_rq_data("일별주가요청", "opt10086", 0, "0101")
        while self.kiwoom.inquiry == '2':
            time.sleep(TR_REQ_TIME_INTERVAL)
            self.kiwoom.set_input_value("종목코드", code)
            self.kiwoom.set_input_value("조회일자", date)
            self.kiwoom.set_input_value("표시구분", 1)
            self.kiwoom.comm_rq_data("일별주가요청", "opt10086", 2, "0101")
        self.kiwoom.data_opt10086.index = self.kiwoom.data_opt10086.loc[:, '일자']
        return self.kiwoom.data_opt10086
Project: auto_ml    Author: ClimbsRocks
def get_twitter_sentiment_multilabel_classification_dataset():

    file_name = os.path.join('tests', 'twitter_sentiment.h5')

    try:
        df_twitter = pd.read_hdf(file_name)
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        df_twitter = pd.read_csv(dataset_url, encoding='latin-1')
        # Do not write the index that pandas automatically creates

        df_twitter.to_hdf(file_name, key='df', format='fixed')

    # Grab only 10% of the dataset - runs much faster this way
    df_twitter = df_twitter.sample(frac=0.1)

    df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)

    df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
    return df_twitter_train, df_twitter_test
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_append_hierarchical(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['foo', 'bar'])
        df = DataFrame(np.random.randn(10, 3), index=index,
                       columns=['A', 'B', 'C'])

        with ensure_clean_store(self.path) as store:
            store.append('mi', df)
            result = store.select('mi')
            tm.assert_frame_equal(result, df)

            # GH 3748
            result = store.select('mi', columns=['A', 'B'])
            expected = df.reindex(columns=['A', 'B'])
            tm.assert_frame_equal(result, expected)

        with ensure_clean_path('test.hdf') as path:
            df.to_hdf(path, 'df', format='table')
            result = read_hdf(path, 'df', columns=['A', 'B'])
            expected = df.reindex(columns=['A', 'B'])
            tm.assert_frame_equal(result, expected)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_duplicate_column_name(self):
        df = DataFrame(columns=["a", "a"], data=[[0, 0]])

        with ensure_clean_path(self.path) as path:
            self.assertRaises(ValueError, df.to_hdf,
                              path, 'df', format='fixed')

            df.to_hdf(path, 'df', format='table')
            other = read_hdf(path, 'df')

            tm.assert_frame_equal(df, other)
            self.assertTrue(df.equals(other))
            self.assertTrue(other.equals(df))
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_colums_multiindex_modified(self):
        # BUG: 7212
        # read_hdf store.select modified the passed columns parameters
        # when multi-indexed.

        df = DataFrame(np.random.rand(4, 5),
                       index=list('abcd'),
                       columns=list('ABCDE'))
        df.index.name = 'letters'
        df = df.set_index(keys='E', append=True)

        data_columns = df.index.names + df.columns.tolist()
        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df',
                      mode='a',
                      append=True,
                      data_columns=data_columns,
                      index=False)
            cols2load = list('BCD')
            cols2load_original = list(cols2load)
            df_loaded = read_hdf(path, 'df', columns=cols2load)  # noqa
            self.assertTrue(cols2load_original == cols2load)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_read_hdf_open_store(self):
        # GH10330
        # No check for non-string path_or_buf, and no test of open store
        df = DataFrame(np.random.rand(4, 5),
                       index=list('abcd'),
                       columns=list('ABCDE'))
        df.index.name = 'letters'
        df = df.set_index(keys='E', append=True)

        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df', mode='w')
            direct = read_hdf(path, 'df')
            store = HDFStore(path, mode='r')
            indirect = read_hdf(store, 'df')
            tm.assert_frame_equal(direct, indirect)
            self.assertTrue(store.is_open)
            store.close()
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_complex_fixed(self):
        df = DataFrame(np.random.rand(4, 5).astype(np.complex64),
                       index=list('abcd'),
                       columns=list('ABCDE'))

        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df')
            reread = read_hdf(path, 'df')
            assert_frame_equal(df, reread)

        df = DataFrame(np.random.rand(4, 5).astype(np.complex128),
                       index=list('abcd'),
                       columns=list('ABCDE'))
        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df')
            reread = read_hdf(path, 'df')
            assert_frame_equal(df, reread)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_complex_table(self):
        df = DataFrame(np.random.rand(4, 5).astype(np.complex64),
                       index=list('abcd'),
                       columns=list('ABCDE'))

        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df', format='table')
            reread = read_hdf(path, 'df')
            assert_frame_equal(df, reread)

        df = DataFrame(np.random.rand(4, 5).astype(np.complex128),
                       index=list('abcd'),
                       columns=list('ABCDE'))

        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df', format='table', mode='w')
            reread = read_hdf(path, 'df')
            assert_frame_equal(df, reread)
Project: CrowdTLL    Author: cbrafter
def updateResults(filename, initial, timeScore):
    """ Filename (tihout extension). open the filename.hdf and store results.
    then write the results to a HTML file
    """
    hdfFile = filename+'.hdf'
    if not os.path.exists(hdfFile):
        copyfile('./data/default.hdf', hdfFile)
    data = pd.read_hdf(hdfFile).reset_index()
    # add new entry to data frame
    data.loc[len(data)] = [initial, timeScore]
    # rank best unique user scores
    sortData = data.groupby(['INITIALS']).min().head(10)
    # save new data
    sortData.to_hdf(hdfFile, 'test', mode='w')

    sortData = sortData.sort_values(by='TIME',ascending=True).reset_index()
    htmlTable(sortData, filename+'.html')
Project: KAGGLE_AVITO_2016    Author: ZFTurbo
def read_test_train(train_size):
    print("Load train.csv")
    train = pd.read_hdf("../modified_data/train_original.csv.hdf", 'table')
    null_count = train.isnull().sum().sum()
    if null_count > 0:
        print('Nans:', null_count)
        cols = train.isnull().any(axis=0)
        print(cols[cols == True])
        rows = train.isnull().any(axis=1)
        print(rows[rows == True])
        print('NANs in train, please check it!')
        exit()
    split = round((1-train_size)*len(train.index))
    train = train[split:]
    print("Load test.csv")
    test = pd.read_hdf("../modified_data/test.hdf", 'table')
    null_count = test.isnull().sum().sum()
    if null_count > 0:
        print('Nans:', null_count)
        cols = test.isnull().any(axis=0)
        print(cols[cols == True])
        print('NANs in test, please check it!')
        exit()
    features = get_features(train, test)
    return train, test, features
Project: Attention_Based_LSTM_AspectBased_SA    Author: gangeshwark
def preprocess_day(a, b):
    a = pd.read_csv('data/restaurants_train_data.tsv', delimiter='\t')
    b = pd.read_csv('data/restaurants_test_data.tsv', delimiter='\t')
    print(a['text'][10])
    a['text'] = a['text'].apply(clean)
    b['text'] = b['text'].apply(clean)

    # save pre-processed data as pickle file
    a.to_hdf('data/restaurants_train_data_processed.h5', 'table')
    b.to_hdf('data/restaurants_test_data_processed.h5', 'table')
    # load pre-processed pickle data
    a = pd.read_hdf('data/restaurants_train_data_processed.h5', 'table')
    a['text'] = a['text'].apply(ast.literal_eval)
    b = pd.read_hdf('data/restaurants_test_data_processed.h5', 'table')
    b['text'] = b['text'].apply(ast.literal_eval)

    print(a['text'][10])
Project: kaggle_bnp-paribas    Author: ArdalanM
def LoadParseData(filename):

    data_name = filename.split('_')[0]
    pd_data = pd.read_hdf(CODE_FOLDER + "data/" + filename)
    cols_features = pd_data.drop(['ID', 'target'], 1).columns.tolist()

    pd_train = pd_data[pd_data.target >= 0]
    pd_test = pd_data[pd_data.target == -1]

    Y = pd_train['target'].values.astype(int)
    test_idx = pd_test['ID'].values.astype(int)

    X = np.array(pd_train.drop(['ID', 'target'],1))
    X_test = np.array(pd_test.drop(['ID','target'], 1))

    return X, Y, X_test, test_idx, pd_data, data_name, cols_features
Project: real_estate    Author: cooperoelrichs
def update_data_using_real_data(self):
        # This test is slow, and so is excluded from the standard test suite.
        current_data = pd.read_hdf(self.TEST_CURRENT_DATA_FILE)
        new_data = pd.read_hdf(self.TEST_NEW_DATA_FILE)
        expected_updated_data = pd.read_hdf(self.TEST_UPDATED_DATA_FILE)

        resultant_updated_data = DataStorer.update_data(
            current_data.copy(), new_data.copy())

        self.assertEqual(
            expected_updated_data.shape, resultant_updated_data.shape)
        self.assertTrue(
            expected_updated_data.equals(resultant_updated_data))

        repeatedly_updated_data = DataStorer.update_data(
            resultant_updated_data.copy(), new_data.copy())

        self.assertEqual(
            expected_updated_data.shape, repeatedly_updated_data.shape)
        self.assertTrue(
            expected_updated_data.equals(repeatedly_updated_data))
Project: ranking    Author: wattlebird
def __init__(self, table=None, filename=''):
        """
        table:      the pandas DataFrame that records rankable objects competition
                    record
        filename:   the hdf5 filename that stores the DataFrame. The DataFrame
                    must be indexed by 'item_pair_rate'.
        """
        if table is None:
            table = pd.read_hdf(filename, "item_pair_rate")
        table = table[['primary','secondary','rate1','rate2','weight']]
        self.table = table
        # itemid to index table
        idx = self._extract_list(self.table)
        self.itemlist = idx
        temptable = table.iloc[:,:2].values
        pair = np.fromfunction(np.vectorize(lambda i, j: idx[temptable[i,j]]),
                        temptable.shape)
        pair = np.require(pair, dtype=np.int32)
        self.pair = pair
Project: TencentAD_contest    Author: zsyandjyhouse
def delete_conversion_data():
    train_data = pd.read_hdf(FilePath + 'train_0613_nodelconvert')
    print 'read finish'
    advertiser_conversion_list = find_delete_advertiser()
    print len(advertiser_conversion_list)

    for item in advertiser_conversion_list:
        t = threading.Thread(target=get_index_to_delete,args=(train_data,item))
        t.start()
    while len(result_list)<len(advertiser_conversion_list):
        pass
    train_data.drop(delete_list, axis=0, inplace=True)
    train_data = train_data.reset_index()
    del train_data['index']
    print 'train write begin'
    train_data.to_hdf(FilePath + 'train_0613', 'all')
    delete_list = Series(delete_list)
    delete_list.to_csv(FilePath + 'delete_negsample_index_oftrain0613.csv', mode='a', index=False)
Project: TencentAD_contest    Author: zsyandjyhouse
def time_delta_fentong():
    train_data = pd.read_hdf('../../gen/train_0626')
    test_data = pd.read_hdf('../../gen/test_0626')
    print 'read finish'
    train_data['time_delta_user_creative_next_fentong'] = train_data['time_delta_user_creative_next'].map(time_delta_map)
    test_data['time_delta_user_creative_next_fentong'] = test_data['time_delta_user_creative_next'].map(time_delta_map)
    train_data['time_delta_user_creative_fentong'] = train_data['time_delta_user_creative'].map(time_delta_map)
    test_data['time_delta_user_creative_fentong'] = test_data['time_delta_user_creative'].map(time_delta_map)
    train_data['time_delta_user_app_next_fentong'] = train_data['time_delta_user_app_next'].map(time_delta_map)
    test_data['time_delta_user_app_next_fentong'] = test_data['time_delta_user_app_next'].map(time_delta_map)
    train_data['time_delta_user_app_fentong'] = train_data['time_delta_user_app'].map(time_delta_map)
    test_data['time_delta_user_app_fentong'] = test_data['time_delta_user_app'].map(time_delta_map)
    train_data['time_delta_user_next_fentong'] = train_data['time_delta_user_next'].map(time_delta_map)
    test_data['time_delta_user_next_fentong'] = test_data['time_delta_user_next'].map(time_delta_map)
    train_data['time_delta_user_fentong'] = train_data['time_delta_user'].map(time_delta_map)
    test_data['time_delta_user_fentong'] = test_data['time_delta_user'].map(time_delta_map)
    print test_data

    train_data.to_hdf('../../gen/train_0626_delta_fentong','all')
    test_data.to_hdf('../../gen/test_0626_delta_fentong','all')
Project: Ads-RecSys-Datasets    Author: Atomu2014
def __iter__(self, gen_type='train', batch_size=None, shuffle_block=False, random_sample=False, split_fields=False,
                 on_disk=True, squeeze_output=False, **kwargs):
        gen_type = gen_type.lower()

        if on_disk:
            print('on disk...')

            for hdf_X, hdf_y in self._files_iter_(gen_type=gen_type, shuffle_block=shuffle_block):
                # num_of_lines = pd.HDFStore(hdf_y, mode='r').get_storer('fixed').shape[0]

                X_all = pd.read_hdf(hdf_X, mode='r').as_matrix()
                y_all = pd.read_hdf(hdf_y, mode='r').as_matrix()

                gen = self.generator(X_all, y_all, batch_size, shuffle=random_sample)
                for X, y in gen:
                    if split_fields:
                        X = np.split(X, self.max_length, axis=1)
                        for i in range(self.max_length):
                            X[i] -= self.feat_min[i]
                    if squeeze_output:
                        y = y.squeeze()
                    yield X, y
        else:
            print('not implemented')
Project: Ads-RecSys-Datasets    Author: Atomu2014
def bin_count(hdf_data_dir, file_prefix, num_of_parts):
        """
        count positive/negative samples
        :param hdf_data_dir: directory containing the hdf part files
        :param file_prefix: see this param in feature_to_hdf()
        :param num_of_parts: number of hdf parts to read
        :return: size of a dataset, positive samples, negative samples, positive ratio
        """
        size = 0
        num_of_pos = 0
        num_of_neg = 0
        for part in range(num_of_parts):
            _y = pd.read_hdf(os.path.join(hdf_data_dir, file_prefix + '_output_part_' + str(part) + '.h5'), mode='r')
            part_pos_num = _y.loc[_y.iloc[:, 0] == 1].shape[0]
            part_neg_num = _y.shape[0] - part_pos_num
            size += _y.shape[0]
            num_of_pos += part_pos_num
            num_of_neg += part_neg_num
        pos_ratio = 1.0 * num_of_pos / (num_of_pos + num_of_neg)
        return size, num_of_pos, num_of_neg, pos_ratio
Project: acton    Author: chengsoonong
def try_pandas(data_path: str) -> bool:
    """Guesses if a file is a pandas file.

    Parameters
    ----------
    data_path
        Path to file.

    Returns
    -------
    bool
        True if the file is pandas.
    """
    try:
        pandas.read_hdf(data_path)
    except ValueError:
        return False

    return True
Project: toll_road    Author: idosekely
def _reader(self):
        if not self.does_exist():
            return
        return pd.read_hdf(self.data_file, 'data')
Project: scikit-dataaccess    Author: MITHaystack
def retrieveCommonDatesHDF(support_data_filename, key_list, in_date_list):
    '''
    Get a list of all dates that have data available

    @support_data_filename: Filename of support data
    @in_date_list: Input date list to check

    @return dictionary of dates with data
    '''

    valid_dates = OrderedDict()

    support_full_path = resource_filename('skdaccess',os.path.join('support',support_data_filename))

    for key in key_list:

        try:
            available_dates = pd.read_hdf(support_full_path, key)
        except KeyError:
            print('Unknown station:',key)

        common_dates = list(set(in_date_list).intersection(set(available_dates)))

        common_dates.sort()

        valid_dates[key] = common_dates

    return valid_dates
Project: kaggle    Author: RankingAI
def train(self):
        """"""
        start = time.time()

        print('size before truncated outliers is %d ' % len(self.TrainData))
        TrainData = self.TrainData[(self.TrainData['logerror'] > self._low) & (self.TrainData['logerror'] < self._up)]
        print('size after truncated outliers is %d ' % len(self.TrainData))

        TrainData['longitude'] -= -118600000
        TrainData['latitude'] -= 34220000
        #extra_tr = pd.read_hdf(path_or_buf='%s/p21/eval_train.hdf' % self.InputDir, key='train')
        #self.TrainData = pd.concat([self.TrainData, extra_tr.drop('parcelid', axis= 1)], axis = 1)

        X = TrainData.drop(self._l_drop_cols, axis=1)
        Y = TrainData['logerror']
        self._l_train_columns = X.columns
        X = X.values.astype(np.float32, copy=False)

        lr = LassoLars(alpha= self._lr_alpha, max_iter= self._lr_iter, verbose= True)
        self._model = lr.fit(X, Y)
        end = time.time()

        print('Training iterates %d, time consumed %d ' % (self._model.n_iter_, (end - start)))

        self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,
                                                            datetime.now().strftime('%Y%m%d-%H:%M:%S'))
        #with open(self._f_eval_train_model, 'wb') as o_file:
        #    pickle.dump(self._model, o_file, -1)
        #o_file.close()

        #self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]],
        #                           ignore_index=True)  ## ignore_index will reset the index or index will be overlaped

        return
Project: kaggle    Author: RankingAI
def evaluate(self):
        """"""
        ## not truncate outliers
        pred_valid = pd.DataFrame(index=self.ValidData.index)
        pred_valid['parcelid'] = self.ValidData['parcelid']

        truth_valid = pd.DataFrame(index=self.ValidData.index)
        truth_valid['parcelid'] = self.ValidData['parcelid']

        start = time.time()

        for d in self._l_valid_predict_columns:
            l_valid_columns = ['%s%s' % (c, d) if (c in ['lastgap', 'monthyear', 'buildingage']) else c for c in
                               self._l_train_columns]

            extra_va = pd.read_hdf(path_or_buf='%s/p21/eval_valid_%s.hdf' % (self.InputDir, d), key='valid')
            #ValidData = self.ValidData.join(extra_va, on= 'parcelid', how= 'left')
            ValidData = pd.concat([self.ValidData, extra_va.drop('parcelid', axis= 1)], axis= 1)

            x_valid = ValidData[l_valid_columns]
            x_valid = x_valid.values.astype(np.float32, copy=False)
            pred_valid[d] = self._model.predict(x_valid)  # * 0.99 + 0.011 * 0.01
            df_tmp = ValidData[ValidData['transactiondate'].dt.month == int(d[-2:])]
            truth_valid.loc[df_tmp.index, d] = df_tmp['logerror']

        score = 0.0
        ae = np.abs(pred_valid - truth_valid)
        for col in ae.columns:
            score += np.sum(ae[col])
        score /= len(pred_valid)  ##!! divided by number of instances, not the number of 'cells'
        print('============================= ')
        print('Local MAE is %.6f' % score)
        print('=============================')

        end = time.time()

        del self.ValidData
        gc.collect()

        print('time elapsed %ds' % (end - start))
Project: kaggle    Author: RankingAI
def train(self):
        """"""
        start = time.time()

        extra_tr = pd.read_hdf(path_or_buf='%s/p21/eval_train.hdf' % self.InputDir, key='train')

        print('size before truncated outliers is %d ' % len(self.TrainData))
        self.TrainData = self.TrainData[(self.TrainData['logerror'] > self._low) & (self.TrainData['logerror'] < self._up)]
        #self.TrainData = self.TrainData.join(extra_tr, on='parcelid', how= 'left')
        self.TrainData = pd.concat([self.TrainData, extra_tr.drop('parcelid', axis= 1)], axis = 1)
        print('size after truncated outliers is %d ' % len(self.TrainData))

        X = self.TrainData.drop(self._l_drop_cols, axis=1)
        Y = self.TrainData['logerror']
        self._l_train_columns = X.columns
        X = X.values.astype(np.float32, copy=False)

        lr = Lasso(alpha= self._lr_alpha, max_iter= self._lr_iter, tol= 1e-4, random_state= 2017, selection= self._lr_sel)
        self._model = lr.fit(X, Y)
        end = time.time()

        print('Training iterates %d, time consumed %d ' % (self._model.n_iter_, (end - start)))

        self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,
                                                            datetime.now().strftime('%Y%m%d-%H:%M:%S'))
        with open(self._f_eval_train_model, 'wb') as o_file:
            pickle.dump(self._model, o_file, -1)
        o_file.close()

        #self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]],
        #                           ignore_index=True)  ## ignore_index will reset the index or index will be overlaped

        return
Project: kaggle    Author: RankingAI
def evaluate(self):
        """"""
        ## not truncate outliers
        pred_valid = pd.DataFrame(index=self.ValidData.index)
        pred_valid['parcelid'] = self.ValidData['parcelid']

        truth_valid = pd.DataFrame(index=self.ValidData.index)
        truth_valid['parcelid'] = self.ValidData['parcelid']

        start = time.time()

        for d in self._l_valid_predict_columns:
            l_valid_columns = ['%s%s' % (c, d) if (c in ['lastgap', 'monthyear', 'buildingage']) else c for c in
                               self._l_train_columns]

            extra_va = pd.read_hdf(path_or_buf='%s/p21/eval_valid_%s.hdf' % (self.InputDir, d), key='valid')
            #ValidData = self.ValidData.join(extra_va, on= 'parcelid', how= 'left')
            ValidData = pd.concat([self.ValidData, extra_va.drop('parcelid', axis= 1)], axis= 1)

            x_valid = ValidData[l_valid_columns]
            x_valid = x_valid.values.astype(np.float32, copy=False)
            pred_valid[d] = self._model.predict(x_valid)  # * 0.99 + 0.011 * 0.01
            df_tmp = ValidData[ValidData['transactiondate'].dt.month == int(d[-2:])]
            truth_valid.loc[df_tmp.index, d] = df_tmp['logerror']

        score = 0.0
        ae = np.abs(pred_valid - truth_valid)
        for col in ae.columns:
            score += np.sum(ae[col])
        score /= len(pred_valid)  ##!! divided by number of instances, not the number of 'cells'
        print('============================= ')
        print('Local MAE is %.6f' % score)
        print('=============================')

        end = time.time()

        del self.ValidData
        gc.collect()

        print('time elapsed %ds' % (end - start))
Project: kaggle    Author: RankingAI
def LoadFromHdfFile(InputDir, mode = 'train'):

        if(mode == 'train'):
            data = pd.read_hdf(path_or_buf= '%s/train.hdf' % InputDir, key='train')
        elif(mode == 'valid'):
            data = pd.read_hdf(path_or_buf= '%s/valid.hdf' % InputDir, key='valid')
        else:
            data = pd.read_hdf(path_or_buf= '%s/test.hdf' % InputDir, key='test')

        return data

Project: triage    Author: dssg
def recover_matrix(config, directory='.'):
    """Recover a matrix by either its config or uuid.

    Parameters
    ----------
    config: str or dict
        config metadata for the matrix or uuid
    directory: str
        path to search for the matrix

    Returns
    -------
    df_matrix: DataFrame
        DataFrame of specified matrix
    None:
        If no matrix is found
    """

    if isinstance(config, dict):
        uuid = generate_uuid(config)
    else:
        uuid = config

    fname = directory + '/' + uuid

    if os.path.isfile(fname + '.h5'):
        df_matrix = pd.read_hdf(fname + '.h5')
        return df_matrix
    elif os.path.isfile(fname + '.csv'):
        df_matrix = pd.read_csv(fname + '.csv')
        return df_matrix
    else:
        return None
Project: triage    Author: dssg
def get_matrix_and_metadata(matrix_path, metadata_path):
    """Retrieve a matrix in hdf format and
    metadata about the matrix in yaml format

    Returns: (tuple) matrix, metadata
    """
    matrix = pandas.read_hdf(matrix_path)
    with open(metadata_path) as f:
        metadata = yaml.load(f)
    return matrix, metadata
Project: keras-molecules    Author: maxhodak
def read_smiles_data(filename):
    import pandas as pd
    h5f = pd.read_hdf(filename, 'table')
    data = h5f['structure'][:]
    # import gzip
    # data = [line.split()[0].strip() for line in gzip.open(filename) if line]
    return data
Project: DREAM_invivo_tf_binding_prediction_challenge_baseline    Author: nboley
def __init__(self,
                 labels_fname,
                 regions_fname=None,
                 max_n_rows=None,
                 load_cached=True):
        self.labels_fname = labels_fname
        self.regions_fname = regions_fname
        self.max_n_rows = max_n_rows
        self._hash = None
        self.load_cached = load_cached
        # extract the sample names from the header
        #assert labels_fname.endswith("labels.tsv.gz"), \
        #    "Unrecognized labels filename '%s'" % labels_fname
        self._init_header_data(labels_fname)
        # extract the factor from the filename
        self.factor = os.path.basename(labels_fname).split('.')[0]

        # if we want to use a cached version...
        if self.load_cached is True:
            try:
                print "Loading '%s'" % self.cached_fname
                self.h5store = h5py.File(self.cached_fname)
                self.data = pd.read_hdf(self.cached_fname, 'data')
            except KeyError:
                self.data = self._build_dataframe()
                self.data.to_hdf(self.cached_fname, 'data')
                print self.h5store
        else:
            self.data = self._build_dataframe()

        return
Project: DREAM_invivo_tf_binding_prediction_challenge_baseline    Author: nboley
def load_or_build_motif_scores(self, fasta_fname):
        try:
            self.motif_scores = pd.read_hdf(self.cached_fname, 'motif_scores')
            self.motif_scores.index = self.data.index
        except KeyError:
            self.motif_scores = self.build_motif_scores(fasta_fname)
            self.motif_scores.to_hdf(self.cached_fname, 'motif_scores')
        return self.motif_scores
Project: DREAM_invivo_tf_binding_prediction_challenge_baseline    Author: nboley
def load_or_build_dnase_fc_scores(self):
        try:
            self.dnase_fc_scores = pd.read_hdf(self.cached_fname, 'dnase_scores')
        except KeyError:
            self.dnase_fc_scores = self.build_dnase_fc_scores()
            self.dnase_fc_scores.to_hdf(self.cached_fname, 'dnase_scores')
        except IOError:
            self.dnase_fc_scores = self.build_dnase_fc_scores()            
        return self.dnase_fc_scores
Project: catalyst    Author: enigmampc
def __init__(self, path):
        self._panel = pd.read_hdf(path)
Project: netwars    Author: i008
def main(batch_size=10000):
    posts_df = pd.read_hdf('nw_posts.hdf5', 'posts')
    index_posts_in_elastic(posts_df, batch_size=batch_size)
Project: Tensorflow-Softmax-NER-RNNLM    Author: queue-han
def load_wv_pandas(fname):
    return pd.read_hdf(fname, 'data')
Project: PythonTrading    Author: F2011B
def get_availableExchanges():
    SymbolsDF = pd.read_hdf(Constants.InputFolder + 'Symbols.hdf', 'Symbols')
    return SymbolsDF.EXCHANGE.drop_duplicates().values
Project: PythonTrading    Author: F2011B
def get_availableSymbols(SymbolFilter=None):
    SymbolsDF = pd.read_hdf(Constants.InputFolder+'Symbols.hdf', 'Symbols')

    if SymbolFilter is None:
        DFNew = SymbolsDF.loc[lambda DF: DF.EXCHANGE == 'NYSE', :]
        return DFNew.loc[DFNew.SYMBOL.str.match('[A-Z]{1,4}$'), :].SYMBOL.values

    if not ('Exchange' in SymbolFilter.keys()):
        DFNew = SymbolsDF.loc[lambda DF: DF.EXCHANGE == 'NYSE', :]
        return DFNew.loc[DFNew.SYMBOL.str.match('[A-Z]{1,4}$'), :].SYMBOL.values

    DFNew = SymbolsDF.loc[lambda DF: DF.EXCHANGE == SymbolFilter['Exchange'], :]
    return DFNew.loc[DFNew.SYMBOL.str.match('[A-Z]{1,4}$'), :].SYMBOL.values
Project: PythonTrading    Author: F2011B
def get_availableSymbols(SymbolFilter=None):
    DF=pd.read_hdf(Constants.InputFolder+'Symbols.hdf', 'OANDA')
    return DF.instrument.values
Project: PythonTrading    Author: F2011B
def main():
    DF = pd.read_hdf('/home/lc1bfrbl/Database/Oanda.hdf', 'WTICO_USD_H1')
    TTT=CalcTaylorCycle(DF)
    Index = (TTT.index.year == 2017) & (TTT.index.month == 6)
    TTT[Index].MO.plot()
    TTT[Index].MLo.plot()
    TTT[Index].MHi.plot()
    TTT[Index].High.plot()
    TTT[Index].Low.plot()
Project: PyTrader    Author: didw
def save_table(self, code, date):
        TR_REQ_TIME_INTERVAL = 4
        time.sleep(TR_REQ_TIME_INTERVAL)
        data_81 = self.wrapper.get_data_opt10081(code, date)
        time.sleep(TR_REQ_TIME_INTERVAL)
        data_86 = self.wrapper.get_data_opt10086(code, date)
        col_86 = ['전일비', '등락률', '금액(백만)', '신용비', '개인', '기관', '외인수량', '외국계', '프로그램',
                  '외인비', '체결강도', '외인보유', '외인비중', '외인순매수', '기관순매수', '개인순매수', '신용잔고율']
        data = pd.concat([data_81, data_86.loc[:, col_86]], axis=1)
        #con = sqlite3.connect("../data/stock.db")
        try:
            data = data.loc[data.index > int(self.kiwoom.start_date.strftime("%Y%m%d"))]
            #orig_data = pd.read_sql("SELECT * FROM '%s'" % code, con, index_col='일자').sort_index()
            orig_data = pd.read_hdf("../data/hdf/%s.hdf" % code, 'day').sort_index()
            end_date = orig_data.index[-1]
            orig_data = orig_data.loc[orig_data.index < end_date]
            data = data.loc[data.index >= end_date]
            data = pd.concat([orig_data, data], axis=0)
        except (FileNotFoundError, IndexError) as e:
            print(e)
            pass
        finally:
            data.index.name = '일자'
            if len(data) != 0:
                #data.to_sql(code, con, if_exists='replace')
                data.to_hdf('../data/hdf/%s.hdf'%code, 'day', mode='w')
Project: PyTrader    Author: didw
def read_h5():
    code_list = glob.glob('../data/stock/*.h5')
    for code in code_list[:10]:
        data = pd.read_hdf(code, 'table').sort_index()
        data = data.loc[data.index >= str(20160101)]
        data = data.loc[data.index <= str(20160630)]
        print(data.head())
Project: zeex    Author: zbarge
def superReadFile(filepath,**kwargs):
    """ 
    Uses pandas.read_excel (on excel files) and returns a dataframe of the first sheet (unless sheet is specified in kwargs)
    Uses superReadText (on .txt,.tsv, or .csv files) and returns a dataframe of the data.
    One function to read almost all types of data files.    
    """
    if isinstance(filepath, pd.DataFrame):
        return filepath

    ext = os.path.splitext(filepath)[1].lower()

    if ext in ['.xlsx', '.xls']:
        kwargs.pop('dtype', None)
        return pd.read_excel(filepath,**kwargs)

    elif ext in ['.txt','.tsv','.csv']:
        return superReadText(filepath, **kwargs)

    elif ext in ['.gz', '.bz2', '.zip', '.xz']:
        return superReadCSV(filepath, **kwargs)

    elif ext in ['.h5']:
        return pd.read_hdf(filepath)

    else:
        raise NotImplementedError("Unable to read '{}' files".format(ext))
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_conv_read_write(self):
        path = create_tempfile(self.path)
        try:
            def roundtrip(key, obj, **kwargs):
                obj.to_hdf(path, key, **kwargs)
                return read_hdf(path, key)

            o = tm.makeTimeSeries()
            assert_series_equal(o, roundtrip('series', o))

            o = tm.makeStringSeries()
            assert_series_equal(o, roundtrip('string_series', o))

            o = tm.makeDataFrame()
            assert_frame_equal(o, roundtrip('frame', o))

            o = tm.makePanel()
            assert_panel_equal(o, roundtrip('panel', o))

            # table
            df = DataFrame(dict(A=lrange(5), B=lrange(5)))
            df.to_hdf(path, 'table', append=True)
            result = read_hdf(path, 'table', where=['index>2'])
            assert_frame_equal(df[df.index > 2], result)

        finally:
            safe_remove(path)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_round_trip_equals(self):
        # GH 9330
        df = DataFrame({"B": [1, 2], "A": ["x", "y"]})

        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df', format='table')
            other = read_hdf(path, 'df')
            tm.assert_frame_equal(df, other)
            self.assertTrue(df.equals(other))
            self.assertTrue(other.equals(df))
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_to_hdf_with_object_column_names(self):
        # GH9057
        # Writing HDF5 table format should only work for string-like
        # column types

        types_should_fail = [tm.makeIntIndex, tm.makeFloatIndex,
                             tm.makeDateIndex, tm.makeTimedeltaIndex,
                             tm.makePeriodIndex]
        types_should_run = [tm.makeStringIndex, tm.makeCategoricalIndex]

        if compat.PY3:
            types_should_run.append(tm.makeUnicodeIndex)
        else:
            types_should_fail.append(tm.makeUnicodeIndex)

        for index in types_should_fail:
            df = DataFrame(np.random.randn(10, 2), columns=index(2))
            with ensure_clean_path(self.path) as path:
                with self.assertRaises(
                        ValueError, msg=("cannot have non-object label "
                                         "DataIndexableCol")):
                    df.to_hdf(path, 'df', format='table', data_columns=True)

        for index in types_should_run:
            df = DataFrame(np.random.randn(10, 2), columns=index(2))
            with ensure_clean_path(self.path) as path:
                df.to_hdf(path, 'df', format='table', data_columns=True)
                result = pd.read_hdf(
                    path, 'df', where="index = [{0}]".format(df.index[0]))
                assert(len(result))
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_read_hdf_errors(self):
        df = DataFrame(np.random.rand(4, 5),
                       index=list('abcd'),
                       columns=list('ABCDE'))

        with ensure_clean_path(self.path) as path:
            self.assertRaises(IOError, read_hdf, path, 'key')
            df.to_hdf(path, 'df')
            store = HDFStore(path, mode='r')
            store.close()
            self.assertRaises(IOError, read_hdf, store, 'df')
            with open(path, mode='r') as store:
                self.assertRaises(NotImplementedError, read_hdf, store, 'df')
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_read_nokey(self):
        df = DataFrame(np.random.rand(4, 5),
                       index=list('abcd'),
                       columns=list('ABCDE'))
        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df', mode='a')
            reread = read_hdf(path)
            assert_frame_equal(df, reread)
            df.to_hdf(path, 'df2', mode='a')
            self.assertRaises(ValueError, read_hdf, path)