我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用pandas.date_range()。
def fill_nans(df, delta=None):
    """
    Reindex *df* onto a regularly spaced index from its first to last label.

    When *delta* is falsy, the spacing is taken to be the smallest
    difference between consecutive index values and is logged. Index
    values present in the regular index but absent from *df* are logged
    as warnings. Returns the tuple ``(reindexed frame, delta)``.
    """
    if not delta:
        smallest_step = min(NP.diff(df.index.values))
        step_seconds = smallest_step / NP.timedelta64(1, 's')
        delta = timedelta(seconds=step_seconds)
        logger.info('Using delta = {} (s)'.format(delta.total_seconds()))
    first_label = df.index[0]
    last_label = df.index[-1]
    index_new = PD.date_range(start=first_label, end=last_label, freq=delta)
    missing = sorted(set(index_new) - set(df.index))
    if missing:
        logger.warning('Missing time indices (filled by NaNs):')
        for stamp in missing:
            logger.warning(stamp)
    filled = df.reindex(index_new, copy=False)
    return filled, delta
def test_nan_filter_dataframe(self):
    """Check the sids emitted by DataFrameSource for a frame holding NaNs."""
    dates = pd.date_range('1/1/2000', periods=2, freq='B', tz='UTC')
    df = pd.DataFrame(np.random.randn(2, 2), index=dates, columns=[4, 5])
    first_day = dates[0]
    second_day = dates[1]

    # should be filtered
    df.loc[first_day, 4] = np.nan
    # should not be filtered, should have been ffilled
    df.loc[second_day, 5] = np.nan
    source = DataFrameSource(df)

    # The first three events must carry sids 5, 4 and 5, in that order.
    for expected_sid in (5, 4, 5):
        event = next(source)
        self.assertEqual(expected_sid, event.sid)
    # The last event checked above must have a non-NaN price.
    self.assertFalse(np.isnan(event.price))
def test_nan_filter_panel(self):
    """Check the sids emitted by DataPanelSource for a panel holding NaNs."""
    dates = pd.date_range('1/1/2000', periods=2, freq='B', tz='UTC')
    df = pd.Panel(
        np.random.randn(2, 2, 2),
        major_axis=dates,
        items=[4, 5],
        minor_axis=['price', 'volume'],
    )
    first_day = dates[0]
    second_day = dates[1]

    # should be filtered
    df.loc[4, first_day, 'price'] = np.nan
    # should not be filtered, should have been ffilled
    df.loc[5, second_day, 'price'] = np.nan
    source = DataPanelSource(df)

    # Exactly two events are expected (sids 5 then 4) before exhaustion.
    for expected_sid in (5, 4):
        event = next(source)
        self.assertEqual(expected_sid, event.sid)
    self.assertRaises(StopIteration, next, source)
def getCalendar(self):
    """
    Build the trade calendar covering ``self.begin`` to ``self.end``.

    :return: the frame produced by passing a per-day calendar (columns
        ``date``, ``type`` and ``weekday``, indexed by ``date``) through
        ``_holiday_trade_day_type`` and then ``_tradestatus``
    """
    # One row per calendar day in the requested span
    every_day = pd.date_range(self.begin, self.end)
    calendar_frame = pd.DataFrame(data=every_day, columns=['date'])

    # Day type and weekday come from the weekend helper; the weekday
    # values it returns are incremented by one.
    types, weekdays = self._weekend_trade_day_type(calendar_frame["date"])
    calendar_frame["type"] = types
    calendar_frame["weekday"] = weekdays
    calendar_frame["weekday"] += 1
    calendar_frame = calendar_frame.set_index("date", drop=False)

    # Holiday handling, then trade status
    calendar_frame = self._holiday_trade_day_type(calendar_frame)
    return self._tradestatus(calendar_frame)
def make_features(user_id, user_df):
    """
    Write the 30 per-day feature CSV files for one user.

    A frame indexed by the union of ``user_df.index`` and the days of
    September 2016 receives 60 columns ``power#-k`` (k = 1..60), each
    holding ``user_df.power_consumption`` with its index moved forward by
    k days. For every d in 0..29 the columns d..d+29 plus a ``y`` column
    (the unshifted power series) are written to
    ``./features/day_model/<d+1>/<user_id>.csv``.

    The function asserts that ``user_df`` has exactly 639 rows.
    """
    # Single-argument print call: same output under Python 2 and 3.
    print('user_id: %s' % (user_id,))
    power = user_df.power_consumption
    assert power.index[0] == user_df.index[0]
    assert len(user_df.index) == 639

    september = pd.date_range('2016-9-1', '2016-9-30')
    new_df = pd.DataFrame(index=user_df.index.union(september))
    pw_new = power.copy()

    # 60 lag columns: 30 prediction days plus 30 days of features
    one_day = pd.Timedelta('1D')
    for d in range(60):
        pw_new.index += one_day
        new_df['power#-%d' % (d + 1)] = pw_new

    # One file per model; each sees a 30-column window of lags
    for d in range(30):
        # Explicit copy so adding 'y' never targets a view of new_df.
        x_ = new_df[new_df.columns[d:30 + d]].copy()
        x_['y'] = power
        x_.to_csv('./features/day_model/%d/%d.csv' % (d + 1, user_id))
def make_month_features(user_id, user_df):
    """
    Build, save and return the monthly feature frame for one user.

    The frame is indexed by the union of ``user_df.index`` and the days of
    October 2016. It gets 30 columns ``power#-k`` (power moved forward by
    k days, k = 1..30) and 31 columns ``y#d`` (power moved back by d days,
    d = 0..30). The frame is passed to ``save_month_df`` and returned.
    """
    # Single-argument print call: same output under Python 2 and 3.
    print('user_id: %s' % (user_id,))
    power = user_df.power_consumption.copy()
    assert power.index[0] == user_df.index[0]

    october = pd.date_range('2016-10-1', '2016-10-31')
    new_df = pd.DataFrame(index=user_df.index.union(october))
    pw_new = power.copy()
    one_day = pd.Timedelta('1D')

    # Lagged inputs
    for d in range(30):
        pw_new.index += one_day
        new_df['power#-%d' % (d + 1)] = pw_new

    # Targets: the index of `power` is moved back one day per column
    for d in range(31):
        new_df['y#%d' % d] = power
        power.index -= one_day

    save_month_df(new_df, user_id)
    return new_df
def create_energysystem(nodes, **arguments):
    """Create the energy system.

    Parameters
    ----------
    nodes:
        A list of entities that comprise the energy system
    **arguments : key word arguments
        Arguments passed from command line; ``'--date-from'`` and
        ``'--date-to'`` bound the 60-minute time index.
    """
    first = arguments['--date-from']
    last = arguments['--date-to']
    datetime_index = pd.date_range(first, last, freq='60min')
    return EnergySystem(entities=nodes,
                        groupings=GROUPINGS,
                        timeindex=datetime_index)
def _from_dataset_test_variables(self):
    """Return the ``(variables, coords)`` dicts used by the from_dataset tests"""
    variables = {}
    # 3d-variable
    variables['v0'] = xr.Variable(('time', 'ydim', 'xdim'),
                                  np.zeros((4, 4, 4)))
    # 2d-variable with time and x
    variables['v1'] = xr.Variable(('time', 'xdim', ), np.zeros((4, 4)))
    # 2d-variable with y and x
    variables['v2'] = xr.Variable(('ydim', 'xdim', ), np.zeros((4, 4)))
    # 1d-variable
    variables['v3'] = xr.Variable(('xdim', ), np.zeros(4))

    month_ends = pd.date_range('1999-01-01', '1999-05-01', freq='M').values
    coords = {}
    coords['ydim'] = xr.Variable(('ydim', ), np.arange(1, 5))
    coords['xdim'] = xr.Variable(('xdim', ), np.arange(4))
    coords['time'] = xr.Variable(('time', ), month_ends)
    return variables, coords
def make_features(locations_file='blocations.csv',
                  timeseries_file='burundioutput.csv',
                  startdate='2015-05-01'):
    """
    Build the list of features for every location in *locations_file*.

    The time series CSV is re-indexed with consecutive daily dates starting
    at *startdate*. For each location row, a frame of ``loctype`` and
    ``population`` is passed to ``mgj.make_gj_points`` and the returned
    items are collected into one list.
    """
    locations = pd.read_csv(locations_file)
    timeseries = pd.read_csv(timeseries_file)

    # Construct an index with real dates rather than day numbers
    timeseries.index = pd.date_range(startdate, periods=timeseries.shape[0])

    features = []
    for place in locations.itertuples(name='Location'):
        loctype_by_day = get_loctype(place, timeseries.index)
        population_by_day = get_population(timeseries, place.name)
        per_day = pd.DataFrame({'loctype': loctype_by_day,
                                'population': population_by_day})
        coordinates = (place.latitude, place.longitude)
        features.extend(mgj.make_gj_points(coordinates, place.name, per_day))
    return features
def test_make_gj_points():
    """make_gj_points yields one feature per day with the expected fields."""
    index = pandas.date_range('2015-3-1', periods=100)
    popn = pandas.Series([500 * n for n in range(100)], index=index)
    loctype = pandas.Series(['city'] * 50 + ['conflict'] * 50, index=index)
    timeseries = pandas.DataFrame({'loctype': loctype, 'population': popn})

    res = make_geojson.make_gj_points((52.0, 0.0), 'Examplecamp', timeseries)

    assert len(res) == 100

    first = res[0]
    assert first['type'] == 'Feature'
    assert first['properties']['start'] == '2015-03-01'
    assert first['properties']['end'] == '2015-03-02'
    assert first['properties']['loctype'] == 'city'
    # Coordinates come back in the opposite order to the input pair
    assert first['geometry']['coordinates'] == (0.0, 52.0)

    later = res[50]
    assert later['properties']['loctype'] == 'conflict'
    assert later['properties']['start'] == '2015-04-20'
def date_range_index(self, start, end=None, by=24):
    """
    Return hourly UTC time sequence(s) for indexing intervals of 'by' hours.

    start and end are expected in local time (``self.timezone.zone``).

    If end is None, a single DatetimeIndex of 'by' hourly steps beginning
    at start is returned. Otherwise a list of DatetimeIndex objects is
    returned, one per consecutive 'by'-hour bin between start and end;
    each holds the hourly steps t with ``bin_start <= t < bin_end``.
    Hours after the last complete bin edge are not returned.
    """
    zone = self.timezone.zone
    # Offset objects instead of the 'H' string alias, which pandas 2.2
    # deprecated; Hour(n) is what 'nH' parsed to.
    one_hour = pandas.offsets.Hour()

    if end is None:
        seq = pandas.date_range(start=start, periods=by, freq=one_hour,
                                tz=zone)
        return seq.tz_convert('UTC')

    seq = pandas.date_range(start=start, end=end, freq=one_hour, tz=zone)
    seq = seq.tz_convert('UTC')
    bins = pandas.date_range(start=start, end=end,
                             freq=pandas.offsets.Hour(by), tz=zone)
    bins = bins.tz_convert('UTC')
    return [seq[(seq >= lower) & (seq < upper)]
            for lower, upper in zip(bins[:-1], bins[1:])]
def __init__(self, year, seasons=None, holidays=None):
    """
    Set up the quarter-hourly index and load profiles for *year*.

    Parameters
    ----------
    year : int
        Calendar year; leap years get 8784 hours, others 8760.
    seasons : dict, optional
        Maps a season name to ``[start_month, start_day, end_month,
        end_day]``. The built-in table is used when None.
    holidays : optional
        Forwarded unchanged to ``self.all_load_profiles``.
    """
    hoy = 8784 if calendar.isleap(year) else 8760
    self.datapath = os.path.join(os.path.dirname(__file__), 'bdew_data')
    # pd.Timestamp replaces the pd.datetime alias, which was removed
    # from the pandas namespace.
    self.date_time_index = pd.date_range(
        pd.Timestamp(year=year, month=1, day=1),
        periods=hoy * 4, freq='15Min')
    if seasons is None:
        self.seasons = {
            'summer1': [5, 15, 9, 14],  # summer: 15.05. to 14.09
            'transition1': [3, 21, 5, 14],  # transition1 :21.03. to 14.05
            'transition2': [9, 15, 10, 31],  # transition2 :15.09. to 31.10
            'winter1': [1, 1, 3, 20],  # winter1: 01.01. to 20.03
            'winter2': [11, 1, 12, 31],  # winter2: 01.11. to 31.12
        }
    else:
        self.seasons = seasons
    self.year = year
    self.slp_frame = self.all_load_profiles(self.date_time_index,
                                            holidays=holidays)
def date_op():
    """
    Print a few demonstrations of date-indexed pandas objects.

    Shows the type returned by ``pd.date_range``, a frame indexed by a
    list of datetimes, and label-based selection of a single day and of
    a whole month from a daily-indexed frame.
    """
    start = pd.date_range('2015-01-01', periods=50)
    print(type(start))

    date_list = [datetime.datetime(2017, 1, day) for day in range(1, 5)]
    df = pd.DataFrame(np.random.randn(4), index=date_list)
    print(df)
    print(df.index[2])
    format_line()

    s_x = pd.date_range('2000-1-1', periods=1000)
    df_x = pd.DataFrame(np.arange(2000).reshape(1000, 2), index=s_x)
    print(df_x)
    # .loc replaces the removed .ix indexer for label-based selection.
    print(df_x.loc['2002/09/24'])
    # Plain [] selects the column labelled 1
    print(df_x[1])
    # A partial date string selects every row of that month
    print(df_x.loc['2001-09'])
def convert_data_to_timeseries(input_file, column, verbose=False):
    """
    Load a comma-separated file and return one column as a monthly Series.

    The first two columns of the file are read as year and month. The
    returned Series is indexed by month-end dates running from the month
    of the first row through the month of the last row, so the rows are
    expected to be consecutive months.

    Parameters
    ----------
    input_file : path or file object accepted by ``numpy.loadtxt``
    column : int
        Index of the data column to extract.
    verbose : bool
        When true, print the date bounds and the first ten values.
    """
    # ndmin=2 keeps a single-row file two-dimensional.
    data = np.loadtxt(input_file, delimiter=',', ndmin=2)

    first_year, first_month = int(data[0, 0]), int(data[0, 1])
    last_year, last_month = int(data[-1, 0]), int(data[-1, 1])

    # The end bound is the month after the last row. The year only rolls
    # over after December; incrementing it unconditionally made the index
    # a year too long for data ending in any other month.
    if last_month == 12:
        end_year, end_month = last_year + 1, 1
    else:
        end_year, end_month = last_year, last_month + 1

    start_date = '%d-%d' % (first_year, first_month)
    end_date = '%d-%d' % (end_year, end_month)

    if verbose:
        print("\nStart date = %s" % start_date)
        print("End date = %s" % end_date)

    # Month-end steps; an offset object avoids the deprecated 'M' alias.
    dates = pd.date_range(
        pd.Timestamp(year=first_year, month=first_month, day=1),
        pd.Timestamp(year=end_year, month=end_month, day=1),
        freq=pd.offsets.MonthEnd())

    data_timeseries = pd.Series(data[:, column], index=dates)

    if verbose:
        print("\nTime series data:\n%s" % data_timeseries[:10])

    return data_timeseries
def get_gsod_data(self, station, year):
    """
    Return a daily UTC-indexed Series of temperatures for *station*.

    Every line after the first is split on whitespace; field 2 is parsed
    as a ``%Y%m%d`` date and field 3 as a Fahrenheit value, which is
    stored in Celsius. Days without a line stay NaN.
    """
    filename_format = '/pub/data/gsod/{year}/{station}-{year}.op.gz'
    lines = self._retreive_file_lines(filename_format, station, year)

    first_day = "{}-01-01 00:00".format(year)
    last_day = "{}-12-31 00:00".format(year)
    dates = pd.date_range(first_day, last_day, freq='D', tz=pytz.UTC)
    series = pd.Series(None, index=dates, dtype=float)

    # The first line is skipped
    for record in lines[1:]:
        fields = record.split()
        date_str = fields[2].decode('utf-8')
        fahrenheit = float(fields[3])
        celsius = (5. / 9.) * (fahrenheit - 32.)
        day = pytz.UTC.localize(datetime.strptime(date_str, "%Y%m%d"))
        series[day] = celsius
    return series
def get_isd_data(self, station, year):
    """
    Return an hourly UTC-indexed Series of temperatures for *station*.

    Characters 87-91 of each line hold the temperature in tenths of a
    degree (``+9999`` is stored as NaN) and characters 15-26 the
    ``%Y%m%d%H%M`` timestamp. Only the first non-null reading of each
    hour is kept.
    """
    filename_format = '/pub/data/noaa/{year}/{station}-{year}.gz'
    lines = self._retreive_file_lines(filename_format, station, year)

    # NOTE(review): the index runs through the end of year + 1, not just
    # the requested year - confirm this is intended.
    # An Hour offset replaces the 'H' alias deprecated in pandas 2.2.
    dates = pd.date_range("{}-01-01 00:00".format(year),
                          "{}-12-31 23:00".format(int(year) + 1),
                          freq=pd.offsets.Hour(), tz=pytz.UTC)
    series = pd.Series(None, index=dates, dtype=float)

    for line in lines:
        if line[87:92].decode('utf-8') == "+9999":
            temp_C = float("nan")
        else:
            temp_C = float(line[87:92]) / 10.
        date_str = line[15:27].decode('utf-8')

        # there can be multiple readings per hour, so set all to minute 0
        dt = pytz.UTC.localize(
            datetime.strptime(date_str, "%Y%m%d%H%M")).replace(minute=0)

        # only set the temp if it's the first encountered in the hour.
        # (.loc replaces the removed .ix indexer.)
        if pd.isnull(series.loc[dt]):
            series[dt] = temp_C
    return series
def test_to_records(serializer):
    """to_records keys each row by ``start`` and keeps the input order."""
    index = pd.date_range('2000-01-01', periods=2, freq='D')
    df = pd.DataFrame(
        {"value": [1, np.nan], "estimated": [True, False]},
        index=index,
        columns=["value", "estimated"],
    )

    records = serializer.to_records(df)
    assert len(records) == 2

    first = records[0]
    assert first["start"] == datetime(2000, 1, 1, tzinfo=pytz.UTC)
    assert first["value"] == 1
    assert first["estimated"]

    second = records[1]
    assert second["start"] == datetime(2000, 1, 2, tzinfo=pytz.UTC)
    assert pd.isnull(second["value"])
    assert not second["estimated"]
def test_to_records(serializer):
    """to_records keys each row by ``end``; values appear moved down one row."""
    index = pd.date_range('2000-01-01', periods=2, freq='D')
    df = pd.DataFrame(
        {"value": [1, np.nan], "estimated": [True, False]},
        index=index,
        columns=["value", "estimated"],
    )

    records = serializer.to_records(df)
    assert len(records) == 2

    first = records[0]
    assert first["end"] == datetime(2000, 1, 1, tzinfo=pytz.UTC)
    assert pd.isnull(first["value"])
    assert not first["estimated"]

    second = records[1]
    assert second["end"] == datetime(2000, 1, 2, tzinfo=pytz.UTC)
    assert second["value"] == 1
    assert second["estimated"]
def meter_input_daily(project_meter_input):
    """Return a meter input with 365 * 4 daily records from 2012-01-01 (UTC)."""
    record_starts = pd.date_range(
        '2012-01-01', periods=365 * 4, freq='D', tz=pytz.UTC)

    records = []
    for start in record_starts:
        records.append({
            "start": start.isoformat(),
            "value": 1.0,
            "estimated": False,
        })

    trace = _natural_gas_input(records)
    trace.update({'interval': 'daily'})
    return {
        "type": "SINGLE_TRACE_SIMPLE_PROJECT",
        "trace": trace,
        "project": project_meter_input,
    }
def meter_input_hourly(project_meter_input):
    """Return a meter input with 365 * 4 * 24 hourly records from 2012-01-01.

    Each record's value is ``1.0`` plus the hour of day of its start.
    """
    record_starts = pd.date_range(
        '2012-01-01', periods=365 * 4 * 24, freq='H', tz=pytz.UTC)

    records = []
    for start in record_starts:
        records.append({
            "start": start.isoformat(),
            "value": 1.0 + start.hour,
            "estimated": False,
        })

    trace = _natural_gas_input(records)
    trace.update({'interval': 'hourly'})
    return {
        "type": "SINGLE_TRACE_SIMPLE_PROJECT",
        "trace": trace,
        "project": project_meter_input,
    }
def meter_input_daily_baseline_only(project_meter_input):
    """Return a meter input with 365 daily records starting 2012-01-01 (UTC)."""
    record_starts = pd.date_range(
        '2012-01-01', periods=365 * 1, freq='D', tz=pytz.UTC)

    records = []
    for start in record_starts:
        records.append({
            "start": start.isoformat(),
            "value": 1.0,
            "estimated": False,
        })

    return {
        "type": "SINGLE_TRACE_SIMPLE_PROJECT",
        "trace": _natural_gas_input(records),
        "project": project_meter_input,
    }
def meter_input_daily_reporting_only(project_meter_input):
    """Return a meter input with 365 daily records starting 2014-02-01 (UTC)."""
    record_starts = pd.date_range(
        '2014-02-01', periods=365 * 1, freq='D', tz=pytz.UTC)

    records = []
    for start in record_starts:
        records.append({
            "start": start.isoformat(),
            "value": 1.0,
            "estimated": False,
        })

    return {
        "type": "SINGLE_TRACE_SIMPLE_PROJECT",
        "trace": _natural_gas_input(records),
        "project": project_meter_input,
    }
def meter_input_daily_with_period_start_end(
        project_meter_input_with_period_start_end):
    """Return a daily meter input (365 * 4 records from 2012-01-01, UTC)
    attached to the given project input."""
    record_starts = pd.date_range(
        '2012-01-01', periods=365 * 4, freq='D', tz=pytz.UTC)

    records = []
    for start in record_starts:
        records.append({
            "start": start.isoformat(),
            "value": 1.0,
            "estimated": False,
        })

    trace = _natural_gas_input(records)
    trace.update({'interval': 'daily'})
    return {
        "type": "SINGLE_TRACE_SIMPLE_PROJECT",
        "trace": trace,
        "project": project_meter_input_with_period_start_end,
    }
def meter_input_strange_interpretation(project_meter_input):
    """Return a meter input whose trace pairs the interpretation
    ``ELECTRICITY_CONSUMPTION_NET`` with the unit ``therm``."""
    record_starts = pd.date_range(
        '2012-01-01', periods=365 * 4, freq='D', tz=pytz.UTC)

    records = []
    for start in record_starts:
        records.append({
            "start": start.isoformat(),
            "value": 1.0,
            "estimated": False,
        })

    trace = {
        "type": "ARBITRARY_START",
        "interpretation": "ELECTRICITY_CONSUMPTION_NET",
        "unit": "therm",
        "records": records,
    }
    return {
        "type": "SINGLE_TRACE_SIMPLE_PROJECT",
        "trace": trace,
        "project": project_meter_input,
    }
def trace4():
    """Return an EnergyTrace of 100 daily, non-estimated values of 1,
    starting 2011-01-01 UTC, in KWH."""
    trace_length = 100
    columns = ["value", "estimated"]
    index = pd.date_range(
        start=datetime(2011, 1, 1, tzinfo=pytz.UTC),
        periods=trace_length,
        freq='D',
        tz=pytz.UTC,
    )
    data = {
        "value": [1] * trace_length,
        "estimated": [False] * trace_length,
    }
    df = pd.DataFrame(data, index=index, columns=columns)
    return EnergyTrace("ELECTRICITY_CONSUMPTION_SUPPLIED", df, unit="KWH")
def parse_raw(filepath, seconds=1):
    '''
    Parse a whitespace-delimited raw file into a time-indexed frame.

    :param filepath: path of the file to read
    :param seconds: int, spacing in seconds between consecutive rows
    :return: dataframe whose index is a regular datetime range named
        ``datetime`` and whose columns are taken from the file
    '''
    data_head = pd.read_csv(filepath, delim_whitespace=True, header=None,
                            nrows=1)
    data = pd.read_csv(filepath, delim_whitespace=True, header=None,
                       skiprows=2)

    # Start timestamp: date from the header row, time from data row 1
    date_start = data_head.iloc[0, 3]
    time_start = data.iloc[1, 0] + ' ' + data.iloc[1, 1]
    datetime_start = pd.to_datetime(date_start + ' ' + time_start)

    # Column labels sit in data row 0, from the third field onwards
    columns = list(data.iloc[0, 2:])

    values = data.iloc[1:, 2:].applymap(convert2float)
    values = values.dropna(axis=0, how='any')
    # Keep rows in which fewer than all cells are still strings
    string_cells_per_row = (values.applymap(type) == str).sum(axis=1)
    values = values.loc[string_cells_per_row < values.shape[1]]
    values = values.applymap(convert2float)

    values.columns = columns
    values.index = pd.date_range(start=datetime_start,
                                 periods=values.shape[0],
                                 freq='%dS' % seconds)
    values.index.name = 'datetime'
    return values
def test_date_range_lower_freq():
    """Frequencies above 1D are rejected by date_range but are reachable
    through convert_freq."""
    cal = mcal.get_calendar("NYSE")
    schedule = cal.schedule(pd.Timestamp('2017-09-05 20:00', tz='UTC'),
                            pd.Timestamp('2017-10-23 20:00', tz='UTC'))

    # cannot get date range of frequency lower than 1D
    with pytest.raises(ValueError):
        mcal.date_range(schedule, frequency='3D')

    # instead get for 1D and convert to lower frequency
    short = mcal.date_range(schedule, frequency='1D')
    for target in ('3D', '1W'):
        actual = mcal.convert_freq(short, target)
        expected = pd.date_range('2017-09-05 20:00', '2017-10-23 20:00',
                                 freq=target, tz='UTC')
        assert_index_equal(actual, expected)
def get_periods_range(start_dt, end_dt, freq):
    """
    Get a date range for the specified parameters.

    Parameters
    ----------
    start_dt: datetime
    end_dt: datetime
    freq: str
        ``'minute'`` and ``'daily'`` are translated to pandas frequency
        aliases; any other value is handed to pandas unchanged.

    Returns
    -------
    DatetimeIndex
    """
    if freq == 'minute':
        # 'min' rather than 'T': the one-letter alias is deprecated since
        # pandas 2.2, while 'min' is accepted by old and new versions.
        freq = 'min'
    elif freq == 'daily':
        freq = 'D'
    return pd.date_range(start_dt, end_dt, freq=freq)
def test_contract_at_offset(self):
    """contract_at_offset walks the chain and returns None past its end."""
    contract_sids = array([1, 2, 3, 4], dtype=int64)
    start_dates = pd.date_range('2015-01-01', periods=4, tz="UTC")

    contracts = deque(self.asset_finder.retrieve_all(contract_sids))

    oc = OrderedContracts('FO', contracts)
    last_start = start_dates[-1].value

    # assertEqual instead of the assertEquals alias, which Python 3.12
    # removed from unittest.
    self.assertEqual(1,
                     oc.contract_at_offset(1, 0, last_start),
                     "Offset of 0 should return provided sid")

    self.assertEqual(2,
                     oc.contract_at_offset(1, 1, last_start),
                     "Offset of 1 should return next sid in chain.")

    self.assertEqual(None,
                     oc.contract_at_offset(4, 1, last_start),
                     "Offset at end of chain should not crash.")
def test_next_event_indexer(self):
    """Validate every sid's column of the next_event_indexer output."""
    events = self.events
    event_sids = events['sid'].values
    event_dates = events['event_date'].values
    event_timestamps = events['timestamp'].values

    all_dates = pd.date_range('2014', '2014-01-31')
    all_sids = np.unique(event_sids)

    indexer = next_event_indexer(
        all_dates,
        all_sids,
        event_dates,
        event_timestamps,
        event_sids,
    )

    # Compute expected results without knowledge of null events.
    for column, sid in enumerate(all_sids):
        per_sid_result = indexer[:, column]
        self.check_next_event_indexer(events, all_dates, sid, per_sid_result)
def force_start_end_data_to_dataframe(user, dataframe, start_date, end_date):
    """
    Return *dataframe* re-indexed to every day from start_date to end_date.

    Rows outside ``[start_date, end_date]`` are dropped; days in that span
    with no row in *dataframe* appear with missing values. The result's
    index is a daily range localized to ``user.pytz_timezone``.
    """
    # isinstance also accepts DataFrame subclasses, unlike a type() equality.
    assert isinstance(dataframe, pd.DataFrame)

    # if dataframe contains any dates outside of start and end date ... exclude
    dataframe = dataframe[start_date:end_date].asfreq('D')

    index = pd.date_range(start=start_date, end=end_date,
                          tz=user.pytz_timezone)

    # blank dataframe that we know for certain holds all the right dates
    dataframe_container = pd.DataFrame(index=index)

    # Left-join onto the blank frame so the result has exactly its index
    normalized_dataframe = dataframe_container.join(dataframe)

    # The join must not have added or removed any dates
    assert dataframe_container.index.size == normalized_dataframe.index.size

    return normalized_dataframe
def _get_serialized_dataframe(self, supplement_name, boolean_string_name,
                              values_to_create):
    """
    Build a one-column daily frame ending today and return it sanitized.

    The column *supplement_name* holds *boolean_string_name* repeated
    *values_to_create* times. The frame is passed through
    ``ExcelSupplementFileSerializer._sanitize_dataframe_values``.
    """
    data_values = [boolean_string_name] * values_to_create

    today = datetime.date.today()
    periods_ago = today - datetime.timedelta(days=values_to_create - 1)
    date_range = pd.date_range(periods_ago, today)

    # Guard against an off-by-one between values and dates
    self.assertEqual(len(data_values), len(date_range))

    dataframe = pd.DataFrame(index=date_range)
    dataframe[supplement_name] = data_values

    # make sure there's no dynamic type conversion of the stored value.
    # .iloc is explicit positional access; plain series[0] on a datetime
    # index relied on a fallback deprecated in pandas 2.1.
    series = dataframe[supplement_name]
    self.assertEqual(series.iloc[0], boolean_string_name)

    serialized_dataframe = (
        ExcelSupplementFileSerializer._sanitize_dataframe_values(dataframe))
    return serialized_dataframe
def import_history(self, start_date, end_date):
    """
    Collect per-day efficiency data into ``self.results``.

    One request is made for each date from start_date to end_date; days
    whose response status is not 200 are skipped. Each stored row holds
    the day's efficiency values plus the computed productivity pulse.
    """
    dataframe_columns = RESCUETIME_EFFICIENCY_HEADERS + [PRODUCTIVITY_PULSE]
    historical_df = pd.DataFrame(columns=dataframe_columns)

    for query_date in pd.date_range(start=start_date, end=end_date).date:
        response = self._get_rescuetime_efficiency_for_date(query_date)
        if response.status_code == 200:
            day_series = self.get_efficiency_timeseries_from_response(response)
            day_series[PRODUCTIVITY_PULSE] = (
                calculate_rescue_time_pulse_from_dataframe(day_series))
            # Update the dataframe with history
            historical_df.loc[query_date] = day_series

    # when done, update into the results
    self.results = historical_df
def __init__(self, user, periods_back=30):
    """
    Prepare the date index and zeroed impact series for *user*.

    :param user: stored on the instance as ``self.user``
    :param periods_back: number of daily points, ending now, in the index
    """
    self.user = user
    self.hour_series = range(24)

    # Daily index ending at the current (timezone-aware) time
    now = timezone.now()
    self.date_series = pd.date_range(end=now, freq='D', periods=periods_back)

    # Series tracking the impact supplements/events have on sleep and
    # productivity; both start at zero
    self.sleep_impact_series = pd.Series(0, index=self.date_series)
    self.productivity_impact_series = pd.Series(0, index=self.date_series)

    self.sleep_series = self._get_random_sleep_series(self.date_series)

    # Caches of commonly used Django objects, so that events needing them
    # as foreign keys can later be created with bulk_create
    self.user_activities = {}
    self.supplements = {}
def create_timeseries(starting_date, ending_date, value=0):
    """Create a daily pandas Series holding one constant value.

    Parameters
    ----------
    starting_date: str or Timestamp
        The first date of the Time Series.
    ending_date: str or Timestamp
        The last date of the Time Series.
    value: int, float
        Value given to every entry. Default is zero.
    """
    return pd.Series(value, index=pd.date_range(starting_date, ending_date))
def create_es(solver, timesteps, year):
    """
    Create a default energy system to load results into.

    *year* is a string; the hourly time index starts on January 1st of
    that year and has one entry per timestep.
    """
    objective = {"function": minimize_cost}
    simulation = es.Simulation(solver=solver,
                               timesteps=timesteps,
                               debug=False,
                               objective_options=objective)

    # Adding a time index to the energy system
    n_steps = len(timesteps)
    time_index = pd.date_range('1/1/' + year, periods=n_steps, freq='H')
    return es.EnergySystem(time_idx=time_index, simulation=simulation)
def _hourly_range(self, init_date, time_frame):
    """
    Returns DatetimeIndex trading week/s in hours.

    Starts with the trading week containing init_date (clipped to the
    current UTC hour) and keeps appending following trading weeks until
    the number of points is a multiple of 300 * int(time_frame[1:]) or
    the next week would start in the future.

    NOTE(review): block nesting was reconstructed from a source whose
    whitespace had been collapsed - confirm against the original file.
    """
    utcnow = datetime.utcnow()
    tr_wk_str, tr_wk_end = self.get_trading_week(init_date)
    # Never generate points beyond the current hour
    if tr_wk_end > utcnow:
        tr_wk_end = utcnow.replace( minute=00,second=00, microsecond=00)
    freq, interval_type, delta = self._data_frequency(time_frame)
    dth = pd.date_range(str(tr_wk_str), str(tr_wk_end), freq=freq)
    # Loop while the length is NOT a multiple of 300 * the numeric part
    # of time_frame
    while (len(dth) % (300*int(time_frame[1:])) == 0) == False:
        # Candidate start of the next week: one interval after the
        # previous week's end
        tr_wk_str = tr_wk_end + timedelta(**{interval_type: delta})
        if tr_wk_str < utcnow:
            tr_wk_str, tr_wk_end = self.get_trading_week(tr_wk_str)
            if tr_wk_end > utcnow:
                tr_wk_end = utcnow.replace( minute=00,second=00, microsecond=00)
                tr_wk_end += timedelta(hours=1)
            dth = dth.append( pd.date_range(str(tr_wk_str), str(tr_wk_end), freq=freq))
        else:
            # Next week lies in the future: stop with what we have
            break
    return dth
def _daily_range(self, daily):
    """
    Returns DatetimeIndex for daily values.

    Advances *daily* one day at a time until it reaches the current UTC
    time, appending each resulting timestamp to the index.

    NOTE(review): block nesting was reconstructed from a source whose
    whitespace had been collapsed - confirm against the original file.
    """
    max_bars = 299  # not used in this method
    utcnow = datetime.utcnow()
    dtd = pd.DatetimeIndex([])
    while daily < utcnow:
        tr_wk_str, tr_wk_end = self.get_trading_week(daily)
        # Hour of day taken from the first two characters of the trading
        # week start's time string
        hour = int(str(tr_wk_str.time())[:2])
        daily += timedelta(days=1)
        daily = daily.replace(hour=hour)
        # Past the end of this trading week: continue from the start of
        # the trading week returned for the new date
        if daily >= tr_wk_end:
            daily, tr_wk_end = self.get_trading_week(daily)
        # date_range with identical bounds yields a single timestamp
        dtd = dtd.append( pd.date_range(str(daily), str(daily)))
    return dtd
def _monthly_range(self, last_day_of_month):
    """
    Returns DatetimeIndex for monthly values.

    Starting from *last_day_of_month*, appends that timestamp and then
    steps to the last day of the following month, until the timestamp
    is no longer before the current UTC time.

    NOTE(review): block nesting was reconstructed from a source whose
    whitespace had been collapsed - confirm against the original file.
    """
    ldom = last_day_of_month
    max_bars = 299  # not used in this method
    utcnow = datetime.utcnow()
    dtm = pd.DatetimeIndex([])
    while ldom < utcnow:
        # date_range with identical bounds yields a single timestamp
        dtm = dtm.append(pd.date_range( str(ldom), str(ldom)))
        # Jump to the 1st of the month two months ahead (rolling the
        # year for November and December) ...
        if ldom.month == 12:
            ldom = ldom.replace(year=ldom.year+1, month=2, day=1)
        elif ldom.month == 11:
            ldom = ldom.replace(year=ldom.year+1, month=1, day=1)
        else:
            ldom = ldom.replace(month=ldom.month+2, day=1)
        # ... then step back one day to land on the last day of next month
        ldom -= timedelta(days=1)
        # Hour recomputed per date by new_york_offset(ldom, 22)
        ldom = ldom.replace(hour=self.new_york_offset(ldom, 22))
    return dtm
def fill_in_missing_dates(df, date_col_name, other_col):
    """
    Return a 2-column array with one row per day between the first and
    last date of *df*, using None where *df* has no value for a day.

    Column 0 holds the day as a ``'%Y-%m-%d'`` string; column 1 holds the
    *other_col* value of the row whose *date_col_name* equals that string.
    Dates in *df* must therefore be stored as ``'%Y-%m-%d'`` strings to be
    matched. When a date occurs more than once, the last row wins.
    """
    startd = df[date_col_name].values[0]
    endd = df[date_col_name].values[-1]
    # Single-argument print call: same output under Python 2 and 3.
    print('%s %s' % (startd, endd))
    idx = pd.date_range(startd, endd)

    # Lookup of value by date string (named so the builtin `dict` is
    # not shadowed)
    value_by_date = {}
    for _, row in df.iterrows():
        value_by_date[row[date_col_name]] = row[other_col]

    new_data = []
    for day in idx:
        daskey = day.to_pydatetime().strftime('%Y-%m-%d')
        # .get replaces dict.has_key, which Python 3 does not have
        new_data.append([daskey, value_by_date.get(daskey)])

    # np.vstack is the function np.row_stack was an alias of
    return np.vstack(new_data)
def fill_in_missing_dates(df, date_col_name, other_col):
    """
    Return a 2-column array with one row per day between the first and
    last date of *df*, using 0 where *df* has no value for a day.

    Column 0 holds the day as a ``'%Y-%m-%d'`` string; column 1 holds the
    *other_col* value of the row whose *date_col_name* equals that string.
    Dates in *df* must therefore be stored as ``'%Y-%m-%d'`` strings to be
    matched. When a date occurs more than once, the last row wins.
    """
    startd = df[date_col_name].values[0]
    endd = df[date_col_name].values[-1]
    # Single-argument print call: same output under Python 2 and 3.
    print('%s %s' % (startd, endd))
    idx = pd.date_range(startd, endd)

    # Lookup of value by date string (named so the builtin `dict` is
    # not shadowed)
    value_by_date = {}
    for _, row in df.iterrows():
        value_by_date[row[date_col_name]] = row[other_col]

    new_data = []
    for day in idx:
        daskey = day.to_pydatetime().strftime('%Y-%m-%d')
        # .get replaces dict.has_key, which Python 3 does not have
        new_data.append([daskey, value_by_date.get(daskey, 0)])

    # np.vstack is the function np.row_stack was an alias of
    return np.vstack(new_data)
def test_daily(self):
    """pivot_annual(ts, 'D') has one column per day-of-year slot (1..366),
    with non-leap years moved past slot 60."""
    rng = date_range('1/1/2000', '12/31/2004', freq='D')
    ts = Series(np.random.randn(len(rng)), index=rng)

    annual = pivot_annual(ts, 'D')

    # In non-leap years, days from position 60 onwards move up by one
    doy = ts.index.dayofyear
    non_leap_after_feb = (~isleapyear(ts.index.year)) & (doy >= 60)
    doy[non_leap_after_feb] += 1

    for slot in range(1, 367):
        subset = ts[doy == slot]
        subset.index = [stamp.year for stamp in subset.index]

        result = annual[slot].dropna()
        tm.assert_series_equal(result, subset, check_names=False)
        self.assertEqual(result.name, slot)

    # check leap days
    is_leap_day = (ts.index.month == 2) & (ts.index.day == 29)
    leaps = ts[is_leap_day]
    day = leaps.index.dayofyear[0]
    leaps.index = leaps.index.year
    leaps.name = 60
    tm.assert_series_equal(annual[day].dropna(), leaps)
def market_minutes_for_day(self, stamp):
    """
    Return every minute from market open to market close for *stamp*.

    The bounds come from ``self.get_open_and_close(stamp)`` and both are
    included in the returned DatetimeIndex.
    """
    market_open, market_close = self.get_open_and_close(stamp)
    # 'min' rather than 'T': the one-letter alias is deprecated since
    # pandas 2.2, while 'min' is accepted by old and new versions.
    return pd.date_range(market_open, market_close, freq='min')
def get_trading_days(start, end, trading_day=trading_day):
    """
    Return the UTC-localized trading days between two datetimes.

    Only the date part of *start* and *end* is used; *trading_day* is the
    frequency handed to ``pd.date_range``.
    """
    first_date = start.date()
    last_date = end.date()
    naive_days = pd.date_range(start=first_date, end=last_date,
                               freq=trading_day)
    return naive_days.tz_localize('UTC')
def gen_calendars(start, stop, critical_dates):
    """
    Generate calendars to use as inputs.

    Yields 1-tuples: first the full UTC daily range from start to stop
    with each subset of *critical_dates* removed, then the slice of the
    trading calendar covering the same span.
    """
    all_dates = pd.date_range(start, stop, tz='utc')
    for subset in powerset(critical_dates):
        dates_to_drop = list(subset)
        # Have to yield tuples.
        yield (all_dates.drop(dates_to_drop),)

    # Also test with the trading calendar.
    in_span = trading_days.slice_indexer(start, stop)
    yield (trading_days[in_span],)
def test_basics(self, window=10):
    """After 30 frames are added, get_current returns only the last
    *window* of them."""
    items = ['bar', 'baz', 'foo']
    minor = ['A', 'B', 'C', 'D']

    rp = MutableIndexRollingPanel(window, items, minor, cap_multiple=2)

    dates = pd.date_range('2000-01-01', periods=30, tz='utc')

    # The deque keeps only the most recent `window` dates
    major_deque = deque(maxlen=window)
    frames = {}

    for date in dates:
        frame = pd.DataFrame(np.random.randn(3, 4), index=items,
                             columns=minor)
        rp.add_frame(date, frame)
        frames[date] = frame
        major_deque.append(date)

    result = rp.get_current()
    expected = pd.Panel(frames, items=list(major_deque),
                        major_axis=items, minor_axis=minor)
    tm.assert_panel_equal(result, expected.swapaxes(0, 1))
def setUpClass(cls):
    """Create the shared frame, datashapes and loader used by the tests.

    Three dates x three sids give a nine-row frame; ``macro_df`` is the
    subset for sid 65 with the ``sid`` column dropped.
    """
    cls.dates = pd.date_range('2014-01-01', '2014-01-03')
    repeated_dates = cls.dates.repeat(3)

    sids = ord('A'), ord('B'), ord('C')
    cls.sids = sids

    df = pd.DataFrame({
        'sid': sids * 3,
        'value': (0., 1., 2., 1., 2., 3., 2., 3., 4.),
        'int_value': (0, 1, 2, 1, 2, 3, 2, 3, 4),
        'asof_date': repeated_dates,
        'timestamp': repeated_dates,
    })
    cls.df = df

    cls.dshape = dshape(""" var * { sid: ?int64, value: ?float64, int_value: ?int64, asof_date: datetime, timestamp: datetime } """)

    cls.macro_df = df[df.sid == 65].drop('sid', axis=1)

    # Same record fields without 'sid'
    macro_fields = OrderedDict(cls.dshape.measure.fields)
    del macro_fields['sid']
    cls.macro_dshape = var * Record(macro_fields)

    cls.garbage_loader = BlazeLoader()
    cls.missing_values = {'int_value': 0}