The following are 50 code examples, extracted from open source Python projects, that illustrate how to use pandas.isnull().
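Before the project examples, here is a minimal, self-contained sketch of what pandas.isnull() returns for scalars, Series, and DataFrames. The data and column names below are made up purely for illustration.

import numpy as np
import pandas as pd

# Scalars: NaN, None, and NaT are all treated as null
print(pd.isnull(np.nan))   # True
print(pd.isnull(None))     # True
print(pd.isnull(pd.NaT))   # True
print(pd.isnull(0))        # False

# Series / DataFrame: returns an element-wise boolean mask of the same shape
df = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': ['x', None, 'z']})
print(pd.isnull(df))
#        a      b
# 0  False  False
# 1   True   True
# 2  False  False

# Typical uses seen in the examples below: filtering rows and counting missing cells
print(df[~pd.isnull(df['a'])])     # keep rows where column 'a' is not null
print(pd.isnull(df).sum().sum())   # total number of null cells (2)
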
def read_data(fname):
    """ Read football-data.co.uk csv """
    data = (
        pd.read_csv(fname)
        .rename(columns={
            'HomeTeam': 'home_team',
            'AwayTeam': 'away_team',
            'FTHG': 'home_goals',
            'FTAG': 'away_goals'
        })
        .loc[lambda df: ~pd.isnull(df['home_goals'])]  # Remove future games
    )
    team_map = stan_map(pd.concat([data['home_team'], data['away_team']]))
    data['home_team_id'] = data['home_team'].replace(team_map)
    data['away_team_id'] = data['away_team'].replace(team_map)

    for col in ('home_goals', 'away_goals'):
        data[col] = [int(c) for c in data[col]]

    return data, team_map

def get_resolution(pdb_id):
    """Quick way to get the resolution of a PDB ID using the table of results from the REST service

    Returns infinity if the resolution is not available.

    Returns:
        float: resolution of a PDB ID in Angstroms

    TODO:
        - Unit test

    """
    pdb_id = pdb_id.upper()
    if pdb_id not in _property_table().index:
        raise ValueError('PDB ID not in property table')
    else:
        resolution = _property_table().ix[pdb_id, 'resolution']
        if pd.isnull(resolution):
            log.debug('{}: no resolution available, probably not an X-ray crystal structure'.format(pdb_id))
            resolution = float('inf')

    return resolution

def get_release_date(pdb_id):
    """Quick way to get the release date of a PDB ID using the table of results from the REST service

    Returns None if the release date is not available.

    Returns:
        str: Release date of a PDB ID

    """
    pdb_id = pdb_id.upper()
    if pdb_id not in _property_table().index:
        raise ValueError('PDB ID not in property table')
    else:
        release_date = _property_table().ix[pdb_id, 'releaseDate']
        if pd.isnull(release_date):
            log.debug('{}: no release date available'.format(pdb_id))
            release_date = None

    return release_date

def do_pharm_prod(drug_qid, brand_rxnorm, emea, url, brand_name):
    # write info on the pharmaceutical product page
    ref = create_ref_statement(emea, url)
    # has active substance
    s = [wdi_core.WDItemID(drug_qid, 'P3781', references=[ref])]
    # instance of
    s.append(wdi_core.WDItemID('Q28885102', 'P31', references=[ref]))  # pharmaceutical product
    s.append(wdi_core.WDItemID('Q169336', 'P31', references=[ref]))  # chemical mixture
    # emea
    s.append(wdi_core.WDExternalID(emea, 'P3637', references=[ref]))
    if not pd.isnull(brand_rxnorm):
        s.append(wdi_core.WDExternalID(str(int(brand_rxnorm)), "P3345"))

    item = wdi_core.WDItemEngine(item_name=brand_name, data=s, domain="drugs", append_value=['P3781'])
    item.set_label(brand_name)
    if item.get_description() == '':
        item.set_description("pharmaceutical product")
    wdi_helpers.try_write(item, emea, 'P3637', login, edit_summary="add 'active ingredient'")
    return item.wd_item_id

def get_wikidata_do_mesh():
    # get mesh xrefs, and including mapping relation type
    # {'DOID:0050856': {'skos:broadMatch_D019958'}}
    query = """
    select ?item ?doid ?mesh ?mesh_rt where {
      ?item wdt:P699 ?doid .
      ?item p:P486 ?mesh_s .
      ?mesh_s ps:P486 ?mesh .
      optional { ?mesh_s pq:P4390 ?mesh_rt }
    }"""
    results = WDItemEngine.execute_sparql_query(query)['results']['bindings']
    results = [{k: v['value'].replace("http://www.wikidata.org/entity/", "")
                for k, v in item.items()} for item in results]
    df = pd.DataFrame(results)
    df['mesh_rt'] = df.apply(lambda row: QID_MAP_REL_TYPE_CURIE[row.mesh_rt] + "_MESH:" + row.mesh, axis=1)
    df['_item'] = df['item']
    r = df.groupby("_item").aggregate(lambda x: set(y for y in x if not pd.isnull(y))).to_dict("records")
    wd = {list(x['doid'])[0]: x for x in r}
    wd = {k: v['mesh_rt'] for k, v in wd.items()}
    wd = {k: v for k, v in wd.items() if v}
    return wd

def correct_p1c1(rinex_dump, replace_p1_with_c1=True):
    """
    """
    if rinex_dump.recv_p1c1 not in [1, 2, 3]:
        raise ValueError('unknown receiver type {} (must be 1, 2, or 3)'.format(rinex_dump.recv_p1c1))
    for sat in sorted(set(rinex_dump.sat)):
        b = rinex_dump.p1c1_table[sat]
        if rinex_dump.recv_p1c1 == 1:
            rinex_dump.loc[rinex_dump.sat == sat, 'C1'] += b
            rinex_dump.loc[rinex_dump.sat == sat, 'P2'] += b
        elif rinex_dump.recv_p1c1 == 2:
            rinex_dump.loc[rinex_dump.sat == sat, 'C1'] += b
    if replace_p1_with_c1:
        I = PD.isnull(rinex_dump['P1'])
        rinex_dump.loc[I, 'P1'] = rinex_dump.loc[I, 'C1']
    return rinex_dump

def to_ns(x):
    """Convert input timestamps to nanoseconds (integers)

    :param x: value to be converted
    :returns: converted value
    :rtype: int
    """
    if pd.isnull(x):
        return 0
    try:
        return pd.to_datetime(x).value
    except:
        if hasattr(x, '__str__'):
            return pd.to_datetime(str(x)).value
        return 0

def check_nan(val):
    """Check input value for not a number

    :param val: value to be checked for nan
    :returns: true if nan
    :rtype: bool
    """
    if pd.isnull(val):
        return True
    if isinstance(val, str):
        val = val.strip()
        if not val or val.lower() == 'none' or val.lower() == 'nan':
            return True
    # from numpy import datetime64
    # if isinstance(val, datetime64):
    #     return val == datetime64('NaT')
    return False

def to_str(val, **kwargs):
    """Convert input to string

    :param val: value to be converted
    :returns: converted value
    :rtype: str
    """
    try:
        if pd.isnull(val):
            return kwargs['nan']
    except BaseException:
        pass
    if isinstance(val, str):
        return val
    if kwargs.get('convert_inconsistent_dtypes', True):
        if hasattr(val, '__str__'):
            return str(val)
    return kwargs['nan']

def to_int(val, **kwargs):
    """Convert input to int

    :param val: value to be evaluated
    :returns: evaluated value
    :rtype: np.int64
    """
    try:
        if pd.isnull(val):
            return kwargs['nan']
    except BaseException:
        pass
    if isinstance(val, np.int64) or isinstance(val, int):
        return np.int64(val)
    if kwargs.get('convert_inconsistent_dtypes', True):
        try:
            return np.int64(val)
        except BaseException:
            pass
    return kwargs['nan']

def bool_to_str(val, **kwargs):
    """Convert input boolean to str

    :param val: value to be evaluated
    :returns: evaluated value
    :rtype: str
    """
    try:
        if pd.isnull(val):
            return kwargs['nan']
    except BaseException:
        pass
    if isinstance(val, np.bool_) or isinstance(val, bool):
        return str(val)
    if kwargs.get('convert_inconsistent_dtypes', True):
        if hasattr(val, '__str__'):
            return str(val)
    return kwargs['nan']

def bool_to_int(val, **kwargs):
    """Convert input boolean to int

    :param val: value to be evaluated
    :returns: evaluated value
    :rtype: np.int64
    """
    # note: **kwargs added to the signature so the 'nan' fallback below works,
    # consistent with the sibling converters above
    try:
        if pd.isnull(val):
            return kwargs['nan']
    except BaseException:
        pass
    if isinstance(val, np.bool_) or isinstance(val, bool):
        return np.int64(val)
    if kwargs.get('convert_inconsistent_dtypes', False):
        try:
            return np.int64(val)
        except BaseException:
            pass
    return kwargs['nan']

def helper_impute_result_check(self, data, result):
    """
    check if the imputed result is valid
    now, check for:
    1. contains no nan anymore
    2. original non-nan value should remain the same
    """
    # check 1
    self.assertEqual(pd.isnull(result).sum().sum(), 0)

    # check 2
    # the original non-missing values must keep unchanged
    # to check, cannot use pd equals, since the imputer may convert:
    # 1 -> 1.0
    # have to do loop checking
    missing_value_mask = pd.isnull(data)
    for col_name in data:
        data_non_missing = data[~missing_value_mask[col_name]][col_name]
        result_non_missing = result[~missing_value_mask[col_name]][col_name]
        for i in data_non_missing.index:
            self.assertEqual(data_non_missing[i] == result_non_missing[i], True,
                             msg="not equals in column: {}".format(col_name))

def limits(self):
    if self.is_empty():
        return (0, 1)

    # Fall back to the range if the limits
    # are not set or if any is None or NaN
    if self._limits is not None and self.range.range is not None:
        limits = []
        if len(self._limits) == len(self.range.range):
            for l, r in zip(self._limits, self.range.range):
                value = r if pd.isnull(l) else l
                limits.append(value)
        else:
            limits = self._limits
        return tuple(limits)
    return self.range.range

def map(self, x, limits=None):
    """
    Return an array-like of x mapped to values
    from the scales palette
    """
    if limits is None:
        limits = self.limits

    n = sum(~pd.isnull(list(limits)))
    pal = self.palette(n)
    if isinstance(pal, dict):
        # manual palette with specific assignments
        pal_match = [pal[val] for val in x]
    else:
        pal = np.asarray(pal)
        pal_match = pal[match(x, limits)]
        pal_match[pd.isnull(pal_match)] = self.na_value
    return pal_match

def _mode(x, def_fill=ImputerMixin._def_fill):
    """Get the most common value in a 1d H2OFrame.
    Ties will be handled in a non-specified manner.

    Parameters
    ----------
    x : ``H2OFrame``, shape=(n_samples, 1)
        The 1d frame from which to derive the mode
    """
    idx = x.as_data_frame(use_pandas=True)[x.columns[0]].value_counts().index

    # if the most common is null, then return the next most common.
    # if there is no next common (i.e., 100% null) then we return the def_fill
    return idx[0] if not pd.isnull(idx[0]) else idx[1] if idx.shape[0] > 1 else def_fill

def get_loctype(location, date_index):
    """Returns a pandas Series of the location type for each day.

    Locations with a changetime have type *city* before that day,
    and *conflict* after it.
    """
    n_days = len(date_index)
    changetime = location.time
    if pd.isnull(changetime):
        loctype = location.location_type
    else:
        # 0:changetime, loctype = "city"
        loctype = ['city'] * int(changetime)
        # changetime:-1, loctype = "conflict"
        loctype += ['conflict'] * int(n_days - changetime)
    return pd.Series(loctype, index=date_index)

def compare_except(s1, s2, exceptions=[]):
    conc = pd.concat([s1, s2], axis=1, ignore_index=True)

    def except_apply(x):
        try:
            str1 = x[0]
            str2 = x[1]
            for ex in exceptions:
                str1 = str1.replace(ex, "")
            return jellyfish.jaro_distance(str1, str2)
        except Exception as err:
            if pd.isnull(x[0]) or pd.isnull(x[1]):
                return np.nan
            else:
                raise err

    return conc.apply(except_apply, axis=1)

def find_null_columns(df, features):
    """Locates columns in a pandas dataframe that have no values.

    Args:
        df: A pandas dataframe containing data.
        features: A list of string names of columns storing the actual data.

    Returns:
        A list of string names of the null columns.
    """
    df_len = len(df)
    bad_feats = []
    for feat in features:
        null_len = len(df[df[feat].isnull()])
        if df_len == null_len:
            bad_feats.append(feat)
    return bad_feats

def _merge_query_params(self, params, date=None):
    ret = ''
    for key, value in params.iteritems():
        if key == 'tenor' and pd.isnull(value):
            ret += 'tradeDate=' + date + ';'
        elif not pd.isnull(value):
            if key == Header.TENOR:
                py_assert(date is not None, ValueError, 'date must be given if tenor is not None')
                # unit = ''.join(re.findall('[0-9]+', params[Header.TENOR]))
                # freq = FreqType(params[Header.TENOR][len(unit):])
                ret += 'startDate=' + WIND_DATA_PROVIDER.forward_date(date, value, self.date_format) \
                       + ';endDate=' + date + ';'
            elif key == Header.FREQ and value[:3] == 'min':
                ret += ('BarSize=' + value[3:] + ';')
            else:
                ret += (key + '=' + str(value) + ';')
    ret = ret[:-1] + FactorLoader._check_industry_params(params.name)
    return ret

def _complement_bases(self, genotype):
    if pd.isnull(genotype):
        return np.nan

    complement = ''
    for base in list(genotype):
        if base == 'A':
            complement += 'T'
        elif base == 'G':
            complement += 'C'
        elif base == 'C':
            complement += 'G'
        elif base == 'T':
            complement += 'A'
    return complement

def cleanNullColumns(sheet):
    """
    Helper function to discard columns in sheets where each value in column is null.
    Accepts a DataFrame as the sheet argument.
    Returns the cleaned dataframe or an error Tuple of (False, error)
    """
    try:  # check for and remove columns with all NaNs
        for column in sheet.columns:
            if pd.isnull(sheet[column]).all():
                sheet.drop(column, axis=1, inplace=True)
        return sheet
    except Exception as e:
        return False, e

def get_isd_data(self, station, year):
    filename_format = '/pub/data/noaa/{year}/{station}-{year}.gz'
    lines = self._retreive_file_lines(filename_format, station, year)

    dates = pd.date_range("{}-01-01 00:00".format(year),
                          "{}-12-31 23:00".format(int(year) + 1),
                          freq='H', tz=pytz.UTC)
    series = pd.Series(None, index=dates, dtype=float)

    for line in lines:
        if line[87:92].decode('utf-8') == "+9999":
            temp_C = float("nan")
        else:
            temp_C = float(line[87:92]) / 10.
        date_str = line[15:27].decode('utf-8')

        # there can be multiple readings per hour, so set all to minute 0
        dt = pytz.UTC.localize(datetime.strptime(date_str, "%Y%m%d%H%M")).replace(minute=0)

        # only set the temp if it's the first encountered in the hour.
        if pd.isnull(series.ix[dt]):
            series[dt] = temp_C
    return series

def get_input_data_mask(self, input_data):
    ''' Boolean list of missing/not missing values:
        True => missing
        False => not missing
    '''
    trace_data, temp_data = input_data
    dts = []
    mask = []
    if trace_data.empty or temp_data.empty:
        return pd.Series(mask)
    for (start, energy), (p, group) in zip(
            trace_data.iteritems(), temp_data.groupby(level="period")):
        temps = group.copy()
        temps.index = temps.index.droplevel()
        daily_temps = temps.resample('D').apply(np.mean)[0]
        for i, tempF in daily_temps.iteritems():
            dts.append(i)
            mask.append(pd.isnull(energy) or pd.isnull(tempF))
    return pd.Series(mask, index=dts)

def test_multiple_records_with_gap(serializer):
    records = [
        {
            "start": datetime(2000, 1, 1, tzinfo=pytz.UTC),
            "end": datetime(2000, 1, 2, tzinfo=pytz.UTC),
            "value": 1,
        },
        {
            "start": datetime(2000, 1, 3, tzinfo=pytz.UTC),
            "end": datetime(2000, 1, 4, tzinfo=pytz.UTC),
            "value": 2,
        },
    ]
    df = serializer.to_dataframe(records)
    assert df.value[datetime(2000, 1, 1, tzinfo=pytz.UTC)] == 1
    assert not df.estimated[datetime(2000, 1, 1, tzinfo=pytz.UTC)]
    assert pd.isnull(df.value[datetime(2000, 1, 2, tzinfo=pytz.UTC)])
    assert not df.estimated[datetime(2000, 1, 2, tzinfo=pytz.UTC)]
    assert df.value[datetime(2000, 1, 3, tzinfo=pytz.UTC)] == 2
    assert not df.estimated[datetime(2000, 1, 3, tzinfo=pytz.UTC)]
    assert pd.isnull(df.value[datetime(2000, 1, 4, tzinfo=pytz.UTC)])
    assert not df.estimated[datetime(2000, 1, 4, tzinfo=pytz.UTC)]

def test_multiple_records(serializer):
    records = [
        {
            "start": datetime(2000, 1, 1, tzinfo=pytz.UTC),
            "value": 1,
        },
        {
            "start": datetime(2000, 1, 2, tzinfo=pytz.UTC),
            "value": 2,
        },
    ]
    df = serializer.to_dataframe(records)
    assert df.value[datetime(2000, 1, 1, tzinfo=pytz.UTC)] == 1
    assert not df.estimated[datetime(2000, 1, 1, tzinfo=pytz.UTC)]
    assert pd.isnull(df.value[datetime(2000, 1, 2, tzinfo=pytz.UTC)])
    assert not df.estimated[datetime(2000, 1, 2, tzinfo=pytz.UTC)]

def test_multiple_records(serializer):
    records = [
        {
            "end": datetime(2000, 1, 1, tzinfo=pytz.UTC),
            "value": 1,
        },
        {
            "end": datetime(2000, 1, 2, tzinfo=pytz.UTC),
            "value": 2,
        },
    ]
    df = serializer.to_dataframe(records)
    assert df.value[datetime(2000, 1, 1, tzinfo=pytz.UTC)] == 2
    assert not df.estimated[datetime(2000, 1, 1, tzinfo=pytz.UTC)]
    assert pd.isnull(df.value[datetime(2000, 1, 2, tzinfo=pytz.UTC)])
    assert not df.estimated[datetime(2000, 1, 2, tzinfo=pytz.UTC)]

def test_to_records(serializer):
    data = {"value": [1, np.nan], "estimated": [True, False]}
    columns = ["value", "estimated"]
    index = pd.date_range('2000-01-01', periods=2, freq='D')
    df = pd.DataFrame(data, index=index, columns=columns)
    records = serializer.to_records(df)
    assert len(records) == 2
    assert records[0]["end"] == datetime(2000, 1, 1, tzinfo=pytz.UTC)
    assert pd.isnull(records[0]["value"])
    assert not records[0]["estimated"]
    assert records[1]["end"] == datetime(2000, 1, 2, tzinfo=pytz.UTC)
    assert records[1]["value"] == 1
    assert records[1]["estimated"]

def test_get_last_traded_equity_minute(self):
    trading_calendar = self.trading_calendars[Equity]
    # Case: Missing data at front of data set, and request dt is before
    # first value.
    dts = trading_calendar.minutes_for_session(self.trading_days[0])
    asset = self.asset_finder.retrieve_asset(1)
    self.assertTrue(pd.isnull(
        self.data_portal.get_last_traded_dt(
            asset, dts[0], 'minute')))

    # Case: Data on requested dt.
    dts = trading_calendar.minutes_for_session(self.trading_days[2])
    self.assertEqual(dts[1],
                     self.data_portal.get_last_traded_dt(
                         asset, dts[1], 'minute'))

    # Case: No data on dt, but data occurring before dt.
    self.assertEqual(dts[4],
                     self.data_portal.get_last_traded_dt(
                         asset, dts[5], 'minute'))

def test_get_last_traded_future_minute(self):
    asset = self.asset_finder.retrieve_asset(10000)
    trading_calendar = self.trading_calendars[Future]
    # Case: Missing data at front of data set, and request dt is before
    # first value.
    dts = trading_calendar.minutes_for_session(self.trading_days[0])
    self.assertTrue(pd.isnull(
        self.data_portal.get_last_traded_dt(
            asset, dts[0], 'minute')))

    # Case: Data on requested dt.
    dts = trading_calendar.minutes_for_session(self.trading_days[3])
    self.assertEqual(dts[1],
                     self.data_portal.get_last_traded_dt(
                         asset, dts[1], 'minute'))

    # Case: No data on dt, but data occurring before dt.
    self.assertEqual(dts[4],
                     self.data_portal.get_last_traded_dt(
                         asset, dts[5], 'minute'))

def sendData(con, df):
    cursor = con.cursor()
    cols = df.columns.tolist()
    values = df.values
    for vals in values:
        for i, val in enumerate(vals):
            if pd.isnull(val):
                vals[i] = None
        query = 'INSERT INTO {} ({}) VALUES ({})'.format(
            SEND_TABLE,
            ','.join(['"{}"'.format(x) for x in cols]),
            ','.join(['%s'] * len(cols)))
        cursor.execute(query, tuple(vals))
    con.commit()
    cursor.close()

def __convert_survey_to_sequence(self):
    s = self.__beamline
    if 'LENGTH' not in s:
        s['LENGTH'] = np.nan
    offset = s['ORBIT_LENGTH'][0] / 2.0
    if pd.isnull(offset):
        offset = 0
    self.__beamline['AT_CENTER'] = pd.DataFrame(
        npl.norm(
            [
                s['X'].diff().fillna(0.0),
                s['Y'].diff().fillna(0.0)
            ],
            axis=0
        ) - (
            s['LENGTH'].fillna(0.0) / 2.0 - s['ORBIT_LENGTH'].fillna(0.0) / 2.0
        ) + (
            s['LENGTH'].shift(1).fillna(0.0) / 2.0 - s['ORBIT_LENGTH'].shift(1).fillna(0.0) / 2.0
        )).cumsum() / 1000.0 + offset
    self.__converted_from_survey = True

def split_rbends(line, n=20):
    split_line = pd.DataFrame()
    for index, row in line.iterrows():
        if row['CLASS'] == 'RBEND' and pd.isnull(row.get('SPLIT')):
            angle = row['ANGLE'] / n
            length = row['L'] / n
            for i in range(0, n):
                row = row.copy()
                row.name = index + "_{}".format(i)
                row['SPLIT'] = True
                row['ANGLE'] = angle
                row['L'] = length
                split_line = split_line.append(row)
        else:
            split_line = split_line.append(row)
    split_line[['THICK']] = split_line[['THICK']].applymap(bool)
    return split_line

def element_to_mad(e):
    """Convert a pandas.Series representation onto a MAD-X sequence element."""
    if e.CLASS not in SUPPORTED_CLASSES:
        return ""
    mad = "{}: {}, ".format(e.name, e.CLASS)
    if e.get('BENDING_ANGLE') is not None and not np.isnan(e['BENDING_ANGLE']):
        mad += f"ANGLE={e['BENDING_ANGLE']},"
    elif e.get('ANGLE') is not None and not np.isnan(e['ANGLE']):
        mad += f"ANGLE={e.get('ANGLE', 0)},"
    else:
        # Angle property not supported by the element or absent
        mad += ""
    mad += ', '.join(["{}={}".format(p, e[p]) for p in SUPPORTED_PROPERTIES
                      if pd.notnull(e.get(p, None))])
    if pd.notnull(e['LENGTH']) and e['LENGTH'] != 0.0:
        mad += ", L={}".format(e['LENGTH'])
    if pd.notnull(e.get('APERTYPE', None)):
        mad += ", APERTURE={}".format(str(e['APERTURE']).strip('[]'))
    if pd.notnull(e.get('PLUG')) and pd.notnull(e.get('CIRCUIT')) and pd.isnull(e.get('VALUE')):
        mad += ", {}:={}".format(e['PLUG'], e['CIRCUIT'])
    if pd.notnull(e.get('PLUG')) and pd.notnull(e.get('VALUE')):
        mad += ", {}={}".format(e['PLUG'], e['VALUE'])
    mad += ", AT={}".format(e['AT_CENTER'])
    mad += ";"
    return mad

def _validate_pandas_index(index, label):
    # `/` and `\0` aren't permitted because they are invalid filename
    # characters on *nix filesystems. The remaining values aren't permitted
    # because they *could* be misinterpreted by a shell (e.g. `*`, `|`).
    illegal_chars = ['/', '\0', '\\', '*', '<', '>', '?', '|', '$']
    chars_for_msg = ", ".join("%r" % i for i in illegal_chars)
    illegal_chars = set(illegal_chars)

    # First check the index dtype and ensure there are no null values
    if index.dtype_str not in ['object', 'str'] or pd.isnull(index).any():
        msg = "Non-string Metadata %s values detected" % label
        raise ValueError(invalid_metadata_template % msg)

    # Then check for invalid characters along index
    for value in index:
        if not value or illegal_chars & set(value):
            msg = ("Invalid characters (e.g. %s) or empty ID detected in "
                   "metadata %s: %r" % (chars_for_msg, label, value))
            raise ValueError(invalid_metadata_template % msg)

    # Finally, ensure unique values along index
    if len(index) != len(set(index)):
        msg = "Duplicate Metadata %s values detected" % label
        raise ValueError(invalid_metadata_template % msg)

def isnull(value):
    """
    Return true if value is NaN or None.

    >>> import numpy as np
    >>> ReadPandas.isnull(np.NaN)
    True
    >>> ReadPandas.isnull(None)
    True
    >>> ReadPandas.isnull(0)
    False

    :param value: Value to test
    :return: Return true for NaN or None values.
    :rtype: bool
    """
    return pd.isnull(value)

def clean_data(self):
    # load qualif and race data
    df_qual = self.load_qualif_data()
    df_races = self.load_results_data()

    # remove Japan as no data for 2015 race
    df_qual = self.del_japan15(df_qual)
    df_races = self.del_japan15(df_races)

    # create unique id
    df_qual = self.unique_id(df_qual)
    df_races = self.unique_id(df_races)

    # merge the results
    df_out = df_races.merge(
        df_qual, on='id_', how='inner', suffixes=('', '_qual'))
    df_out = df_out[pd.isnull(df_out.q_min) == False]
    print df_out.shape
    return df_out.reset_index(drop=1), df_races.reset_index(drop=1), df_qual.reset_index(drop=1)

# load the data

def Xy_matrix(df_qual_and_race, columns, df_wet):
    df_q_r_out = df_qual_and_race.loc[:, columns].reset_index(drop=1)
    df_q_r_out = df_q_r_out[(pd.isnull(df_q_r_out[y_label]) == False) &
                            (pd.isnull(df_q_r_out.q_min) == False)].reset_index(drop=1)

    X = df_q_r_out.loc[:, ['q_min', 'position_qual', 'raceId', 'circuitId',
                           'driverId', 'year', 'round', 'dob', y_label]]

    # birth year / mo
    X['birth_year'] = map(lambda x: int(x.year), df_q_r_out['dob'])
    X['birth_mo'] = map(lambda x: int(x.month), df_q_r_out['dob'])
    X.drop('dob', axis=1, inplace=1)

    # adding wet as a feature
    # weather data
    df_races = d['races'].copy()
    # df_races.head()
    X = X.merge(df_wet.drop(['circuitId'], 1), how='left', on=['year', 'round'])

    # pit stop
    df_pits = d['pitStops'].groupby(['raceId', 'driverId'], as_index=0)[
        'milliseconds'].sum()
    df_pits.reset_index(drop=1, inplace=1)
    X_y = X.merge(df_pits, how='left', on=['raceId', 'driverId'])
    X_y.fillna(0, inplace=1)
    return X_y

def differences(self, name, values, ref_values, precision):
    """
    Returns a short summary of where values differ, for two columns.
    """
    for i, val in enumerate(values):
        refval = ref_values[i]
        if val != refval and not (pd.isnull(val) and pd.isnull(refval)):
            stop = self.ndifferences(values, ref_values, i)
            summary_vals = self.sample_format(values, i, stop, precision)
            summary_ref_vals = self.sample_format(ref_values, i, stop, precision)
            return 'From row %d: [%s] != [%s]' % (i + 1, summary_vals, summary_ref_vals)
    if values.dtype != ref_values.dtype:
        return 'Different types'
    else:
        return 'But mysteriously appear to be identical!'

def pandas_tdda_type(x):
    dt = getattr(x, 'dtype', None)
    if type(x) == str or dt == np.dtype('O'):
        return 'string'
    dts = str(dt)
    if type(x) == bool or 'bool' in dts:
        return 'bool'
    if type(x) in (int, long) or 'int' in dts:
        return 'int'
    if type(x) == float or 'float' in dts:
        return 'real'
    if (type(x) == datetime.datetime or 'datetime' in dts
            or type(x) == pandas_Timestamp):
        return 'date'
    if x is None or (not isinstance(x, pd.core.series.Series)
                     and pd.isnull(x)):
        return 'null'
    # Everything else is other, for now, including compound types,
    # unicode in Python2, bytes in Python3 etc.
    return 'other'

def _predict(self, treenode, X):
    """
    predict a single sample
    note that X is a tuple (index, pandas.core.series.Series) from df.iterrows()
    """
    if treenode.is_leaf:
        return treenode.leaf_score
    elif pd.isnull(X[1][treenode.feature]):
        if treenode.nan_direction == 0:
            return self._predict(treenode.left_child, X)
        else:
            return self._predict(treenode.right_child, X)
    elif X[1][treenode.feature] < treenode.threshold:
        return self._predict(treenode.left_child, X)
    else:
        return self._predict(treenode.right_child, X)

def ffill_buffer_from_prior_values(freq,
                                   field,
                                   buffer_frame,
                                   digest_frame,
                                   pv_frame,
                                   raw=False):
    """
    Forward-fill a buffer frame, falling back to the end-of-period values of a
    digest frame if the buffer frame has leading NaNs.
    """
    # convert to ndarray if necessary
    digest_values = digest_frame
    if raw and isinstance(digest_frame, pd.DataFrame):
        digest_values = digest_frame.values

    buffer_values = buffer_frame
    if raw and isinstance(buffer_frame, pd.DataFrame):
        buffer_values = buffer_frame.values

    nan_sids = pd.isnull(buffer_values[0])
    if np.any(nan_sids) and len(digest_values):
        # If we have any leading nans in the buffer and we have a non-empty
        # digest frame, use the oldest digest values as the initial buffer
        # values.
        buffer_values[0, nan_sids] = digest_values[-1, nan_sids]

    nan_sids = pd.isnull(buffer_values[0])
    if np.any(nan_sids):
        # If we still have leading nans, fall back to the last known values
        # from before the digest.
        key_loc = pv_frame.index.get_loc((freq.freq_str, field))
        filler = pv_frame.values[key_loc, nan_sids]
        buffer_values[0, nan_sids] = filler

    if raw:
        filled = ffill(buffer_values)
        return filled

    return buffer_frame.ffill()

def ffill_digest_frame_from_prior_values(freq,
                                         field,
                                         digest_frame,
                                         pv_frame,
                                         raw=False):
    """
    Forward-fill a digest frame, falling back to the last known prior values
    if necessary.
    """
    # convert to ndarray if necessary
    values = digest_frame
    if raw and isinstance(digest_frame, pd.DataFrame):
        values = digest_frame.values

    nan_sids = pd.isnull(values[0])
    if np.any(nan_sids):
        # If we have any leading nans in the frame, use values from pv_frame to
        # seed values for those sids.
        key_loc = pv_frame.index.get_loc((freq.freq_str, field))
        filler = pv_frame.values[key_loc, nan_sids]
        values[0, nan_sids] = filler

    if raw:
        filled = ffill(values)
        return filled

    return digest_frame.ffill()

def combine_water_heights(in_data):
    ''' Combine median and average water heights

    Create a column of water heights in input data frame using Median Water Depth
    by default, but fills in missing data using average values

    @param in_data: Input water heights data
    '''
    if 'Mean Water Depth' in in_data.columns and 'Median Water Depth' in in_data.columns:
        # replacing all null median data with mean data
        median_null_index = pd.isnull(in_data.loc[:, 'Median Water Depth'])
        in_data.loc[:, 'Combined Water Depth'] = in_data.loc[:, 'Median Water Depth']

        # Check if there is any replacement data available
        if (~pd.isnull(in_data.loc[median_null_index, 'Mean Water Depth'])).sum() > 0:
            in_data.loc[median_null_index, 'Combined Water Depth'] = \
                in_data.loc[median_null_index, 'Mean Water Depth']
    elif 'Mean Water Depth' in in_data.columns and 'Median Water Depth' not in in_data.columns:
        in_data.loc[:, 'Combined Water Depth'] = in_data.loc[:, 'Mean Water Depth']
    elif 'Mean Water Depth' not in in_data.columns and 'Median Water Depth' in in_data.columns:
        in_data.loc[:, 'Combined Water Depth'] = in_data.loc[:, 'Median Water Depth']
    else:
        raise ValueError("in_data needs either 'Mean Water Depth' or 'Median Water Depth' or both")

def CONV(self, param):
    df = pd.DataFrame(index=param[0].index)
    df['X'] = param[0]
    df['W'] = param[1]

    class Convolution:
        def __init__(self, N):
            self.N = N
            self.q = deque([], self.N)
            self.tq = deque([], self.N)
            self.s = 0
            self.t = 0

        def handleInput(self, row):
            if len(self.q) < self.N:
                if pd.isnull(row['W']) or pd.isnull(row['X']):
                    return np.NaN
                self.q.append(row['W'] * row['X'])
                self.tq.append(row['W'])
                self.s += row['W'] * row['X']
                self.t += row['W']
                return np.NaN
            ret = self.s / self.t
            self.s -= self.q[0]
            self.t -= self.tq[0]
            delta_s = row['W'] * row['X']
            delta_t = row['W']
            self.s += delta_s
            self.t += delta_t
            self.q.append(delta_s)
            self.tq.append(delta_t)
            return ret

    conv = Convolution(param[2])
    result = df.apply(conv.handleInput, axis=1, reduce=True)
    return result

def build_strain_specific_models(self, save_models=False):
    """Using the orthologous genes matrix, create and modify the strain specific models
    based on if orthologous genes exist.

    Also store the sequences directly in the reference GEM-PRO protein sequence
    attribute for the strains.
    """
    if len(self.df_orthology_matrix) == 0:
        raise RuntimeError('Empty orthology matrix')

    # Create an emptied copy of the reference GEM-PRO
    for strain_gempro in tqdm(self.strains):
        log.debug('{}: building strain specific model'.format(strain_gempro.id))

        # For each genome, load the metabolic model or genes from the reference GEM-PRO
        logging.disable(logging.WARNING)
        if self._empty_reference_gempro.model:
            strain_gempro.load_cobra_model(self._empty_reference_gempro.model)
        elif self._empty_reference_gempro.genes:
            strain_gempro.genes = [x.id for x in self._empty_reference_gempro.genes]
        logging.disable(logging.NOTSET)

        # Get a list of genes which do not have orthology in the strain
        not_in_strain = self.df_orthology_matrix[pd.isnull(
            self.df_orthology_matrix[strain_gempro.id])][strain_gempro.id].index.tolist()

        # Mark genes non-functional
        self._pare_down_model(strain_gempro=strain_gempro, genes_to_remove=not_in_strain)

        # Load sequences into the base and strain models
        self._load_strain_sequences(strain_gempro=strain_gempro)

        if save_models:
            cobra.io.save_json_model(
                model=strain_gempro.model,
                filename=op.join(self.model_dir, '{}.json'.format(strain_gempro.id)))
            strain_gempro.save_pickle(op.join(self.model_dir, '{}_gp.pckl'.format(strain_gempro.id)))

    log.info('Created {} new strain-specific models and loaded in sequences'.format(len(self.strains)))

def __ApplyOHE(cls, data, d_feat):
    """"""
    n = len(data)
    result = np.zeros((n, len(d_feat)), dtype='int8')

    ##
    d_stat = {}
    for i in range(n):
        for col in cls.CategoryCols:
            v = data.ix[i, col]
            if(col not in d_stat):
                d_stat[col] = {}
            if(pd.isnull(v)):
                result[i, d_feat['%s:missing' % col]] = 1
                if('missing' in d_stat[col]):
                    d_stat[col]['missing'] += 1
                else:
                    d_stat[col]['missing'] = 1
            elif('%s:%s' % (col, v) in d_feat):
                result[i, d_feat['%s:%s' % (col, v)]] = 1
                if('hit' in d_stat[col]):
                    d_stat[col]['hit'] += 1
                else:
                    d_stat[col]['hit'] = 1
            else:
                result[i, d_feat['%s:less' % col]] = 1
                if('less' in d_stat[col]):
                    d_stat[col]['less'] += 1
                else:
                    d_stat[col]['less'] = 1

    ## check
    for col in d_stat:
        if(np.sum(list(d_stat[col].values())) != n):
            print('Encoding for column %s error, %d : %d. ' % (col, np.sum(list(d_stat[col].values())), n))

    return result