The following code examples, extracted from open-source Python projects, illustrate how to use pandas.merge().
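As a primer before the project code, here is a minimal, self-contained sketch (toy data, hypothetical column names) of the pd.merge calls these examples build on:

import pandas as pd

users = pd.DataFrame({'user_id': [1, 2, 3], 'name': ['ann', 'bob', 'cat']})
orders = pd.DataFrame({'user_id': [1, 1, 3], 'amount': [10, 20, 5]})

# inner join (the default): keeps only user_ids present in both frames
inner = pd.merge(users, orders, on='user_id')

# left join: keeps every user; amount is NaN where no order exists
left = pd.merge(users, orders, on='user_id', how='left')

# differently named keys: join users.user_id against orders2.uid
orders2 = orders.rename(columns={'user_id': 'uid'})
both = pd.merge(users, orders2, left_on='user_id', right_on='uid')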
def __init__(self, up_basket, up_r_basket=None, up_his=None):
    if (up_r_basket is not None) and (up_his is not None):
        self.is_reordered_included = True
    else:
        self.is_reordered_included = False
    up_basket['num_baskets'] = up_basket.basket.apply(len)
    self.user_id = list(up_basket.user_id)
    self.num_baskets = [int(n) for n in list(up_basket.num_baskets)]
    self.basket = [[[int(p) for p in b] for b in u] for u in list(up_basket.basket)]
    if self.is_reordered_included is True:
        up_basket = pd.merge(up_basket, up_r_basket, on=['user_id'], how='left')
        up_basket = pd.merge(up_basket, up_his, on=['user_id'], how='left')
        self.reorder_basket = [[[int(p) for p in b] for b in u] for u in list(up_basket.reorder_basket)]
        self.history_item = [[[int(p) for p in b] for b in u] for u in list(up_basket.history_items)]
def insert_actor_ids(commit_frame, actor_frame, drop_name_email=True):
    actor_columns = ['author_name', 'author_email',
                     'committer_name', 'committer_email']
    cf = commit_frame[actor_columns]
    af = actor_frame[['name', 'email', 'actor_id']]
    author = pd.merge(cf, af, left_on=actor_columns[:2],
                      right_on=('name', 'email'), how='left')['actor_id']
    committer = pd.merge(cf, af, left_on=actor_columns[2:],
                         right_on=('name', 'email'), how='left')['actor_id']
    commit_frame.insert(3, 'author', author)
    commit_frame.insert(4, 'committer', committer)
    if drop_name_email:
        commit_frame.drop(actor_columns, axis=1, inplace=True)
    return commit_frame
def eval(name, clip=False, bar=0.9):
    base = pd.read_csv('../input/stage1_solution_filtered.csv')
    base['Class'] = np.argmax(base[['class%d' % i for i in range(1, 10)]].values, axis=1)
    sub = pd.read_csv(name)
    # sub = pd.merge(sub, base[['ID', 'Class']], on="ID", how='right')
    # print(sub.head())
    y = base['Class'].values
    yp = sub[['class%d' % i for i in range(1, 10)]].values
    if clip:
        yp = np.clip(yp, (1.0 - bar) / 8, bar)
        yp = yp / np.sum(yp, axis=1).reshape([yp.shape[0], 1])
    print(name, cross_entropy(y, yp), multiclass_log_loss(y, yp))
    for i in range(9):
        y1 = y[y == i]
        yp1 = yp[y == i]
        print(i, y1.shape, cross_entropy(y1, yp1), multiclass_log_loss(y1, yp1))
def replace(s, n):
    seen = pd.read_csv(s)
    unseen = pd.read_csv(n)
    te = pd.read_csv('../input/stage2_test_variants.csv')
    tr = pd.read_csv('../input/training_variants')
    unseen = pd.merge(unseen, te, on='ID', how='right')
    seen = pd.merge(seen, te, on='ID', how='right')
    mask = seen.Gene.isin(tr.Gene)
    cols = ['class%d' % i for i in range(1, 10)]
    seen.loc[~mask, cols] = 0
    mask = unseen.Gene.isin(tr.Gene)
    unseen.loc[mask, cols] = 0
    assert (unseen['ID'] == seen['ID']).all()
    seen[cols] = seen[cols] + unseen[cols]
    seen[cols + ['ID']].to_csv('mix.csv', index=False)
def user_info(df):
    grouped = df[[user_id, power_consumption]].groupby([user_id], as_index=False)
    user_power_mean = grouped.mean()
    user_power_median = grouped.median()
    user_power_var = grouped.var()
    user_power_max = grouped.max()
    user_power_min = grouped.min()
    user_power_mean = user_power_mean.rename(columns={power_consumption: 'user_power_mean'})
    user_power_median = user_power_median.rename(columns={power_consumption: 'user_power_median'})
    user_power_var = user_power_var.rename(columns={power_consumption: 'user_power_var'})
    user_power_max = user_power_max.rename(columns={power_consumption: 'user_power_max'})
    user_power_min = user_power_min.rename(columns={power_consumption: 'user_power_min'})
    return pd.merge(user_power_mean, user_power_median).merge(user_power_var).\
        merge(user_power_max).merge(user_power_min)
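A note on the chained merges above: when `on` is omitted, pd.merge joins on all columns the two frames have in common (here the user-id column). A minimal sketch of that default, with made-up data:

import pandas as pd

a = pd.DataFrame({'uid': [1, 2], 'mean': [3.0, 4.0]})
b = pd.DataFrame({'uid': [1, 2], 'median': [2.5, 4.5]})

# no `on` given: pandas joins on the shared column 'uid'
merged = pd.merge(a, b)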
def user_info_m_p(df):
    date2 = df[record_date].map(lambda x: str2time(x)).max()
    date1 = datetime.datetime(date2.year, date2.month, 1).date()
    grouped = DataView(df).filter_by_record_date2(date1, date2)[
        [user_id, 'month', power_consumption]].groupby([user_id, 'month'], as_index=False)
    user_power_mean_m = grouped.mean()
    user_power_median_m = grouped.median()
    user_power_var_m = grouped.var()
    user_power_max_m = grouped.max()
    user_power_min_m = grouped.min()
    user_power_mean_m = user_power_mean_m.rename(columns={power_consumption: 'user_power_mean_m_p'})
    user_power_median_m = user_power_median_m.rename(columns={power_consumption: 'user_power_median_m_p'})
    user_power_var_m = user_power_var_m.rename(columns={power_consumption: 'user_power_var_m_p'})
    user_power_max_m = user_power_max_m.rename(columns={power_consumption: 'user_power_max_m_p'})
    user_power_min_m = user_power_min_m.rename(columns={power_consumption: 'user_power_min_m_p'})
    return pd.merge(user_power_mean_m, user_power_median_m).merge(user_power_var_m).\
        merge(user_power_max_m).merge(user_power_min_m).drop('month', axis=1)
def LoadFromTextFile(InputDir):
    ## raw data
    TrainData = pd.read_csv('%s/train_2016_v2.csv' % InputDir,
                            parse_dates=['transactiondate'], header=0)
    TestData = pd.read_csv('%s/sample_submission.csv' % InputDir, header=0)
    TestData['parcelid'] = TestData['ParcelId']
    TestData.drop('ParcelId', axis=1, inplace=True)
    PropertyData = pd.read_csv('%s/properties_2016.csv' % InputDir, header=0)
    for c, dtype in zip(PropertyData.columns, PropertyData.dtypes):
        if dtype == np.float64:
            PropertyData[c] = PropertyData[c].astype(np.float32)
    ## join dynamic data with static data
    TrainData = pd.merge(TrainData, PropertyData, how='left', on='parcelid')
    TestData = pd.merge(TestData, PropertyData, how='left', on='parcelid')
    return TrainData, TestData

## class method, save data with pkl format
def planetAndStar(how='inner'):
    """Read the SWEET-Cat and ExoplanetEU databases and merge them.

    Input
    -----
    how : str (default: 'inner')
        How to merge the two DataFrames. See pd.merge for documentation

    Output
    ------
    d : pd.DataFrame
        The merged DataFrame
    c : list
        The columns that can be used for plotting
    """
    df, columns = readSC()
    deu = readExoplanetEU()
    cols = ['stName', 'plMass', 'plRadius', 'period', 'sma', 'eccentricity',
            'inclination', 'discovered', 'dist', 'b',
            'mag_v', 'mag_i', 'mag_j', 'mag_h', 'mag_k', 'plDensity']
    d = pd.merge(df, deu, left_on='Star', right_on='stName', how=how)
    d['radius'] = list(map(stellar_radius, d['mass'], d['logg']))
    d['teq0'] = d.teff * np.sqrt((d.radius * 700000) / (2 * d.sma * 150000000))
    c = columns + cols[1:]
    return d, c
def initialize(self):
    """Perform basic checks on provided attributes."""
    if isinstance(self.on, str):
        self.on = [self.on]
    elif not isinstance(self.on, list):
        raise Exception('on is not a list of strings. Exit.')
    assert len(self.on) > 0, 'not specified on which keys to merge.'
    assert self.how in ('inner', 'outer', 'left', 'right'), \
        'how to merge not specified correctly.'
    assert len(self.output_collection), 'output_collection not specified.'
    # add `on` and `how` back to kwargs, so they are picked up by pandas.
    if self.on is not None:
        self.kwargs['on'] = self.on
    if self.how is not None:
        self.kwargs['how'] = self.how
    self.log().info('kwargs passed on to pandas merge function are: %s' % self.kwargs)
    return StatusCode.Success
def merge(self, other):
    """
    Merge overlapping guides

    For example::

        from ggplot import *
        gg = ggplot(aes(x='cut', fill='cut', color='cut'), data=diamonds)
        gg + stat_bin()

    This would create similar guides for fill and color where
    only a single guide would do
    """
    self.key = pd.merge(self.key, other.key)
    duplicated = set(self.override_aes) & set(other.override_aes)
    if duplicated:
        warn("Duplicated override_aes is ignored.")
    self.override_aes.update(other.override_aes)
    for ae in duplicated:
        self.override_aes.pop(ae)
    return self
def cross_join(df1, df2):
    """
    Return a dataframe that is a cross between dataframes
    df1 and df2

    ref: https://github.com/pydata/pandas/issues/5401
    """
    if len(df1) == 0:
        return df2
    if len(df2) == 0:
        return df1
    # Add as lists so that the new index keeps the items in
    # the order that they are added together
    all_columns = pd.Index(list(df1.columns) + list(df2.columns))
    df1['key'] = 1
    df2['key'] = 1
    return pd.merge(df1, df2, on='key').loc[:, all_columns]
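For comparison, pandas 1.2+ ships a native cross join that replaces the dummy-key trick above; a minimal sketch:

import pandas as pd

df1 = pd.DataFrame({'a': [1, 2]})
df2 = pd.DataFrame({'b': ['x', 'y']})

# how='cross' builds the full cartesian product: 2 x 2 = 4 rows
result = pd.merge(df1, df2, how='cross')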
def enrich(self, column, projects):
    """
    This method adds a new column named 'project' that contains
    information about the project that the event in 'column' belongs to.

    :param column: column with information related to the project
    :type column: string
    :param projects: information about item - project
    :type projects: pandas.DataFrame
    :returns: original data frame with a new column named 'project'
    :rtype: pandas.DataFrame
    """
    if column not in self.data.columns:
        return self.data
    self.data = pandas.merge(self.data, projects, how='left', on=column)
    return self.data
def enrich(self, columns):
    """
    Merges the original dataframe with corresponding entity uuids based
    on the given columns. Also merges other additional information
    associated to uuids provided in the uuids dataframe, if any.

    :param columns: columns to match for merging
    :type columns: string array
    :return: original dataframe with at least one new column:
     * uuid: identity unique identifier
    :rtype: pandas.DataFrame
    """
    for column in columns:
        if column not in self.data.columns:
            return self.data
    self.data = pandas.merge(self.data, self.uuids_df, how='left', on=columns)
    self.data = self.data.fillna("notavailable")
    return self.data
def touch_open_indexes(self):
    """Flag codes whose quotes have triggered the open-position indicator."""
    quo = self.quotation
    assert isinstance(quo, pd.DataFrame)
    if self.open_indexes == self.INDEXES_TYPE_MA:
        # moving-average indicator
        ma = pd.merge(self.codes[["open_ma", "close"]], quo[["bid1"]],
                      left_index=True, right_index=True)
        # open a position when bid1 rises above the open moving average
        ma["open_position"] = ma.bid1 > ma.open_ma
        self.codes.open_position = ma.open_position
        if self.debug:
            open_num = ma.open_position.value_counts()[True]
            self.log.debug("%s codes triggered the open signal" % open_num)
def get_sell_order(self):
    """Build the list of codes to sell, ordered by sell priority."""
    codes = self.codes[self.codes.times != self.codes.exc_times]
    codes["change"] = codes.exc_times - codes.times
    sell_codes = codes[codes.change < 0]
    # attach the latest quotation
    sell_codes = pd.merge(sell_codes, self.quotation,
                          left_index=True, right_index=True)
    # sell the most over-held codes first
    sell_priority_index = sell_codes[sell_codes.exc_times < -1].times.argsort()[::-1]
    sell_codes = sell_codes.take(sell_priority_index)
    return sell_codes
def export_data_unresolved():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_work_view = db.get_work_view()
    connection = db_work_view._db_connection

    df_clickstream = pn.read_csv(
        '/home/ddimitrov/data/enwiki201608_unresolved_redirects/2016_08_clickstream_unresolved.tsv',
        sep='\t', error_bad_lines=False)
    df_clickstream['prev'] = df_clickstream['prev'].str.replace('_', ' ')
    df_clickstream['curr'] = df_clickstream['curr'].str.replace('_', ' ')
    df_clickstream['curr_unresolved'] = df_clickstream['curr_unresolved'].str.replace('_', ' ')

    df_redirects_candidates = pn.read_sql('select * from redirects_candidates_sample', connection)

    sample_unresoleved = pn.merge(
        df_redirects_candidates, df_clickstream, how='left',
        left_on=['source_article_name', 'target_article_name'],
        right_on=['prev', 'curr_unresolved'])
    sample_unresoleved['n'].fillna(0, inplace=True)

    sample_unresoleved.to_csv(
        '/home/ddimitrov/data/enwiki201608_unresolved_redirects/data_unresolved.tsv',
        sep='\t', encoding="utf-8")
def pickle_correlations_zeros_january():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id from link_features', conn)
    print 'loaded links'
    df2 = pd.read_sql('select prev_id, curr_id, counts from clickstream_derived_en_201501 where link_type_derived= "internal-link";', conn)
    print 'loaded counts'
    result = pd.merge(df, df2, how='left',
                      left_on=['source_article_id', 'target_article_id'],
                      right_on=['prev_id', 'curr_id'])
    print 'merged counts'
    print result
    article_counts = result.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    article_counts['counts'].fillna(0.0, inplace=True)
    print article_counts
    print 'write to file'
    article_counts[["target_article_id", "counts"]].to_csv(
        TMP + 'january_article_counts.tsv', sep='\t', index=False)
def antiSMASH_to_dataFrame(content):
    """
    Extract an antiSMASH file as a pandas.DataFrame
    """
    parsed = parse_antiSMASH(content)
    output = pd.DataFrame()
    for cs in parsed['SignificantHits']:
        clusterSubject = parsed['SignificantHits'][cs].copy()
        df = pd.merge(
            pd.DataFrame(clusterSubject['BlastHit']),
            pd.DataFrame(clusterSubject['TableGenes']),
            on='subject_gene', how='outer')
        del(clusterSubject['BlastHit'])
        del(clusterSubject['TableGenes'])
        for v in clusterSubject:
            df[v] = clusterSubject[v]
        output = output.append(df, ignore_index=True)
    return output
def get_basicroom_feature(data):
    t1 = data[['orderid', 'basicroomid']].drop_duplicates()[['basicroomid']]
    t1['basicroomid_unique_order_cnt'] = 1
    t1 = t1.groupby(['basicroomid']).agg('sum').reset_index()

    t2 = data[['orderdate', 'basicroomid']].drop_duplicates()[['basicroomid']]
    t2['basicroomid_unique_orderdate_cnt'] = 1
    t2 = t2.groupby(['basicroomid']).agg('sum').reset_index()

    t3 = data[['uid', 'basicroomid']].drop_duplicates()[['basicroomid']]
    t3['basicroomid_unique_user_cnt'] = 1
    t3 = t3.groupby(['basicroomid']).agg('sum').reset_index()

    t4 = data[['basicroomid', 'roomid']].drop_duplicates()[['basicroomid']]
    t4['basicroomid_unique_roomid_cnt'] = 1
    t4 = t4.groupby(['basicroomid']).agg('sum').reset_index()

    # basicroom_feature = pd.merge(t, t1, on='basicroomid')
    basicroom_feature = pd.merge(t1, t2, on='basicroomid')
    basicroom_feature = pd.merge(basicroom_feature, t3, on='basicroomid')
    basicroom_feature = pd.merge(basicroom_feature, t4, on='basicroomid')
    return basicroom_feature
def preprocess(rawdatapath):
    rawfacts = get_raw(os.path.join(rawdatapath, "facts.json"))
    rawmetadata = get_raw(os.path.join(rawdatapath, "metadata.json"))
    parsed_facts = rawfacts.join(pd.DataFrame(rawfacts["_source"].to_dict()).T).drop("_source", axis=1)
    parsed_metadata = rawmetadata.join(pd.DataFrame(rawmetadata["_source"].to_dict()).T).drop("_source", axis=1)
    parsed_metadata.rename(columns={"title": "articleTitle"}, inplace=True)
    clean(parsed_facts)
    clean(parsed_metadata)
    parsed_metadata = parsed_metadata.join(pd.DataFrame(parsed_metadata["journalInfo"].to_dict()).T).drop("journalInfo", axis=1)
    clean(parsed_metadata)
    parsed_metadata = parsed_metadata.join(pd.DataFrame(parsed_metadata["journal"].to_dict()).T).drop("journal", axis=1)
    clean(parsed_metadata)
    df = pd.merge(parsed_facts, parsed_metadata, how="inner",
                  on="cprojectID", suffixes=('_fact', '_meta'))
    df.rename(columns={"title": "journalTitle"}, inplace=True)
    df["sourcedict"] = get_dictionary(df)
    df["term"] = df["term"].map(str.lower)
    df["wikidataID"] = get_wikidataIDs(df)
    df.drop_duplicates("_id_fact", inplace=True)
    return df
def get_hs300s():
    """
    Fetch the current HS300 (CSI 300) index constituents and their weights.

    Return
    --------
    DataFrame
        code   : stock code
        name   : stock name
        date   : date
        weight : index weight
    """
    from tushare.stock.fundamental import get_stock_basics
    try:
        wt = pd.read_excel(ct.HS300_CLASSIFY_URL_FTP % (ct.P_TYPE['ftp'],
                                                        ct.DOMAINS['idxip'],
                                                        ct.PAGES['hs300w']),
                           parse_cols=[0, 3, 6])
        wt.columns = ct.FOR_CLASSIFY_W_COLS
        wt['code'] = wt['code'].map(lambda x: str(x).zfill(6))
        df = get_stock_basics()[['name']]
        df = df.reset_index()
        return pd.merge(df, wt)
    except Exception as er:
        print(str(er))
def get_zz500s():
    """
    Fetch the current ZZ500 (CSI 500) index constituents.

    Return
    --------
    DataFrame
        code : stock code
        name : stock name
    """
    from tushare.stock.fundamental import get_stock_basics
    try:
        # df = pd.read_excel(ct.HS300_CLASSIFY_URL_FTP % (ct.P_TYPE['ftp'], ct.DOMAINS['idxip'],
        #                                                 ct.PAGES['zz500b']), parse_cols=[0, 1])
        # df.columns = ct.FOR_CLASSIFY_B_COLS
        # df['code'] = df['code'].map(lambda x: str(x).zfill(6))
        wt = pd.read_excel(ct.HS300_CLASSIFY_URL_FTP % (ct.P_TYPE['ftp'],
                                                        ct.DOMAINS['idxip'],
                                                        ct.PAGES['zz500wt']),
                           parse_cols=[0, 3, 6])
        wt.columns = ct.FOR_CLASSIFY_W_COLS
        wt['code'] = wt['code'].map(lambda x: str(x).zfill(6))
        df = get_stock_basics()[['name']]
        df = df.reset_index()
        return pd.merge(df, wt)
    except Exception as er:
        print(str(er))
def setUp(self):
    self.dlm = DelimiterTokenizer(delim_set=[' '], return_set=True)
    self.A = pd.DataFrame([{'l_id': 1, 'l_attr': 'ab cd ef aa bb'},
                           {'l_id': 2, 'l_attr': ''},
                           {'l_id': 3, 'l_attr': 'ab'},
                           {'l_id': 4, 'l_attr': 'll oo pp'},
                           {'l_id': 5, 'l_attr': 'xy xx zz fg'},
                           {'l_id': 6, 'l_attr': pd.np.NaN}])
    self.B = pd.DataFrame([{'r_id': 1, 'r_attr': 'mn'},
                           {'r_id': 2, 'r_attr': 'he ll'},
                           {'r_id': 3, 'r_attr': 'xy pl ou'},
                           {'r_id': 4, 'r_attr': 'aa'},
                           {'r_id': 5, 'r_attr': 'fg cd aa ef'},
                           {'r_id': 6, 'r_attr': None}])

    # generate cartesian product A x B to be used as candset
    self.A['tmp_join_key'] = 1
    self.B['tmp_join_key'] = 1
    self.C = pd.merge(self.A[['l_id', 'tmp_join_key']],
                      self.B[['r_id', 'tmp_join_key']],
                      on='tmp_join_key').drop('tmp_join_key', 1)

    self.empty_A = pd.DataFrame(columns=['l_id', 'l_attr'])
    self.empty_B = pd.DataFrame(columns=['r_id', 'r_attr'])
    self.empty_candset = pd.DataFrame(columns=['l_id', 'r_id'])
def label_feature_producer(start_date, end_date, features, labels):
    labeller = Labeller(start_date, end_date, labels)
    dataset = labeller.get_labels()
    dataset['training_end_date'] = start_date
    dataset['labeling_end_date'] = end_date

    # dataset
    feature_grabber = feature_processor.FeatureGrabber(start_date, engine, config_db, con)
    feature_name_dictionary = {}
    # print dataset
    for feature in features:
        # print feature
        res_training, feature_names_training = feature_grabber.getFeature(feature)
        feature_name_dictionary[feature] = feature_names_training
        res_training.drop_duplicates(inplace=True)
        dataset = pd.merge(dataset, res_training, on=config_db['id_column'], how='left')
    return dataset, feature_name_dictionary
    # return None, None
def foo1():
    # pandas infers the merge (join) keys from the overlapping column names
    data = pd.merge(pd.merge(ratings, users), movies)
    # mean movie ratings grouped by gender, arranged as a DataFrame
    # with titles as rows and genders as columns
    mean_rating = data.pivot_table('rating', columns='gender',
                                   index='title', aggfunc='mean')
    # keep only titles that received at least 250 ratings
    ratings_by_title = data.groupby('title').size()
    active_titles = ratings_by_title.index[ratings_by_title >= 250]
    # select the rows for those active titles from mean_rating
    mean_rating = mean_rating.ix[active_titles]
    # top films among female viewers, sorted by the F column in descending order
    top_female_ratings = mean_rating.sort_index(by='F', ascending=False)
    print(top_female_ratings)
def to_df(self):
    """
    Get a dataframe containing the states and value of all nodes of computation

    ::

        >>> comp = loman.Computation()
        >>> comp.add_node('foo', value=1)
        >>> comp.add_node('bar', value=2)
        >>> comp.to_df()
                       state  value is_expansion
        bar  States.UPTODATE      2          NaN
        foo  States.UPTODATE      1          NaN
    """
    df = pd.DataFrame(index=nx.topological_sort(self.dag))
    df[_AN_STATE] = pd.Series(nx.get_node_attributes(self.dag, _AN_STATE))
    df[_AN_VALUE] = pd.Series(nx.get_node_attributes(self.dag, _AN_VALUE))
    df_timing = pd.DataFrame.from_dict(nx.get_node_attributes(self.dag, 'timing'), orient='index')
    df = pd.merge(df, df_timing, left_index=True, right_index=True, how='left')
    return df
def hashtags_multi(search_id):
    ids = [search_id]
    ids.extend(request.args.getlist('id'))
    in_clause = ','.join([str(i) for i in ids])
    searches = query("""
        SELECT id, date_path, text
        FROM searches
        WHERE id in (%s)
        """ % in_clause)
    summary = []
    search = searches[0]
    summary.append({'id': search['id'], 'date_path': search['date_path'],
                    'text': search['text'],
                    'colname': 'count_%s' % search['id']})
    d = pd.read_csv('data/%s/count-hashtags.csv' % search['date_path'])
    d = d.rename(columns={'count': 'count_%s' % search['id']})
    for search in searches[1:]:
        summary.append({'id': search['id'], 'date_path': search['date_path'],
                        'text': search['text'],
                        'colname': 'count_%s' % search['id']})
        e = pd.read_csv('data/%s/count-hashtags.csv' % search['date_path'])
        e = e.rename(columns={'count': 'count_%s' % search['id']})
        d = pd.merge(d, e, on='hashtag', how='outer').fillna(0)
    d.sort_values(by='count_%s' % search_id, inplace=True, ascending=False)
    result = {'summary': summary, 'hashtags': d.to_dict(orient='record')}
    return jsonify(result)
def censor_diagnosis(genotype_file, phenotype_file, final_pfile,
                     field='na', start_time=float('nan'), end_time=float('nan')):
    import pandas as pd
    import numpy as np
    genotypes = pd.read_csv(genotype_file)
    phenotypes = pd.read_csv(phenotype_file)
    mg = pd.merge(phenotypes, genotypes, on='id')
    if np.isnan(start_time) and np.isnan(end_time):
        print("Choose appropriate time period")
    if field == 'na':
        if np.isfinite(start_time) and np.isnan(end_time):
            final = mg[mg['AgeAtICD'] > start_time]
        elif np.isnan(start_time) and np.isfinite(end_time):
            final = mg[mg['AgeAtICD'] < end_time]
        else:
            final = mg[(mg['AgeAtICD'] > start_time) & (mg['AgeAtICD'] < end_time)]
    else:
        mg['diff'] = mg[field] - mg['AgeAtICD']
        if np.isfinite(start_time) and np.isnan(end_time):
            final = mg[(mg['diff'] > start_time) | (np.isnan(mg['diff']))]
        elif np.isnan(start_time) and np.isfinite(end_time):
            final = mg[(mg['diff'] < end_time) | (np.isnan(mg['diff']))]
        else:
            final = mg[(mg['diff'] > start_time) & (mg['diff'] < end_time) | (np.isnan(mg['diff']))]
    final[['id', 'icd9', 'AgeAtICD']].to_csv(final_pfile)
def main():
    # read and preprocess the movie data
    movie = pd.read_table('movies.dat', sep='::',
                          names=['movie_id', 'movie_name', 'tag'], engine='python')
    movie = movie_preprocessing(movie)

    # read the ratings data and merge it with movie data
    rating = pd.read_table("ratings.dat", sep="::",
                           names=["user_id", "movie_id", "rating", "timestamp"],
                           engine='python')
    data = pd.merge(rating, movie, on="movie_id")

    # extract feature from our data set
    streaming_batch, user_feature, actions, reward_list = feature_extraction(data)
    streaming_batch.to_csv("streaming_batch.csv", sep='\t', index=False)
    user_feature.to_csv("user_feature.csv", sep='\t')
    pd.DataFrame(actions, columns=['movie_id']).to_csv("actions.csv", sep='\t', index=False)
    reward_list.to_csv("reward_list.csv", sep='\t', index=False)

    action_context = movie[movie['movie_id'].isin(actions)]
    action_context.to_csv("action_context.csv", sep='\t', index=False)
def read_data_table_test(test_tables):
    test = pd.read_csv('../input/sample_submission.csv')[['image_name']]
    i = 0
    for t_path in test_tables:
        data = pd.read_csv(t_path)
        data.rename(columns={'Type_1': 'Type_1_num_{}'.format(i),
                             'Type_2': 'Type_2_num_{}'.format(i),
                             'Type_3': 'Type_3_num_{}'.format(i)},
                    inplace=True)
        test = pd.merge(test, data, how='left', on='image_name', left_index=True)
        i += 1
    '''
    resolutions = pd.read_csv("../modified_data/resolutions_and_color_features_1.csv")
    resolutions = resolutions[resolutions['type'] == 'test']
    resolutions.drop(['type'], axis=1, inplace=True)
    test = pd.merge(test, resolutions, how='left', on='image_name', left_index=True)
    '''
    return test
def read_data(file_01, file_02):
    data_01 = pd.read_csv(file_01,
                          parse_dates={'timeline': ['btce-time_stamp']},
                          infer_datetime_format=True)
    data_02 = pd.read_csv(file_02,
                          parse_dates={'timeline': ['epoch_time_stamp']},
                          infer_datetime_format=True)
    data_02 = data_02.drop_duplicates('epoch')
    data_01['timeline'] = data_01['timeline'].astype(float)
    data_02['timeline'] = data_02['timeline'].astype(float)
    data_ = data_02.set_index('timeline').reindex(
        data_01.set_index('timeline').index, method='nearest').reset_index()
    data = pd.merge(data_01, data_, on='timeline', suffixes=('_', ''))
    return data
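The nearest-index reindex above aligns two time series before an exact-key merge; pandas also provides merge_asof, which performs a nearest-key join directly. A minimal sketch with toy data (the column names are made up):

import pandas as pd

trades = pd.DataFrame({'time': [1.0, 2.5, 4.0], 'price': [10, 11, 12]})
quotes = pd.DataFrame({'time': [0.9, 2.4, 3.7], 'bid': [9.9, 10.8, 11.9]})

# both frames must be sorted on the key; direction='nearest' pairs each
# trade with the closest quote in time
aligned = pd.merge_asof(trades, quotes, on='time', direction='nearest')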
def digPHconveseRateV2(data_type='train'):
    df_output = ''
    save_path = ''
    if data_type == 'train':
        df_output = pd.read_csv('data/cutData/train_time_v7.csv')
        save_path = 'data/cutData/train_time_v8.csv'
    elif data_type == 'test':
        df_output = pd.read_csv('data/first_merge/test_join_v7.csv')
        save_path = 'data/first_merge/test_join_v8.csv'
    else:
        print('invalid data_type')
        return
    df_userPH = pd.read_csv('data/feature/PL_user.csv')
    df_output = pd.merge(df_output, df_userPH, how='left', on='userID')
    del df_userPH
    print('saving......')
    df_output.to_csv(save_path, index=False)
def predict_test_prob(bst):
    df_all = loadCSV('data/first_merge/test_join_v9.csv')
    df_sta_lgbm = loadCSV('data/stacking/prob_lgbm_test.csv')
    print('merging stacking features')
    df_all = pd.merge(df_all, df_sta_lgbm, how='left', on='instanceID')
    del df_sta_lgbm
    instanceID = df_all.instanceID.values
    feature_all = df_all.drop(['label', 'clickTime', 'instanceID',
                               'residence', 'appCategory'], axis=1).values
    del df_all
    dtest = xgb.DMatrix(feature_all)
    prob = bst.predict(dtest)
    output = pd.DataFrame({'instanceID': instanceID, 'prob': prob})
    output.to_csv('result/submission2.csv', index=False)
def predict_test_prob(lgbm):
    df_all = loadCSV('data/first_merge/test_join_v9.csv')
    df_sta_xgb = loadCSV('data/stacking/prob_xgb_test.csv')
    print('merging stacking features')
    df_all = pd.merge(df_all, df_sta_xgb, how='left', on='instanceID')
    del df_sta_xgb
    instanceID = df_all.instanceID.values
    feature_all = df_all.drop(['label', 'clickTime', 'instanceID',
                               'residence', 'appCategory'], axis=1).values
    prob = lgbm.predict(feature_all, num_iteration=lgbm.best_iteration)
    output = pd.DataFrame({'instanceID': instanceID, 'prob': prob})
    output.to_csv('result/submission.csv', index=False)
def cut_install():
    df_installed = pd.read_csv('data/origin/user_installedapps.csv')
    df_app_cat = pd.read_csv('data/origin/app_categories.csv')
    print('files loaded')
    total = len(df_installed)
    p_len = total // 10
    for i in range(9):
        print('processing part %d' % (i + 1))
        df_part = df_installed[i * p_len:(i + 1) * p_len]
        df_part = pd.merge(df_part, df_app_cat, how='left', on='appID')
        p_name = 'data/feature/install_cut/cut_p' + str(i + 1) + '.csv'
        df_part.to_csv(p_name, index=False)
        del df_part
    print('processing the last part')
    df_part = df_installed[9 * p_len:]
    df_part = pd.merge(df_part, df_app_cat, how='left', on='appID')
    df_part.to_csv('data/feature/install_cut/cut_p10.csv', index=False)
    del df_part
def iter_over_stocks(stock_lists, db):
    final_result = {}
    for x in stock_lists:
        for y in stock_lists:
            data1 = db.select_data(stock_lists[0], begin='2010-01-01', end='2015-12-30')
            data_frame = db.get_dataframe('sh600741', begin='2010-01-01', end='2015-12-30')
            data_frame = data_frame.set_index(data_frame.date)
            data_frame1 = db.get_dataframe('sh601668', begin='2010-01-01', end='2015-12-30')
            data_frame1 = data_frame1.set_index(data_frame1.date.values)
            base_data1 = raise_value(data_frame)
            base_data2 = raise_value(data_frame1)
            result = pd.merge(base_data1, base_data2,
                              left_index=True, right_index=True, how='inner')
            result = window_similarity(result)
            final_result.update({x + y: result})
    return final_result
def summary(self):
    """
    This function is used to summarize the result. If you want to calculate
    some other indicator, you can add it here.

    :return:
    """
    if self._analysis is not None:
        self._analysis(self.asset_dict)
    # for x in self.asset_dict:
    #     self.get_benchmark()
    #     asset_return = (self.asset_dict[x] - self._base_fund) / self._base_fund
    #     asset_return = asset_return.add_prefix(str(x) + "_")
    #     print asset_return
    #     result = pd.merge(asset_return, self._benchmark_data,
    #                       left_index=True, right_index=True, how="inner")
    #     max_return = self.get_max_return(x, begin=self._begin_date, end=self._end_date)
    #     print max_return
    #     # print result
    #     # if self._analysis is not None:
    #     #     self._analysis(result)
    #     # result.plot()
    #     # plt.show()
def eval_f12(pred, real):
    '''
    param: pred --> dataframe
           real --> dataframe
    '''
    real['label'] = 1
    pred = pd.merge(pred, real, on=['user_id', 'sku_id'], how='left')
    pred.fillna(0, inplace=True)
    p = pred.label.mean()
    r = np.sum(pred.label) / real.shape[0]
    f12 = (5 * p * r) / (2 * r + 3 * p)
    real.drop(['label'], axis=1, inplace=True)
    print('<--------------- evaluation --------------->')
    print('f12 precision --->: {0}'.format(p))
    print('f12 recall --->: {0}'.format(r))
    print('f12 score -->: {0}'.format(f12))
    return f12
def load_UCPair_onlyact(start_date='2016-02-01 00:00:00',
                        end_date='2016-04-16 00:00:00', cate=[8]):
    '''
    Load (user_id, cate) pairs, flagging users whose actions fall in a
    single category.
    '''
    df = get_action_data(start_date=start_date, end_date=end_date,
                         field=['user_id', 'cate'])
    df = df.drop_duplicates()
    temp = df.groupby(['user_id']).size().reset_index(name='ncate')
    df = pd.merge(df, temp, on=['user_id'], how='left')
    df = df[df.cate == 8]
    df['ncate'] = (df['ncate'] == 1).astype(int)
    return df[['user_id', 'cate', 'ncate']]

# def get_uid_label(start_date = '2016-02-01 00:00:00', end_date = '2016-04-15 00:00:00'):
#     dump_path = './cache/uid_label_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
#     if os.path.exists(dump_path):
#         with open(dump_path, 'rb') as f:
#             df = pickle.load(f)
#     else:
#         df = get_action_data(start_date=start_date, end_date=end_date, field=['user_id', 'type'])
#         df = df[df.type==4].user_id.drop_duplicates().to_frame()
#         with open(dump_path, 'wb') as f:
#             pickle.dump(df, f)
#     return df
def load_brand_comment_ratio(end_date='2016-04-01 00:00:00'):
    '''
    Average bad-comment rate per brand.
    '''
    dump_path = './cache/brand_comment_ratio_{0}.pkl'.format(end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        item_feat = load_base_item_feat(end_date=end_date)
        item_feat = item_feat[['sku_id', 'bad_comment_rate']]
        brands = get_action_data(start_date='2016-02-01 00:00:00',
                                 end_date=end_date, field=['sku_id', 'brand'])
        brands = brands.drop_duplicates()
        df = pd.merge(item_feat, brands, on=['sku_id'], how='left')
        df = df[['brand', 'bad_comment_rate']]
        df = df.groupby(['brand'], as_index=False).mean()
        df.columns = ['brand', 'brand_bad_comment_rate']
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    return df
def fetch_feature_1(train_feature_path, finnal_feature_data_path):
    import pandas as pd
    train_feature_data = pd.read_csv(train_feature_path)
    print u'preprocessing user and comment data....'
    deal_with_user_data()
    deal_with_comment_data()
    # user_info = delete_user_info_no_1()
    user_info = pd.read_csv(user_data_final_path)
    comment_info = pd.read_csv(comment_data_final_path)
    print u'merging data....'
    merge_feature_and_user_info = pd.merge(train_feature_data, user_info, on='user_id')
    merge_feature_and_user_info_and_comment = pd.merge(merge_feature_and_user_info,
                                                       comment_info, on='sku_id')
    merge_feature_and_user_info_and_comment.to_csv(finnal_feature_data_path, index=False)
    print u'final feature data written to:', finnal_feature_data_path

# generate_feature_1.fetch_feature_1(train_one_train_feature_path_pre1,
#                                    train_one_train_feature_path_pre2)
def test():
    # load the feature files built from different look-back windows
    before_2_days_feature = pd.read_csv(one_before_2_days_feature_path)
    before_4_days_feature = pd.read_csv(one_before_4_days_feature_path)
    before_6_days_feature = pd.read_csv(one_before_6_days_feature_path)
    before_8_days_feature = pd.read_csv(one_before_8_days_feature_path)

    # shapes of the feature frames
    print u'2-day feature shape:', before_2_days_feature.shape
    print u'4-day feature shape:', before_4_days_feature.shape
    print u'6-day feature shape:', before_6_days_feature.shape
    print u'8-day feature shape:', before_8_days_feature.shape

    # outer-join all feature frames on (user_id, sku_id)
    new_data_df1 = pd.merge(before_2_days_feature, before_4_days_feature,
                            on=['user_id', 'sku_id'], how='outer')
    new_data_df2 = pd.merge(before_6_days_feature, before_8_days_feature,
                            on=['user_id', 'sku_id'], how='outer')
    new_data_df = pd.merge(new_data_df1, new_data_df2,
                           on=['user_id', 'sku_id'], how='outer')
def calculate_query_bin_bits(tfidf):
    # this also needs to return the table from redis as well as the bin id
    table = str2int(ujson.loads(r.get('table')))
    dim = int(r.get('dim'))
    mapping = ujson.loads(r.get('map'))
    mapping = pd.DataFrame({'word': mapping})
    num_vectors = 16
    words = list(tfidf.keys())
    values = list(tfidf.values())
    tfidf_df = pd.DataFrame({'word': words, 'value': values})
    article_representation = pd.merge(mapping, tfidf_df, on='word',
                                      how='left').fillna(0)['value']
    bin_vectors = generate_random_vectors(num_vectors, dim)
    powers_of_two = 1 << np.arange(num_vectors - 1, -1, -1)
    query_bin_bits = (article_representation.dot(bin_vectors) >= 0)
    return query_bin_bits, table
def folderToDict(model_output_path, c, probas=None):
    logging.info("merge folder %s" % model_output_path)
    files = glob.glob(os.path.join(model_output_path, "*.csv"))
    if probas is None:
        probas = defaultdict(lambda: defaultdict(float))
    for f in files:
        logging.info("loading... %s" % f)
        df = pd.read_csv(f, dtype={"row_id": int, "place_id": int, "proba": float})
        for i in range(len(df)):
            probas[df["row_id"][i]][df["place_id"][i]] += c * df["proba"][i]
    try:
        with open(os.path.join(model_output_path, "map3.txt"), "r") as f_score:
            logging.info("map3=%6.6f" % float(f_score.read()))
    except:
        pass
    return probas
def parseDict(probas, output_name, valid_file=None):
    df = pd.DataFrame()
    df["row_id"] = probas.keys()
    df["place_id"] = df["row_id"].apply(
        lambda x: map(itemgetter(0),
                      sorted(probas[x].items(), key=itemgetter(1), reverse=True)[:3]))
    if valid_file is not None:
        df_valid = pd.read_csv(valid_file, usecols=["row_id", "place_id"])
        df_valid.rename(columns={"place_id": "place_id_label"}, inplace=True)
        df_merge = pd.merge(df, df_valid, how="left", on="row_id")
        valid_score = metrics.mapk(df_merge.place_id_label.values[:, None],
                                   df_merge.place_id.values, 3)
        logging.info("total validation score: %f" % valid_score)
        np.savetxt("%s.txt" % output_name, [valid_score], fmt="%f")
        del df_valid
        del df_merge
    df["place_id"] = df["place_id"].apply(lambda x: " ".join(map(str, x)))
    df.to_csv("%s.csv" % output_name, index=False)
def folderToDict(model_output_path, c=1., probas=None):
    logging.info("merge folder %s" % model_output_path)
    files = glob.glob(os.path.join(model_output_path, "*.csv"))
    if probas is None:
        probas = defaultdict(lambda: defaultdict(float))
    for f in files:
        logging.info("loading... %s" % f)
        df = pd.read_csv(f, dtype={"row_id": int, "place_id": int, "proba": float})
        for i in range(len(df)):
            probas[df["row_id"][i]][df["place_id"][i]] += c * df["proba"][i]
    try:
        with open(os.path.join(model_output_path, "map3.txt"), "r") as f_score:
            logging.info("map3=%6.6f" % float(f_score.read()))
    except:
        pass
    return probas
def main():
    opts = get_parser().parse_args()
    rater_1 = pd.read_csv(opts.rater_1)[['participant_id', 'check-1']]
    rater_2 = pd.read_csv(opts.rater_2)[['participant_id', 'check-1']]
    rater_1.columns = ['participant_id', 'rater_1']
    rater_2.columns = ['participant_id', 'rater_2']
    merged = pd.merge(rater_1, rater_2, on='participant_id', how='outer')

    idcol = 'participant_id'
    if opts.mapping_file:
        idcol = 'subject_id'
        name_mapping = pd.read_csv(
            opts.mapping_file, sep=' ', header=None, usecols=[0, 1])
        name_mapping.columns = ['subject_id', 'participant_id']
        name_mapping['participant_id'] = name_mapping.participant_id.astype(str) + '.gif'
        merged = pd.merge(name_mapping, merged, on='participant_id', how='outer')

    merged[[idcol, 'rater_1', 'rater_2']].sort_values(by=idcol).to_csv(opts.output, index=False)
def calculate_improvement_metrics(final_consensus, scored_df, tm_eval_df, hgm_df, metrics):
    """For coding transcripts, how much did we improve the metrics?"""
    tm_df = tm_eval_df.reset_index()[['TransMapOriginalIntronsPercent', 'TranscriptId']]
    hgm_df_subset = hgm_df[hgm_df['AlignmentId'].apply(tools.nameConversions.aln_id_is_transmap)]
    hgm_df_subset = hgm_df_subset[['TranscriptId', 'IntronAnnotSupportPercent',
                                   'IntronRnaSupportPercent']]
    tm_df = pd.merge(tm_df, hgm_df_subset, on='TranscriptId')
    df = pd.merge(tm_df, scored_df.reset_index(), on='TranscriptId', suffixes=['TransMap', ''])
    df = df.drop_duplicates(subset='AlignmentId')  # why do I need to do this?
    df = df.set_index('AlignmentId')
    metrics['Evaluation Improvement'] = {'changes': [], 'unchanged': 0}
    for aln_id, c in final_consensus:
        if c['transcript_biotype'] != 'protein_coding':
            continue
        if 'transMap' in c['transcript_modes']:
            metrics['Evaluation Improvement']['unchanged'] += 1
            continue
        tx_s = df.ix[aln_id]
        metrics['Evaluation Improvement']['changes'].append(
            [tx_s.TransMapOriginalIntronsPercent,
             tx_s.IntronAnnotSupportPercentTransMap,
             tx_s.IntronRnaSupportPercentTransMap,
             tx_s.OriginalIntronsPercent_mRNA,
             tx_s.IntronAnnotSupportPercent,
             tx_s.IntronRnaSupportPercent,
             tx_s.TransMapGoodness,
             tx_s.AlnGoodness_mRNA])
def get_users_orders(self, prior_or_train):
    '''
    get users' prior detailed orders
    '''
    if os.path.exists(self.cache_dir + 'users_orders.pkl'):
        with open(self.cache_dir + 'users_orders.pkl', 'rb') as f:
            users_orders = pickle.load(f)
    else:
        orders = self.get_orders()
        order_products_prior = self.get_orders_items(prior_or_train)
        users_orders = pd.merge(order_products_prior,
                                orders[['user_id', 'order_id', 'order_number', 'days_up_to_last']],
                                on=['order_id'], how='left')
        with open(self.cache_dir + 'users_orders.pkl', 'wb') as f:
            pickle.dump(users_orders, f, pickle.HIGHEST_PROTOCOL)
    return users_orders