The following 8 code examples, extracted from open-source Python projects, illustrate how to use pandas.factorize(). The snippets assume the usual imports, import numpy as np and import pandas as pd.
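Before the extracted examples, a minimal sketch of what pd.factorize() returns (toy inputs; the printed values follow pandas' documented behavior):

import numpy as np
import pandas as pd

# factorize() maps each value to an integer code and returns the uniques.
codes, uniques = pd.factorize(['b', 'a', 'b', 'c'])
print(codes)    # [0 1 0 2] -- codes follow first-appearance order
print(uniques)  # ['b' 'a' 'c']

# With sort=True the codes index into the sorted uniques instead.
codes, uniques = pd.factorize(['b', 'a', 'b', 'c'], sort=True)
print(codes)    # [1 0 1 2]
print(uniques)  # ['a' 'b' 'c']

# Missing values get no code of their own; they come back as -1.
codes, uniques = pd.factorize(pd.Series(['b', None, 'a']))
print(codes)    # [ 0 -1  1]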
def precalculate_factors(self, df):
  """Precomputes the buckets and labels for the Jackknife object.

  Args:
    df: A pandas dataframe.
  """
  if self._unit is None:
    self._buckets = np.arange(len(df))
    self._bucket_labels = np.arange(len(df))
  else:
    self._buckets, names = pd.factorize(df[self._unit])
    self._bucket_labels = np.arange(len(names))
def precalculate_factors(self, df):
  """Initializes the labels for the Bootstrap object.

  Args:
    df: A pandas dataframe.
  """
  if self._unit is not None:
    self._values, labels = pd.factorize(df[self._unit])
    self._labels = [self._values == ii for ii in range(len(labels))]
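The Jackknife and Bootstrap helpers above share one trick: comparing the integer codes from factorize against each level index yields one boolean row mask per resampling unit. A toy illustration (hypothetical unit ids):

units = pd.Series(['u1', 'u2', 'u1', 'u3'])
values, labels = pd.factorize(units)
masks = [values == ii for ii in range(len(labels))]
# masks[0] selects the rows of 'u1' -> [True, False, True, False]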
def precalculate_factors(self, data, sort=True):
  """Initializes the factor variable.

  Args:
    data: A pandas dataframe.
    sort: Boolean indicating whether or not the conditions should be sorted.

  Raises:
    ValueError: The baseline key isn't found.
  """
  self.factors, condition_keys = pd.factorize(data[self.condition_column],
                                              sort=sort)
  self.alternate_indices = [
      ii for ii, label in enumerate(condition_keys)
      if self._include_base or label != self.baseline_key
  ]
  self.alternate_keys = condition_keys[self.alternate_indices]
  if any(condition_keys == self.baseline_key):
    self.baseline_index = np.where(condition_keys == self.baseline_key)[0][0]
  else:
    raise ValueError("Baseline value {} not present in column {}".format(
        self.baseline_key, self.condition_column))
  self._baseline_mask = (self.factors == self.baseline_index)
  self._alternate_masks = {}
  for ii in self.alternate_indices:
    self._alternate_masks[ii] = (self.factors == ii)
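The baseline lookup above is easiest to see on toy data (hypothetical condition labels):

data = pd.DataFrame({'condition': ['treatment', 'control', 'treatment']})
factors, condition_keys = pd.factorize(data['condition'], sort=True)
# condition_keys -> Index(['control', 'treatment']); factors -> [1, 0, 1]
baseline_index = np.where(condition_keys == 'control')[0][0]  # 0
baseline_mask = factors == baseline_index  # [False, True, False]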
def __init__(self, metric, dimensions, name=None):
  """Initializes distribution estimator.

  Args:
    metric: Thing to calculate.
    dimensions: Dimensions to distribute things over.
    name: A string for the column name of results.
  """

  def _calculate(data, weights):
    """Calculates distribution metric."""
    total = 1.0 * _weighted_sum(data[metric].values, weights)
    # The original used pd.lib.fast_zip, a pandas internal removed in
    # later releases; a 1-D object array of row tuples is equivalent.
    dimension_tuples = np.empty(len(data), dtype=object)
    dimension_tuples[:] = list(zip(*(data[ii].values for ii in dimensions)))
    factors, keys = pd.factorize(dimension_tuples)
    results = np.zeros(len(keys))
    for ii in range(len(keys)):  # xrange in the Python 2 original
      results[ii] = _weighted_sum(data[metric].values,
                                  weights * (factors == ii)) / total
    output = pd.DataFrame(
        results,
        index=pd.MultiIndex.from_tuples(keys, names=dimensions),
        columns=[""])
    return output

  if name is None:
    name = "{} Distribution".format(metric)
  super(Distribution, self).__init__(name, _calculate, "dataframe")
def __init__(self, metric, dimensions, ascending=True, name=None):
  """Initializes distribution estimator.

  Args:
    metric: Thing to calculate.
    dimensions: Dimensions to distribute things over.
    ascending: List of bools to pass to pandas.sort_index that say
      whether to sort each dimension ascending or descending.
    name: A string for the column name of results.
  """

  def _calculate(data, weights):
    """Calculates cumulative distribution metric."""
    total = 1.0 * _weighted_sum(data[metric].values, weights)
    # As above, the removed internal pd.lib.fast_zip is replaced with an
    # explicit 1-D object array of row tuples.
    dimension_tuples = np.empty(len(data), dtype=object)
    dimension_tuples[:] = list(zip(*(data[ii].values for ii in dimensions)))
    factors, keys = pd.factorize(dimension_tuples, sort=True)
    results = np.zeros(len(keys))
    for ii in range(len(keys)):  # xrange in the Python 2 original
      results[ii] = _weighted_sum(data[metric].values,
                                  weights * (factors == ii)) / total
    output = pd.DataFrame(
        results,
        index=pd.MultiIndex.from_tuples(keys, names=dimensions),
        columns=[""])
    output = output.sort_index(ascending=ascending).cumsum()
    return output

  if name is None:
    name = "{} Cumulative Distribution".format(metric)
  super(CumulativeDistribution, self).__init__(name, _calculate, "dataframe")
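Both distribution estimators factorize tuples built from several columns at once. A sketch of that idea using only public APIs (toy column names assumed):

df = pd.DataFrame({'country': ['US', 'US', 'DE'],
                   'device': ['web', 'app', 'web']})

# Build a 1-D object array of row tuples, then factorize it.
row_tuples = np.empty(len(df), dtype=object)
row_tuples[:] = list(zip(df['country'], df['device']))
factors, keys = pd.factorize(row_tuples)
# factors -> [0 1 2]; keys -> [('US', 'web') ('US', 'app') ('DE', 'web')]

index = pd.MultiIndex.from_tuples(keys, names=['country', 'device'])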
def factorize(train, test, features, na_value=-9999, full=False, sort=True):
    """Factorize categorical features.

    Parameters
    ----------
    train : pd.DataFrame
    test : pd.DataFrame
    features : list
        Column names in the DataFrame to be encoded.
    na_value : int, default -9999
        Code assigned to categories unseen in the fitted values.
    full : bool, default False
        Whether to use all columns from train/test or only from train.
    sort : bool, default True
        Sort by values.

    Returns
    -------
    train : pd.DataFrame
    test : pd.DataFrame
    """
    for column in features:
        if full:
            vs = pd.concat([train[column], test[column]])
            labels, indexer = pd.factorize(vs, sort=sort)
        else:
            labels, indexer = pd.factorize(train[column], sort=sort)

        train[column] = indexer.get_indexer(train[column])
        test[column] = indexer.get_indexer(test[column])

        if na_value != -1:
            train[column] = train[column].replace(-1, na_value)
            test[column] = test[column].replace(-1, na_value)

    return train, test
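A possible call to this helper (toy frames; note how Index.get_indexer maps categories unseen in train to -1, which the function then rewrites to na_value):

train = pd.DataFrame({'city': ['ams', 'ber', 'ams']})
test = pd.DataFrame({'city': ['ber', 'par']})  # 'par' never appears in train
train, test = factorize(train, test, features=['city'])
# train['city'] -> [0, 1, 0]
# test['city']  -> [1, -9999]  ('par' is unseen, so its -1 becomes na_value)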
def model_data(data, LECAT=False, NAMEAN=False, NA999=False, OH=False,
               ONLYCONT=False, ONLYCAT=False, ONLYCATOH=False,
               COLSREMOVAL=False, cols=[], maxCategories=300):
    data = data.copy()
    cat_var = list(data.select_dtypes(["object"]).columns)
    cont_var = list(data.select_dtypes(["float", "int"]).columns)

    if COLSREMOVAL:
        data = data.drop(cols, axis=1)  # positional axis was removed in pandas 2.0
        cat_var = list(data.select_dtypes(["object"]).columns)
        cont_var = list(data.select_dtypes(["float", "int"]).columns)

    if NAMEAN:
        for col in cont_var:
            data.loc[data[col].isnull(), col] = data[col].mean()

    if NA999:
        for col in cont_var:
            data.loc[data[col].isnull(), col] = -999

    if LECAT:
        # Label-encode every categorical column with pd.factorize.
        for col in cat_var:
            data[col] = pd.factorize(data[col])[0]

    if OH:
        cols2dummy = [col for col in cat_var
                      if len(data[col].unique()) <= maxCategories]
        colsNot2dummy = [col for col in cat_var
                         if len(data[col].unique()) > maxCategories]
        data = pd.get_dummies(data, dummy_na=True, columns=cols2dummy)
        # Binning for high-cardinality columns.
        for col in colsNot2dummy:
            data[col] = pd.factorize(data[col])[0]
            # DummycolumnsBins is a helper defined elsewhere in the source project.
            dcb = DummycolumnsBins(cols=col, prefix=col, nb_bins=2000)
            dcb.fit(data)
            pd_binned = dcb.transform(data)
            data = pd.concat([data, pd_binned], axis=1)

    if ONLYCONT:
        data = data[cont_var]
    if ONLYCAT:
        test_idx = data['ID']
        Y = data['target']
        data = data[cat_var]
        data['ID'] = test_idx
        data['target'] = Y
    if ONLYCATOH:
        test_idx = data['ID']
        Y = data['target']
        cols = list(set(data.columns).difference(set(cont_var)))
        print(cols)
        data = data[cols]
        data['ID'] = test_idx
        data['target'] = Y

    return data
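A hedged usage sketch exercising only the flags that stay within pandas (the OH path additionally needs the project's DummycolumnsBins helper):

df = pd.DataFrame({'color': ['red', 'blue', None],
                   'size': [1.0, np.nan, 3.0]})
out = model_data(df, NAMEAN=True, LECAT=True)
# out['size']  -> [1.0, 2.0, 3.0]  (NaN replaced by the column mean)
# out['color'] -> [0, 1, -1]       (factorize codes missing values as -1)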
def Load_data():
    train = pd.read_csv(path_train)
    test = pd.read_csv(path_test)

    # Combine train and test.
    # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
    data_comb = pd.concat([train, test])

    # Found at https://www.kaggle.com/marcellonegro/prudential-life-insurance-assessment/xgb-offset0501/run/137585/code
    # Create any new variables.
    data_comb['Product_Info_2_char'] = data_comb.Product_Info_2.str[0]
    data_comb['Product_Info_2_num'] = data_comb.Product_Info_2.str[1]

    # Factorize categorical variables.
    data_comb['Product_Info_2'] = pd.factorize(data_comb['Product_Info_2'])[0]
    data_comb['Product_Info_2_char'] = pd.factorize(data_comb['Product_Info_2_char'])[0]
    data_comb['Product_Info_2_num'] = pd.factorize(data_comb['Product_Info_2_num'])[0]

    data_comb['BMI_Age'] = data_comb['BMI'] * data_comb['Ins_Age']

    med_keyword_columns = data_comb.columns[
        data_comb.columns.str.startswith('Medical_Keyword_')]
    data_comb['Med_Keywords_Count'] = data_comb[med_keyword_columns].sum(axis=1)

    print('Encode missing values')
    data_comb.fillna(-1, inplace=True)

    # Fix the dtype on the label column.
    data_comb['Response'] = data_comb['Response'].astype(int)

    # Split train and test (test rows had their missing Response filled with -1).
    train = data_comb[data_comb['Response'] > 0].copy()
    test = data_comb[data_comb['Response'] < 1].copy()

    target = train['Response'].values
    le = preprocessing.LabelEncoder()  # requires: from sklearn import preprocessing
    y = le.fit_transform(target)

    train.drop(['Id', 'Response', 'Medical_History_10', 'Medical_History_24'],
               axis=1, inplace=True)
    test.drop(['Id', 'Response', 'Medical_History_10', 'Medical_History_24'],
              axis=1, inplace=True)

    # DataFrame.as_matrix was removed in pandas 1.0; to_numpy is equivalent.
    train = train.to_numpy()
    test = test.to_numpy()

    print('Construct labels for bumping')
    num_class = len(np.unique(target))
    labels = np.zeros(shape=(train.shape[0], num_class - 1))
    labels[:, 0][target == 1] = 1
    labels[:, 6][target < 8] = 1
    for i in range(1, num_class - 2):
        labels[:, i][target < i + 2] = 1

    return train, test, target, labels
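The loop at the end encodes the ordinal Response into threshold-indicator columns for bumping; a toy check of that logic (values follow directly from the code above):

target = np.array([1, 3, 8])
num_class = 8  # 8 response levels in the full dataset
labels = np.zeros((len(target), num_class - 1))
labels[:, 0][target == 1] = 1
labels[:, 6][target < 8] = 1
for i in range(1, num_class - 2):
    labels[:, i][target < i + 2] = 1
# target=1 -> [1 1 1 1 1 1 1]  (equals 1, and below every threshold)
# target=3 -> [0 0 1 1 1 1 1]  (3 < 4, 5, 6, 7, 8)
# target=8 -> [0 0 0 0 0 0 0]  (above every threshold)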