Python pandas module: DataFrame() example source code

The following 18 code examples, extracted from open-source Python projects, illustrate how to use pandas.DataFrame().
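Before the project excerpts, here is a minimal standalone sketch of constructing a DataFrame (the column names and values are purely illustrative, not taken from the projects below):

import pandas as pd

# build a small DataFrame from a dict of equal-length columns (illustrative data)
df = pd.DataFrame({'countrycode': ['US', 'BR', 'RU'],
                   'recognized': [1, 0, 1]})
print(df.head())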

Project: quickdraw_prediction_model    Author: keisukeirie
def _df_initial_fixer(df, word, sample=60000):
    '''
    function:
    - randomly selects "sample" rows (images) from the dataframe df
      and drops features that are not used in ensemble-method modeling

    input:
        df = dataframe; output of 1_feature_engineering_func [pd.DataFrame]
        word = name of topic, e.g. "cat" [str]
        sample = number of samples to extract from df [int]

    output:
        a new, trimmed dataframe
    '''
    print("total number of images for df_{}: {}".format(word, len(df)))
    random_index = np.random.choice(list(df.index), sample, replace=False)
    df = df.loc[list(random_index)]
    df_test = df.drop(['drawing','key_id','timestamp','recognized','X','Y','time',
                       'X_per_stroke','Y_per_stroke','time_per_stroke',
                       'total_time_of_stroke','dp_per_stroke','dp_percent_per_stroke',
                       'direction'], axis=1)
    return df_test
Project: quickdraw_prediction_model    Author: keisukeirie
def _df_initial_fixer_cc(df, word):
    '''
    prepares training and test X and y for the xgboost countrycode classifier

    function:
    - drops features that are not used in ensemble-method modeling

    input:
        df = dataframe; output of 1_feature_engineering_func [pd.DataFrame]
        word = name of topic, e.g. "cat" [str]

    output:
        a new, trimmed dataframe
    '''
    df_test = df.drop(['drawing','key_id','timestamp','recognized','X','Y','time',
                       'X_per_stroke','Y_per_stroke','time_per_stroke',
                       'total_time_of_stroke','dp_per_stroke','dp_percent_per_stroke',
                       'direction'], axis=1)
    return df_test
Project: quickdraw_prediction_model    Author: keisukeirie
def _country_initial_fixer(df, country, limit):
    '''
    Function:
    extracts data for one country and randomly selects up to "limit" rows from that subset

    Input:
    df = dataframe (should contain a 'countrycode' feature) [dataframe]
    country = two-capital-letter country code [string]
    limit = max number of rows you want to take into the new dataframe

    Output:
    dataframe containing data from the selected country (# of rows <= limit)

    note: uses random.seed(32113)
    '''
    if df[df['countrycode']==country].count()[0] > limit:
        df_c = df[df['countrycode']==country]
        random_c = np.random.choice(list(df_c.index), limit, replace=False)
        df_c = df_c.loc[list(random_c)]
    else:
        df_c = df[df['countrycode']==country]
    return df_c
Project: foamBazar    Author: BV-DR
def postProcessingDatFile(fname, objName=None, root='./'):
    if objName is not None:
        dataFolder = postProcessingFolder(objName, root=root)
        timeNames = timeFolder(root=dataFolder)
    else:
        dataFolder = addslash(root)
        timeNames = []
    if len(timeNames)==0: timeNames=['']    # at least check the current folder
    keyName = os.path.basename(rmslash(fname))
    keyName = os.path.splitext(keyName)[0]
    datFiles = []
    for subdir in timeNames:
        found = filesOnly(sorted(glob.glob(dataFolder + subdir + "/" + keyName + "*.dat")))
        for f in found: datFiles.append(f)
    return datFiles

# concat dataframes and optionally merge the xAxis
# when rows overlap, keep either 'last', 'first', or False
# list_of_data must be of type pandas.DataFrame
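The helper these comments describe is not included in this excerpt; the sketch below (concat_frames is an assumed name) only illustrates the described behavior, assuming list_of_data is a list of pandas.DataFrame objects indexed by the shared x-axis:

import pandas as pd

def concat_frames(list_of_data, keep='last'):
    # hypothetical helper: concatenate DataFrames that share an x-axis index;
    # when index values overlap, keep the 'first' or 'last' occurrence,
    # or keep every row when keep is False
    merged = pd.concat(list_of_data)
    if keep in ('first', 'last'):
        merged = merged[~merged.index.duplicated(keep=keep)]
    return merged.sort_index()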
Project: quickdraw_prediction_model    Author: keisukeirie
def feature_eng_pt3(df_cf):
    '''
    function:
    - feature engineering pt3
      needs to run after feature_eng_pt2, since pt4 and pt5
      use features created in this function.

    - creates the following feature:
      direction = direction of each stroke (from its first XY point to its last XY point)
                    in radians (0 to 6.28...) [float]

    input:
      df_cf = output dataframe from feature_eng_pt2

    output:
      dataframe with the above feature added

    The approach: find the first and last x,y locations of each stroke,
    compute delta x (dx) and delta y (dy), and then compute the direction of
    the stroke in radians with the user-defined function "_radian_direction".
    '''
    direction = {}
    for index in df_cf.index:
        dx = [float(df_cf.drawing[index][stroke][1][-1] - df_cf.drawing[index][stroke][1][0])
              for stroke in range(df_cf.stroke_number[index])]
        dy = [float(df_cf.drawing[index][stroke][0][-1] - df_cf.drawing[index][stroke][0][0])
              for stroke in range(df_cf.stroke_number[index])]
        dx = np.array(dx)
        dy = np.array(dy)
        dx[dx==0] = 0.000001  # avoid division by zero when computing the angle
        vecrad_direction = np.vectorize(_radian_direction)
        direction[index] = vecrad_direction(dy, dx)
    df_cf['direction'] = pd.Series(direction)
    return df_cf
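_radian_direction itself is not part of this excerpt; the sketch below is only an assumption of what it might do (map a (dy, dx) pair to an angle in [0, 2*pi)), not the project's actual implementation:

import numpy as np

def _radian_direction(dy, dx):
    # hypothetical sketch: angle of the vector (dx, dy) in radians,
    # shifted from arctan2's (-pi, pi] range into [0, 2*pi)
    angle = np.arctan2(dy, dx)
    return angle if angle >= 0 else angle + 2 * np.pi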
Project: quickdraw_prediction_model    Author: keisukeirie
def load_json(filename):
    '''
    Function:
        - opens a json file and stores the information in a pandas dataframe
        - also prints an aggregated df with counts of pictures by countrycode
    Input:
        1. filename/path, e.g. ./data/filename.json
    Output:
        1. new dataframe containing the json info
    '''
    df = pd.read_json(filename, lines=True)
    test = df.groupby(df['countrycode']).count()
    print(test.sort_values(by='drawing', ascending=False).head(15))
    return df
Project: quickdraw_prediction_model    Author: keisukeirie
def pic_viewer(df_cf, _id):
    '''
    Function:
        - if X and Y columns exist in your dataframe, use this function
          to view the drawing with a specific id.
        - run this after running CNN_feat_eng_pt1 or feature_eng_pt2
    Input:
        1. dataframe df_cf
        2. object id _id
    Output:
        1. scatter plot of x and y
    '''
    plt.scatter(df_cf.X[_id], df_cf.Y[_id])
    plt.gca().invert_yaxis()
Project: 2020plus    Author: KarchinLab
def random_sort(df, prng=None):
    """Randomly shuffle a DataFrame.

    NOTE: if the training data is not randomly shuffled, then
    supervised learning may find artifacts related to the order
    of the data.

    Parameters
    ----------
    df : pd.DataFrame
        dataframe with feature information
    prng : np.random.RandomState, optional
        random state used for shuffling; a new one is created if not given

    Returns
    -------
    df : pd.DataFrame
        Randomly shuffled data frame
    """
    # get new random state if not specified
    if prng is None:
        prng = np.random.RandomState()

    # get random order
    random_indices = prng.choice(df.index.values,  # sample from 'genes'
                                 len(df),  # number of samples
                                 replace=False)  # sample without replacement

    # change order of df
    random_df = df.loc[random_indices].copy()

    return random_df
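A minimal usage sketch of random_sort (the feature table and seed value are illustrative only):

import numpy as np
import pandas as pd

# illustrative feature table; any DataFrame works
feature_df = pd.DataFrame({'feat_a': [0.1, 0.5, 0.9], 'feat_b': [3, 1, 2]},
                          index=['gene1', 'gene2', 'gene3'])
prng = np.random.RandomState(42)   # fixed seed gives a reproducible shuffle (arbitrary value)
shuffled_df = random_sort(feature_df, prng)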
Project: 2020plus    Author: KarchinLab
def process_mutational_features(mydf):
    """Performs feature processing pipeline.

    Parameters
    ----------
    mydf : pd.DataFrame
        data frame containing the desired raw data for computation of
        features for classifier

    Returns
    -------
    proc_feat_df: pd.DataFrame
        dataframe consisting of features for classification
    """
    # rename columns to ensure compatibility with previously
    # written code
    mydf = mydf.rename(columns={'Protein_Change': 'AminoAcid',
                                'DNA_Change': 'Nucleotide'})

    # process features
    feat_list = fmat.generate_feature_matrix(mydf, 2)
    headers = feat_list.pop(0)  # remove header row
    feat_df = pd.DataFrame(feat_list, columns=headers)  # convert to data frame
    proc_feat_df = normalize_mutational_features(feat_df, 0)
    miss_ent_df = pentropy.missense_position_entropy(mydf[['Gene', 'AminoAcid']])
    # mut_ent_df = pentropy.mutation_position_entropy(mydf[['Gene', 'AminoAcid']])

    # incorporate entropy features
    #proc_feat_df['mutation position entropy'] = mut_ent_df['mutation position entropy']
    #proc_feat_df['pct of uniform mutation entropy'] = mut_ent_df['pct of uniform mutation entropy']
    proc_feat_df['missense position entropy'] = miss_ent_df['missense position entropy']
    proc_feat_df['pct of uniform missense entropy'] = miss_ent_df['pct of uniform missense entropy']
    return proc_feat_df
Project: adel    Author: openalea-incubator
def pandadf2adeldict(df):
    ''' convert a pandas dataframe into a dict of numpy arrays '''
    d = df.to_dict()
    return dict((k, np.array(list(dv.values()))) for k, dv in d.items())
Project: inferelator_ng    Author: simonsfoundation
def compute_transcription_factor_activity(self, allow_self_interactions_for_duplicate_prior_columns = True):
        # Find TFs that have non-zero columns in the priors matrix
        non_zero_tfs = self.prior.columns[(self.prior != 0).any(axis=0)].tolist()

        # Delete tfs that have neither prior information nor expression
        delete_tfs = set(self.prior.columns).difference(self.prior.index).difference(non_zero_tfs)
        # Raise warnings
        if len(delete_tfs) > 0:
            message = " ".join([str(len(delete_tfs)).capitalize(),
             "transcription factors are removed because no expression or prior information exists."])
            warnings.warn(message)
            self.prior = self.prior.drop(delete_tfs, axis = 1)

        # Create activity dataframe with values set by default to the transcription factor's expression
        activity = pd.DataFrame(self.expression_matrix.loc[self.prior.columns,:].values,
                index = self.prior.columns,
                columns = self.expression_matrix.columns)

        # Find all non-zero TFs that are duplicates of any other non-zero tfs
        is_duplicated = self.prior[non_zero_tfs].transpose().duplicated(keep=False)
        duplicates = is_duplicated[is_duplicated].index.tolist()

        # Find non-zero TFs that are also present in target gene list 
        self_interacting_tfs = set(non_zero_tfs).intersection(self.prior.index)

        # If this flag is set to true, don't count duplicates as self-interacting when setting the diag to zero
        if allow_self_interactions_for_duplicate_prior_columns:
            self_interacting_tfs = self_interacting_tfs.difference(duplicates)

        # Set the diagonal of the matrix subset of self-interacting tfs to zero
        self_interacting_tfs = list(self_interacting_tfs)
        subset = self.prior.loc[self_interacting_tfs, self_interacting_tfs].values
        np.fill_diagonal(subset, 0)
        self.prior.loc[self_interacting_tfs, self_interacting_tfs] = subset

        # Set the activity of non-zero tfs to the pseudoinverse of the prior matrix times the expression
        if non_zero_tfs:
            activity.loc[non_zero_tfs,:] = np.matrix(linalg.pinv(self.prior[non_zero_tfs])) * np.matrix(self.expression_matrix_halftau)

        return activity
Project: foamBazar    Author: BV-DR
def usage():
    print('''
# template for loading openfoam data into pandas.DataFrame

import sys
sys.path.append("/home/soseng/OpenFOAM/bv/foamBazar/pythonScripts/")
import fsData as fs
from matplotlib import pyplot as plt
if __name__ == "__main__":
    log = fs.loadLogData("-p res -w init,Ux,Uy,Uz", logfiles=['log.run','fsLog'])
    mot = fs.loadMotionInfo("motionInfo", root='./')
    vbm = fs.loadInternalLoads("vbm", root='./', fnames=['my','fz','acc'])
    ''')
Project: foamBazar    Author: BV-DR
def setmetadata(data, label=None, info=None, module=None, args=None):
    data.fsData = deepcopy(FSDATA)
    data.fsData['label'] = label
    data.fsData['info'] = info if info is not None else 'last update: ' + datetime.datetime.now().strftime("%I:%M%p %B %d, %Y")
    data.fsData['module'] = module
    data.fsData['args'] = args if args is not None else {
        'lastUpdate': datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
    }

# load log data given a list of logfiles/folders
# data will be merged and returned as a pandas.DataFrame
# cmd: the fsPlot.py command line arguments
# e.g.: loadLogData('-p res', logfiles=['log.run0','log.run1',''])
# When runs overlap, keep either 'last', 'first', or False
Project: quickdraw_prediction_model    Author: keisukeirie
def feature_engineering_ensemble(df,category,sample=60000,purpose='word',\
                                            countries = ['US','BR','RU','KR']):
    '''
    function:
    - aggregates multiple user defined functions to create dataframe for ensemble method modeling.
    - it also prints out how long it takes to run
    - processes google quickdraw raw data dataframe
    - after this processing, dataframe contains 404 features
    - the output of this function will be used for ensemble method modeling.

    input:
    - df = dataframe that was converted from raw_data json file
    - category = used to name output pickle file
    - sample = number of datapoints included in the final dataframe. (Used only when purpose = 'word')  
    - purpose = 'word' or 'country'. prepares data for different purposes.
        'word' for image recognition, 'country' for country prediction
    - countries = list of country code used in country prediction

    output:
    - pickled dataframe that will be used for ensemble method (404 features)
    filename: "./data/MY_feature_{}.pkl".format(category)
    '''
    start_time = time.time()
    #runs feature_eng_pt1 through pt5.
    df_test1 = feature_eng_pt1(df)
    df_test2 = feature_eng_pt2(df_test1)
    df_test3 = feature_eng_pt3(df_test2)
    df_subset = feature_eng_pt4(df_test3)
    df_subset2 = feature_eng_pt5(df_test3)
    df_final = pd.concat([df_test3,df_subset,df_subset2], axis=1)

    # prepares final dataframe
    # If purpose == 'word', randomly select 'sample' datapoints from df_final
    if purpose == 'word':
        df_final.index = range(len(df_final))
        random_ind = np.random.choice(list(df_final.index), sample, replace=False)
        df_final = df_final.loc[list(random_ind)]
    # If purpose == 'country', collect all datapoints from the selected countries
    elif purpose == 'country':
        df_final = df_final[df_final['countrycode'].isin(countries)]
    df_final.index = df_final['key_id']
    df_final.to_pickle("./data/MY_feature_{}.pkl".format(category))
    print("--- %s seconds ---" % (time.time() - start_time))
Project: quickdraw_prediction_model    Author: keisukeirie
def feature_engineering_CNN(df,category,sample=60000,purpose='word',countries = ['US','BR','RU','KR']):
    '''
    function:
    - aggregates 2 user defined functions that prepare the dataframe for CNN modeling.
    - it also prints out how long it takes to run.

    input:
    - df = dataframe that was converted from raw_data json file
    - category = used to name output pickle file
    - sample = number of datapoints included in the final dataframe. (Used only when purpose = 'word') 
    - purpose = 'word' or 'country'. prepares data for different purposes.
        'word' for image recognition, 'country' for country prediction
    - countries = list of country codes used in country prediction

    output:
    - pickled dataframe that will be used for CNN modeling (1176 features)
    - each row represents 42 by 28 pixel image
    file name: "./data/{}.pkl".format(category)
    '''

    start_time = time.time()
    #runs CNN feature engineering functions
    df_1 = CNN_feat_eng_pt1(df)
    df_2 = CNN_feat_eng_pt2(df_1)
    # If purpose == 'word', randomly select 'sample' datapoints from df_2
    if purpose == 'word':
        df_2.index = range(len(df_2))
        random_ind = np.random.choice(list(df_2.index), sample, replace=False)
        df_2 = df_2.loc[list(random_ind)]
    # If purpose == 'country', collect all datapoints from the selected countries
    elif purpose == 'country':
        df_2 = df_2[df_2['countrycode'].isin(countries)]
    df_2.index = df_2['key_id']
    df_2.to_pickle("./data/{}.pkl".format(category))
    print("--- %s seconds ---" % (time.time() - start_time))
    return df_2



##############################################################################
#           functions for feature engineering for ensemble methods           #
##############################################################################
Project: quickdraw_prediction_model    Author: keisukeirie
def feature_eng_pt1(df_cf):
    '''
    function:
    - feature engineering pt1
      needs to run first, since pt2 to pt5 use features created
      in this function.

    - creates the following features:
      stroke_number = total stroke count of an image [int]
      final_time = time of the last datapoint of an image (how long the user took to draw) [int]
      recognized = True/False response converted to 1/0
                              (1 is True, 0 is False) [int]

    - filtering applied:
      1: filtered out data where recognized == 0.
          Having unrecognized images in the dataset may reduce prediction accuracy.
      2: filtered out data where stroke_number is greater than 15.
          After analysis, most pics were drawn with fewer than 15 strokes.
          I suspect that if stroke counts are above 20 or 30, users might be using a graphics tablet.
          In this project I excluded images above 15 strokes,
          so that all remaining images were drawn in a similar environment.
      3: filtered out data where final_time is greater than 20000.
          Some images have time values above 20000; Quickdraw asks users to draw
          within 20 seconds, so it is puzzling how these drawings exceed 20000 ms.

    input:
    df_cf = dataframe created from a Google Quickdraw raw data json file

    output:
    dataframe with the additional features mentioned above
    '''
    # create feature "stroke_number"
    df_cf['stroke_number']=df_cf['drawing'].str.len()

    #create feature "final_time"
    df_cf['final_time'] = [df_cf.loc[index,'drawing']\
                [df_cf.stroke_number[index]-1][2][-1] for index in df_cf.index]

    #setting boolean and changing recognized features to 1 and 0.
    b_loon = {True: 1, False:0}
    df_cf['recognized'] = df_cf['recognized'].map(b_loon)

    #filtered data by stroke number, recognized and final time features
    df_cf = df_cf[(df_cf['recognized']==1) & (df_cf['stroke_number'] <= 15)]
    df_cf = df_cf[(df_cf['final_time']<=20000)]
    return df_cf
Project: quickdraw_prediction_model    Author: keisukeirie
def feature_eng_pt4(df_cf):
    '''
    function:
    - feature engineering pt4
      creates a new dataframe that needs to be combined with the output
      dataframe of feature_eng_pt3
    - it creates 5 features per stroke,
      for the first 15 strokes of an image.

    - creates the following features:
      datapoint_percentage_stroke'i' = number of data points in stroke i divided by
                            the total number of data points of the image [float]
            * do not confuse with the dp_percent_per_stroke column made earlier:
              dp_percent_per_stroke is a list, datapoint_percentage_stroke'i' is a float!

      direction_stroke'i' = direction of stroke 'i' [float]

      time_stroke'i' = total time spent on stroke 'i' [int]

      datapoints_stroke'i' = number of data points in stroke 'i' [int]

      switch_stroke'i' = flag indicating whether stroke 'i' exists in the image
                            0: stroke exists, 1: stroke does not exist [int]

    input:
      df_cf = output dataframe from feature_eng_pt3

    output:
      new dataframe with 75 features (5 * 15 features)
    '''

    ar = np.zeros((len(df_cf),75))
    c = 0
    for index_ in df_cf.index:
        stroke = (df_cf.stroke_number[index_])
        ar[c][:stroke] = np.array(df_cf['dp_percent_per_stroke'][index_])
        ar[c][15:15+stroke] = np.array(df_cf['direction'][index_])
        ar[c][30:30+stroke] = np.array(df_cf['total_time_of_stroke'][index_])
        ar[c][45:45+stroke] = np.array(df_cf['dp_per_stroke'][index_])
        ar[c][60:75] = np.array([0]*stroke+[1]*(15-stroke))
        c += 1
    subset = pd.DataFrame(ar)
    subset.index = df_cf.index
    for num in range(15):
        subset = subset.rename(columns={num: "datapoint_percentage_stroke{}".format(num)})
    for num in range(15, 30):
        subset = subset.rename(columns={num: "direction_stroke{}".format(num-15)})
    for num in range(30, 45):
        subset = subset.rename(columns={num: "time_stroke{}".format(num-30)})
    for num in range(45, 60):
        subset = subset.rename(columns={num: "datapoint_stroke{}".format(num-45)})
    for num in range(60, 75):
        subset = subset.rename(columns={num: "switch_stroke{}".format(num-60)})
    return subset
Project: chxanalys    Author: yugangzhang
def extract_data_from_file(filename, filepath, good_line_pattern, good_cols=None, labels=None):
    '''YG, developed Oct 17, 2018
        Extract data from a file
    Input:
        filename: str, filename of the data
        filepath: str, path of the data
        good_line_pattern: str, data will be extracted below this pattern
        good_cols: list of integers, indices of the columns to keep
        labels: the labels of the good_cols
        #save: False, if True will save the data into a csv file with filename appending csv ??
    Return:
        a pandas.DataFrame
    Example:
    filepath = '/XF11ID/analysis/2017_3/lwiegart/Link_files/Exports/'
    filename = 'ANPES2 15-10-17 16-31-11-84Exported.txt'
    good_cols = [1, 2, 4, 6, 8, 10]
    labels = ['time', 'temperature', 'force', 'distance', 'stress', 'strain']
    good_line_pattern = "Index\tX\tY\tX\tY\tX\tY"
    df = extract_data_from_file(filename, filepath, good_line_pattern, good_cols, labels)
    '''
    import pandas as pds
    with open(filepath + filename, 'r') as fin:
        p = fin.readlines()
        di = 1e20                      # sentinel: index of the line matching good_line_pattern
        for i, line in enumerate(p):
            if good_line_pattern in line:
                di = i
            if i == di + 1:            # first data line: initialize the data array
                els = line.split()
                if good_cols is None:
                    data = np.array(els, dtype=float)
                else:
                    data = np.array([els[j] for j in good_cols], dtype=float)
            elif i > di:               # subsequent data lines: stack onto the array
                try:
                    els = line.split()
                    if good_cols is None:
                        temp = np.array(els, dtype=float)
                    else:
                        temp = np.array([els[j] for j in good_cols], dtype=float)
                    data = np.vstack((data, temp))
                except Exception:
                    pass
        if labels is None:
            labels = np.arange(data.shape[1])
        df = pds.DataFrame(data, index=np.arange(data.shape[0]), columns=labels)
    return df