Python pandas module: to_numeric() example source code

We have extracted the following 50 code examples from open-source Python projects to illustrate how to use pandas.to_numeric().
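
Before diving into the project snippets, here is a minimal standalone sketch of the two patterns that recur throughout this page: converting a single column with errors='coerce' (unparseable values become NaN) and converting a whole DataFrame via DataFrame.apply. The data and column names below are invented purely for illustration.

import pandas as pd

df = pd.DataFrame({'price': ['1.5', '2.0', 'n/a'], 'qty': ['3', '4', '5']})

# Convert one column; strings that cannot be parsed become NaN instead of raising an error
df['price'] = pd.to_numeric(df['price'], errors='coerce')

# Try to convert every column; errors='ignore' leaves non-convertible columns
# unchanged (note that this option is deprecated in recent pandas releases)
df = df.apply(pd.to_numeric, errors='ignore')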

Project: TADPOLE | Author: noxtoby
def checkFSXvalsAgainstADNIMERGE(tadpoleDF, mriADNI1FileFSX, otherSSvisCodeStr, ssNameTag,
                                 ignoreMissingCols = False):
  nrRows, nrCols = tadpoleDF.shape
  colListOtherSS = list(ssDF.columns.values)
  colListTadpoleDF = list(tadpoleDF.columns.values)

  tadpoleDF[['Hippocampus', 'ST29SV%s' % ssNameTag, 'ST88SV%s' % ssNameTag]] = \
    tadpoleDF[['Hippocampus', 'ST29SV%s' % ssNameTag, 'ST88SV%s' % ssNameTag]].apply(pd.to_numeric, errors='coerce')


  tadpoleDF['HIPPOSUM'] = tadpoleDF['ST29SV%s' % ssNameTag] + tadpoleDF['ST88SV%s' % ssNameTag]
  for r in range(nrRows):

    valsNan = np.isnan(tadpoleDF['Hippocampus'][r]) or (np.isnan(tadpoleDF['ST29SV%s' % ssNameTag][r]) and \
                 np.isnan(tadpoleDF['ST88SV%s' % ssNameTag][r]))
    if valsNan:
      continue

    valsNotEq = tadpoleDF['Hippocampus'][r] != (tadpoleDF['ST29SV%s' % ssNameTag][r] + tadpoleDF['ST88SV%s' % ssNameTag][r])
    if valsNotEq:
      print('entries dont match\n ', tadpoleDF[['RID','VISCODE', 'Hippocampus', 'ST29SV%s' % ssNameTag,\
        'ST88SV%s' % ssNameTag, 'HIPPOSUM']].iloc[r])

  # Conclusion: the reason why entries above don't match is because UCSFFSX has duplicate entries for the same subject and viscode.
Project: actions-for-actions | Author: gsig
def load_submission(self, submission_file):
        loc_submission = pd.read_csv(submission_file, header=None)
        build_proc_sub = loc_submission[0].str.split(' ').values.tolist()
        assert len(build_proc_sub[0]) == self.n_classes + len(self.submission_columns)
        proc_sub = pd.DataFrame.from_records(build_proc_sub, columns=[self.submission_columns + list(range(self.n_classes))])
        if self.subset is not None:
            if type(proc_sub['frame_id'].values[0]) is np.ndarray:
                mask = [True if x[0] in self.subset else False for x in proc_sub['frame_id'].values]
            else:
                # old pandas version
                mask = [True if x in self.subset else False for x in proc_sub['frame_id'].values]
            proc_sub = proc_sub[mask]
            assert np.any(np.array(mask))
        num_proc_sub = proc_sub.apply(pd.to_numeric, errors='ignore')
        grouped_by_vid = num_proc_sub
        self.submission = grouped_by_vid
Project: betterself | Author: jeffshek
def build_dataframe(self):
        if not self.values.exists():
            return pd.DataFrame()

        # Am I really a programmer or just a lego assembler?
        # Pandas makes my life at least 20 times easier.
        df = pd.DataFrame.from_records(self.values, index=self.index_column)

        # make the columns and labels prettier
        if self.rename_columns:
            df = df.rename(columns=self.column_mapping)

        df.index.name = TIME_COLUMN_NAME
        try:
            df.index = df.index.tz_convert(self.user.pytz_timezone)
        except AttributeError:
            # if attribute-error means the index is just a regular Index and
            # that only dates (and not time) was passed
            df.index = pd.DatetimeIndex(df.index, tz=self.user.pytz_timezone)

        # cast it as numerics if possible, otherwise if we're dealing with strings, ignore
        df = df.apply(pd.to_numeric, errors='ignore')

        return df
Project: pyprocessmacro | Author: QuentinAndre
def _cond_ind_effects_wrapper(self):
        """
        A wrapper for the conditional indirect effects.
        :return: pd.DataFrame
            A DataFrame of effects, se, llci, and ulci, for the conditional indirect effects.
        """
        symb_to_var = self._symb_to_var
        results = self.estimation_results
        rows_stats = np.array([results["effect"], results["se"], results["llci"], results["ulci"]]).T
        cols_stats = ["Effect", "Boot SE", "BootLLCI", "BootULCI"]

        mod_values = self._moderators_values
        med_values = [[symb_to_var.get('m{}'.format(i + 1), 'm{}'.format(i + 1)) for i in range(self._n_meds)]]
        values = med_values + mod_values

        rows_levels = np.array([i for i in product(*values)])
        cols_levels = ["Mediator"] + [symb_to_var.get(x, x) for x in self._moderators_symb]

        rows = np.concatenate([rows_levels, rows_stats], axis=1)
        cols = cols_levels + cols_stats
        df = pd.DataFrame(rows, columns=cols, index=[""] * rows.shape[0])
        return df.apply(pd.to_numeric, args=["ignore"])
Project: pyprocessmacro | Author: QuentinAndre
def _simple_ind_effects_wrapper(self):
        """
        A wrapper for the indirect effects (and for total/contrast effects if specified)
        :return: pd.DataFrame
            A DataFrame of effects, se, llci, and ulci, for the simple/total/contrasts of indirect effects.
        """
        symb_to_var = self._symb_to_var
        results = self.estimation_results
        rows_stats = np.array([results["effect"], results["se"], results["llci"], results["ulci"]]).T

        med_names = [symb_to_var.get('m{}'.format(i + 1), 'm{}'.format(i + 1)) for i in range(self._n_meds)]
        rows_levels = []
        if self._options["total"]:
            rows_levels += ["TOTAL"]
        rows_levels += med_names
        if self._options["contrast"]:
            contrasts = ["Contrast: {} vs. {}".format(a, b) for a, b in combinations(med_names, 2)]
            rows_levels += contrasts
        rows_levels = np.array(rows_levels).reshape(-1, 1)

        rows = np.concatenate([rows_levels, rows_stats], axis=1)
        cols = ["", "Effect", "Boot SE", "BootLLCI", "BootULCI"]
        df = pd.DataFrame(rows, columns=cols, index=[""] * rows.shape[0])
        return df.apply(pd.to_numeric, args=["ignore"])
Project: pyprocessmacro | Author: QuentinAndre
def _PMM_index_wrapper(self):
        """
        A wrapper for the Partial Moderated Mediation index.
        :return: pd.DataFrame
            A DataFrame of effects, se, llci, and ulci, for the PMM index.
        """
        symb_to_var = self._symb_to_var
        results = self._PMM_index()
        rows_stats = np.array([results["effect"], results["se"], results["llci"], results["ulci"]]).T
        cols_stats = ["Index", "Boot SE", "LLCI", "ULCI"]

        mod_names = [[symb_to_var.get(i, i) for i in self._moderators_symb]]
        med_names = [[symb_to_var.get('m{}'.format(i + 1), 'm{}'.format(i + 1)) for i in range(self._n_meds)]]
        values = mod_names + med_names
        rows_levels = np.array([i for i in product(*values)])
        cols_levels = ["Moderator", "Mediator"]

        rows = np.concatenate([rows_levels, rows_stats], axis=1)
        cols = cols_levels + cols_stats
        df = pd.DataFrame(rows, columns=cols, index=[""] * rows.shape[0])
        return df.apply(pd.to_numeric, args=["ignore"])
Project: pyprocessmacro | Author: QuentinAndre
def _MMM_index_wrapper(self):
        """
        A wrapper for the Moderated Moderated Mediation index.
        :return: pd.DataFrame
            A DataFrame of effects, se, llci, and ulci, for the CMM index.
        """
        symb_to_var = self._symb_to_var
        results = self._MMM_index()
        rows_stats = np.array([results["effect"], results["se"], results["llci"], results["ulci"]]).T
        cols_stats = ["Index", "Boot SE", "BootLLCI", "BootULCI"]

        med_names = [[symb_to_var.get('m{}'.format(i + 1), 'm{}'.format(i + 1)) for i in range(self._n_meds)]]
        rows_levels = np.array([i for i in product(*med_names)])
        cols_levels = ["Mediator"]

        rows = np.concatenate([rows_levels, rows_stats], axis=1)
        cols = cols_levels + cols_stats
        df = pd.DataFrame(rows, columns=cols, index=[""] * rows.shape[0])
        return df.apply(pd.to_numeric, args=["ignore"])
Project: ML_algorithm | Author: luoshao23
def clean_data(DT_df, attributes):
    """data preprocessing"""
    # DT_df = DT_df.drop(drop_cols, axis=1)
    DT_df["fs_scan_amt_pre"] = DT_df["fs_scan_amt_pre"].astype(float)
    DT_df["fs_scan_amt_pos"] = DT_df["fs_scan_amt_pos"].astype(float)
    DT_df["fs_scan_amt_pos_PF"] = DT_df["fs_scan_amt_pos_PF"].astype(float)
    DT_df["dyn_margin_amt_pre"] = DT_df["dyn_margin_amt_pre"].astype(float)
    DT_df["dyn_margin_amt_pos"] = DT_df["dyn_margin_amt_pos"].astype(float)
    DT_df["dyn_margin_amt_pos_PF"] = DT_df[
        "dyn_margin_amt_pos_PF"].astype(float)
    DT_df["ctl_grp_ind"] = DT_df["ctl_grp_ind"].astype(int)
    DT_df["mailer_version_id"] = DT_df["mailer_version_id"].astype(int)
    DT_df["tcm_redeem_md"] = pd.to_numeric(DT_df["tcm_redeem_md"])
    for attr in attributes:
        DT_df[attr] = DT_df[attr].astype(int)

    fields = attributes + ["fs_scan_amt_pre", "fs_scan_amt_pos", "fs_scan_amt_pos_PF", "dyn_margin_amt_pre", "dyn_margin_amt_pos", "dyn_margin_amt_pos_PF",
                           "ctl_grp_ind", "mailer_version_id", "tcm_redeem_md", "xtra_card_nbr"]
    DT_df = DT_df[fields]
    return DT_df
Project: atropos | Author: jdidion
def _get_table(self, column, is_size=True):
        cols = list(range(5))
        cols.append(self.header.index(column))
        header = [self.header[c] for c in cols]
        rows = [
            [row[c] for c in cols]
            for row in self.rows
        ]
        if is_size:
            for row in rows:
                row[5] = parse_size(row[5])
        table = pd.DataFrame.from_records(rows, columns=header)
        table = table.rename(columns={ 
            'prog' : 'Program',
            'prog2' : 'Program2',
            'threads' : 'Threads',
            'dataset' : 'Dataset',
            'qcut' : 'Quality',
        })
        table['Threads'] = pd.to_numeric(table['Threads'])
        table['Dataset'] = pd.Categorical(table['Dataset'])
        table['Program'] = pd.Categorical(table['Program'])
        table['Program2'] = pd.Categorical(table['Program2'])
        return table
Project: gullikson-scripts | Author: kgullikson88
def __init__(self, filename=TABLE_FILENAME):
        MS = SpectralTypeRelations.MainSequence()

        # Read in the table.
        colspecs=[[0,7], [7,14], [14,21], [21,28], [28,34], [34,40], [40,47], [47,55],
                  [55,63], [63,70], [70,78], [78,86], [86,94], [94,103], [103,110],
                  [110,116], [116,122], [122,130], [130,137], [137,144], [144,151],
                  [151,158]]
        mam_df = pd.read_fwf(filename, header=20, colspecs=colspecs, na_values=['...'])[:92]

        # Strip the * from the logAge column. Probably shouldn't but...
        mam_df['logAge'] = mam_df['logAge'].map(lambda s: s.strip('*') if isinstance(s, basestring) else s)

        # Convert everything to floats
        for col in mam_df.columns:
            mam_df[col] = pd.to_numeric(mam_df[col], errors='ignore')

        # Add the spectral type number for interpolation
        mam_df['SpTNum'] = mam_df['SpT'].map(MS.SpT_To_Number)

        self.mam_df = mam_df
Project: Comparative-Annotation-Toolkit | Author: ComparativeGenomicsToolkit
def load_metrics_from_db(db_path, tx_mode, aln_mode):
    """
    Loads the alignment metrics for the mRNA/CDS alignments of transMap/AugustusTM/TMR
    """
    session = tools.sqlInterface.start_session(db_path)
    metrics_table = tools.sqlInterface.tables[aln_mode][tx_mode]['metrics']
    metrics_df = tools.sqlInterface.load_metrics(metrics_table, session)
    # unstack flattens the long-form data structure
    metrics_df = metrics_df.set_index(['AlignmentId', 'classifier']).unstack('classifier')
    metrics_df.columns = [col[1] for col in metrics_df.columns]
    metrics_df = metrics_df.reset_index()
    cols = ['AlnCoverage', 'AlnGoodness', 'AlnIdentity', 'PercentUnknownBases']
    metrics_df[cols] = metrics_df[cols].apply(pd.to_numeric)
    metrics_df['OriginalIntrons'] = metrics_df['OriginalIntrons'].fillna('')
    metrics_df['OriginalIntrons'] = [list(map(int, x)) if len(x[0]) > 0 else [] for x in
                                     metrics_df['OriginalIntrons'].str.split(',').tolist()]
    metrics_df['OriginalIntronsPercent'] = metrics_df['OriginalIntrons'].apply(calculate_vector_support, resolve_nan=1)
    session.close()
    return metrics_df
Project: MAP-IT | Author: alexmarder
def create_routing_table(bgp=None, ixp_prefixes=None, ixp_asns=None, bgp_compression='infer'):
    log.info('Creating IP2AS tool.')
    if bgp_compression == 'infer' and bgp.startswith('http'):
        bgp_compression = infer_compression(bgp, 'infer')
    if not isinstance(ixp_prefixes, pd.DataFrame):
        ixp_prefixes = set(pd.read_csv(ixp_prefixes, comment='#', index_col=0).index.unique()) if ixp_prefixes is not None else set()
    if not isinstance(ixp_asns, pd.DataFrame):
        ixp_asns = set(pd.read_csv(ixp_asns, comment='#', index_col=0).index.unique()) if ixp_asns is not None else set()
    if not isinstance(bgp, pd.DataFrame):
        bgp_original = pd.read_table(bgp, comment='#', names=['Address', 'Prefixlen', 'ASN'], compression=bgp_compression)
        bgp = bgp_original[~bgp_original.ASN.str.contains(',|_')].copy()
        bgp['ASN'] = pd.to_numeric(bgp.ASN)
    rt = RoutingTable()
    for address, prefixlen, asn in bgp[~bgp.ASN.isin(ixp_asns)].itertuples(index=False):
        rt.add_prefix(asn.item(), address, prefixlen)
    for address, prefixlen, asn in bgp[bgp.ASN.isin(ixp_asns)].itertuples(index=False):
        rt.add_ixp(address, prefixlen)
    for prefix in ixp_prefixes:
        rt.add_ixp(prefix)
    rt.add_private()
    rt.add_multicast()
    rt.add_default()
    return rt
Project: cmapPy | Author: cmap
def assemble_row_metadata(full_df, num_col_metadata, num_data_rows, num_row_metadata):
    # Extract values
    row_metadata_row_inds = range(num_col_metadata + 1, num_col_metadata + num_data_rows + 1)
    row_metadata_col_inds = range(1, num_row_metadata + 1)
    row_metadata = full_df.iloc[row_metadata_row_inds, row_metadata_col_inds]

    # Create index from the first column of full_df (after the filler block)
    row_metadata.index = full_df.iloc[row_metadata_row_inds, 0]

    # Create columns from the top row of full_df (before cids start)
    row_metadata.columns = full_df.iloc[0, row_metadata_col_inds]

    # Rename the index name and columns name
    row_metadata.index.name = row_index_name
    row_metadata.columns.name = row_header_name

    # Convert metadata to numeric if possible
    row_metadata = row_metadata.apply(lambda x: pd.to_numeric(x, errors="ignore"))

    return row_metadata
Project: cmapPy | Author: cmap
def assemble_col_metadata(full_df, num_col_metadata, num_row_metadata, num_data_cols):

    # Extract values
    col_metadata_row_inds = range(1, num_col_metadata + 1)
    col_metadata_col_inds = range(num_row_metadata + 1, num_row_metadata + num_data_cols + 1)
    col_metadata = full_df.iloc[col_metadata_row_inds, col_metadata_col_inds]

    # Transpose so that samples are the rows and headers are the columns
    col_metadata = col_metadata.T

    # Create index from the top row of full_df (after the filler block)
    col_metadata.index = full_df.iloc[0, col_metadata_col_inds]

    # Create columns from the first column of full_df (before rids start)
    col_metadata.columns = full_df.iloc[col_metadata_row_inds, 0]

    # Rename the index name and columns name
    col_metadata.index.name = column_index_name
    col_metadata.columns.name = column_header_name

    # Convert metadata to numeric if possible
    col_metadata = col_metadata.apply(lambda x: pd.to_numeric(x, errors="ignore"))

    return col_metadata
Project: pyiem | Author: rheineke
def _orderbook_tag_frame(text):
    # This function can be removed if this pandas feature request is implemented
    # https://github.com/pandas-dev/pandas/issues/14608
    table_str = _table_text(text)
    root = etree.fromstring(table_str)
    table_body = root.find('tbody')
    index = []
    data = defaultdict(list)
    # Iterator of tr objects
    qty_path = "td[@class='change-cell quantity']"
    tr_iter = table_body.iter(tag='tr')
    for tr in tr_iter:
        index.append(tr.find(path='td').text.strip())
        # Quantity Held
        pos = pd.to_numeric(tr.find(path=qty_path).attrib['value'])
        data[iem.QUANTITY_HELD].append(pos)
        # Your Bids
        data[iem.YOUR_BIDS].append(_num_open_orders(tr, 'yourBidsCell'))
        # Your Asks
        data[iem.YOUR_ASKS].append(_num_open_orders(tr, 'yourAsksCell'))

    return pd.DataFrame(data=data, index=index)
Project: memex_ad_features | Author: giantoak
def apply_ht_scores(dataframe):
    # Load the ht score dataframe
    ht_scores = pandas.read_csv('{0}ht_scores.csv'.format(config['result_data']), index_col=0)
    dataframe['phone'] = dataframe['phone'].map(lambda x: re.sub('[^0-9]', '', str(x)))
    # Make the column a numeric column for merging
    dataframe['phone'] = pandas.to_numeric(dataframe['phone'])
    final = dataframe.merge(ht_scores, how='left', left_on='phone', right_index=True)

    # Drop the content column and drop the index column
    final.drop('content', axis=1, inplace=True)

    if os.path.isfile('{0}ad_chars_final.csv'.format(config['result_data'])):
        lock.acquire()
        print 'lock has been set for file {0}'.format(file)
        final.to_csv('{0}ad_chars_final.csv'.format(config['result_data']), mode='a', header=False, encoding='utf-8')
        lock.release()
        print 'lock has been released for file {0}'.format(file)
    else:
        final.to_csv('{0}ad_chars_final.csv'.format(config['result_data']), header=True, encoding='utf-8')
Project: memex_ad_features | Author: giantoak
def apply_ht_scores(dataframe):
    # Load the ht score dataframe
    ht_scores = pandas.read_csv('{0}ht_scores.csv'.format(config['result_data']), index_col=0)
    dataframe['phone'] = dataframe['phone'].map(lambda x: re.sub('[^0-9]', '', str(x)))
    # Make the column a numeric column for merging
    #dataframe['phone'] = pandas.to_numeric(dataframe['phone'])
    final = dataframe.merge(ht_scores, how='left', left_on='phone', right_index=True)

    # Drop the content column and drop the index column
    final.drop('content', axis=1, inplace=True)

    if os.path.isfile('{0}ad_chars_final.csv'.format(config['result_data'])):
        lock.acquire()
        print 'lock has been set for file {0}'.format(file)
        final.to_csv('{0}ad_chars_final.csv'.format(config['result_data']), mode='a', header=False, encoding='utf-8', index=False)
        lock.release()
    else:
        final.to_csv('{0}ad_chars_final.csv'.format(config['result_data']), header=True, encoding='utf-8', index=False)
Project: PIEFACE | Author: jcumby
def makeDataFrame(phases):
    """ Return Pandas DataFrame object, with CIF files as index and ellipsoid parameters as columns (hierarchical by centre atom)"""

    import pandas as pd
    from pieface.readcoords import Crystal

    if isinstance(phases, dict):
        if isinstance( phases[phases.keys()[0]], Crystal):      # We are reading a dict of Crystals: convert to nested dict first
            alldata = makenesteddict(phases)
        elif isinstance( phases[phases.keys()[0]], dict ):      # Looking at a dict of dicts: assume correct for pandas...
            alldata = phases

        d = dict([ (i, pd.DataFrame(alldata[i]).set_index('files')) for i in alldata.keys() ])        # Make dict of DataFrames

        frame = pd.concat(d, axis=1)

        if len(frame.index) == 1:   # We're looking at a single cif file - unstack DataFrame with atoms as index
            return frame.ix[frame.index[0]].unstack().apply(pd.to_numeric, errors='ignore')        # Need to convert back to float/int when unstacking
        else:
            return frame
    else:
        raise TypeError("Unknown data format for conversion to DataFrame (expected dict)")
Project: chemcoord | Author: mcocdawc
def _return_appropiate_type(self, selected):
        if isinstance(selected, pd.Series):
            frame = pd.DataFrame(selected).T
            if self._required_cols <= set(frame.columns):
                selected = frame.apply(pd.to_numeric, errors='ignore')
            else:
                return selected

        if (isinstance(selected, pd.DataFrame)
                and self._required_cols <= set(selected.columns)):
            molecule = self.__class__(selected)
            molecule.metadata = self.metadata.copy()
            molecule._metadata = copy.deepcopy(self._metadata)
            return molecule
        else:
            return selected
Project: pybroom | Author: tritemio
def _augment_lmfit_modelresult(result):
    """Tidy data values and fitted model from `lmfit.model.ModelResult`.
    """
    columns = ['x', 'data', 'best_fit', 'residual']
    d = pd.DataFrame(index=range(result.ndata), columns=columns)
    for col in columns[1:]:
        d.loc[:, col] = getattr(result, col)

    independent_vars = result.model.independent_vars
    if len(independent_vars) == 1:
        independent_var = independent_vars[0]
    else:
        msg = ('Only 1 independent variable is currently supported.\n'
               'Found independent variables: %s' % str(independent_vars))
        raise NotImplementedError(msg)

    x_array = result.userkws[independent_var]
    d.loc[:, 'x'] = x_array

    if len(result.components) > 1:
        comp_names = [c.name for c in result.components]
        for cname, comp in zip(comp_names, result.components):
            d.loc[:, cname] = comp.eval(x=d.x, **result.values)
    return d.apply(pd.to_numeric, errors='ignore')
Project: equipy | Author: kallinikator
def __init__(self, symbol, *args):
        super().__init__()

        self.data = pd.read_csv(open(r"Stock_Data/{}.csv".format(symbol)))
        self.data = self.data.apply(pd.to_numeric, errors="ignore")
        self.data.index = self.data["Quarter end"]
        self.name = symbol

        if self.data["Price"].dtype in (int, float) and self.data["Cumulative dividends per share"].dtype in (int, float):
            self.data["Value"] = self.data["Price"] + self.data["Cumulative dividends per share"]
            # Calculation of the estimated return
            self.data["Estimated Return"] = self.data["Value"].pct_change()
            # Calculation of the standard deviation
            self.data["Standard Deviation"] = self.data["Value"].std()
        else:
            self.complete_pricelist = False
Project: uscensus | Author: nkrishnaswami
def __call__(self, fields, geo_for, geo_in=None, cache=NopCache()):
        """Special method to make API object invocable.

        Arguments:
          * fields: list of variables to return.
          * geo_* fields must be given as dictionaries, eg:
            `{'county': '*'}`
          * cache: cache in which to store results. Not cached by default.
        """
        params = {
            'get': ','.join(fields),
            'key': self.key,
            'for': self._geo2str(geo_for),
        }
        if geo_in:
            params['in'] = self._geo2str(geo_in)

        j = fetchjson(self.endpoint, cache, self.session, params=params)
        ret = pd.DataFrame(data=j[1:], columns=j[0])
        for field in fields:
            if self.variables[field].get('predicateType') == 'int':
                ret[field] = pd.to_numeric(ret[field])
        return ret
Project: Test-stock-prediction-algorithms | Author: timestocome
def read_data(file_name):

    stock = pd.read_csv(file_name, parse_dates=True, index_col=0)     
    n_samples = len(stock)

    # ditch samples with NAN values
    stock = stock.dropna(axis=0)

    # flip order from newest to oldest to oldest to newest
    #stock = stock.iloc[::-1]

    # trim data
    stock = stock[['Open']]

    # convert object to floats
    stock['Open'] = pd.to_numeric(stock['Open'], errors='coerce')

    # all stock is needed to walk back dates for testing hold out data
    return stock


#############################################################################################
# load and combine stock indexes, matching the dates
Project: erna | Author: fact-project
def get_qstat_as_df():
    """Get the current users output of qstat as a DataFrame.
    """
    user = os.environ.get("USER")
    try:
        ret = subprocess.Popen(
            ["qstat", "-u", str(user)],
            stdout=subprocess.PIPE,
        )
        df = pd.read_csv(ret.stdout, delimiter="\s+")
        # drop the first line since it is just one long line
        df = df.drop(df.index[0]).copy()
        # convert objects to numeric otherwise numbers are strings
        df["JOBID"] = pd.to_numeric(df["job-ID"], errors='coerce')
        # df.set_index("JOBID")
        df = df.drop('job-ID', 1)

    except ValueError:
        logger.exception("No jobs in queues for user {}".format(user))
        df = pd.DataFrame()
    return df
Project: Informed-Finance-Canary | Author: Darthone
def get_data_from_google(ticker_sym, start, end):
    """ Returns a data frame of data for a given stock between two dates """
    url = "https://www.google.com/finance/historical?q=%s&startdate=%s&enddate=%s&output=csv" % (ticker_sym, start, end)
    s = requests.get(url).content
    df = pd.read_csv(io.StringIO(s.decode('utf-8')))
    df['Date'] = pd.to_datetime(df['Date'])
    df['epoch'] = (df['Date'] - datetime(1970,1,1)).dt.total_seconds() * 1000
    df.set_index('Date')
    df['Adj_Close'] = df['Close'] # google's api doesn't provide so just assume it's the same
    cols = ['High', 'Low', 'Volume', 'Open', 'Close', 'Adj_Close']
    for c in cols: # cast columns to numeric
        df[c] = pd.to_numeric(df[c])
    return df.iloc[::-1] # reverse the dataframe so index 0 is the earliest date

#@memoize
#def get_data_for_sym(ticker_sym, start, end):
#    return list(reversed(get_data_for_sym_from_yahoo(ticker_sym, start, end)))
#   #res = StockFeature.select().where(Relationship.from_user == self))
Project: pdVCF | Author: superDross
def calc_AB(vcf):
    ''' Calculate allele balance for all samples in a given 
        pdVCF. Also converts DP & GQ to numeric type.

    Args:
        vcf: pdVCF with genotype information extracted

    Notes:
        ONLY WORKS FOR BIALLELIC VARIANTS
    '''
    sam = vcf.columns.levels[0][0]
    vcf[sam,'DP'] = pd.to_numeric(vcf[sam,'DP'].str.replace('.', '0')) # bcftools places '.' in empty fields
    vcf[sam,'GQ'] = pd.to_numeric(vcf[sam,'GQ'].str.replace('.', '0'))
    AD = vcf.xs('AD', level=1, axis=1).unstack().str.split(",", n=2)
    DP = vcf.xs('DP', level=1, axis=1).unstack()
    AB = round(pd.to_numeric(AD.str[1]) / pd.to_numeric(DP), 2)
    vcf[sam, 'AB'] = AB.tolist()
    return vcf
Project: pastas | Author: pastas
def update_distances(self):
        """
        Calculate the distances between the observed series and the stresses.

        Returns
        -------
        distances: pandas.DataFrame
            pandas dataframe with the distances between the oseries (index)
            and the stresses (columns).

        """
        # Make sure these are values, even when actually objects.
        xo = pd.to_numeric(self.oseries.x)
        xt = pd.to_numeric(self.stresses.x)
        yo = pd.to_numeric(self.oseries.y)
        yt = pd.to_numeric(self.stresses.y)

        xh, xi = np.meshgrid(xt, xo)
        yh, yi = np.meshgrid(yt, yo)

        self.distances = pd.DataFrame(np.sqrt((xh - xi) ** 2 + (yh - yi) ** 2),
                                      index=self.oseries.index,
                                      columns=self.stresses.index)
Project: soundDB | Author: gjoseph92
def parse(self, entry):      
        data = pd.read_csv(str(entry),
                           engine= "c",
                           sep= "\t",
                           parse_dates= False,
                           index_col= [0, 1])

        data.index.names = ["date", "srcid"]

        # Check for AMT bug that adds row of ('nvsplDate', 'Total_All') with all 0s, drop if exists
        if data.index[-1][0] == 'nvsplDate':
            data = data.iloc[:-1, :]

        ## Pandas cannot seem to handle a MultiIndex with dates;
        ## slicing syntax becomes even crazier, and often doesn't even work.
        ## So date conversion is disabled for now.

        # # Convert dates
        # datetimes = data.index.get_level_values('date').to_datetime()
        # data.index.set_levels(datetimes, level= 'date', inplace= True)

        # Ensure MultiIndex sortedness
        data.sortlevel(inplace= True)

        return data.apply(pd.to_numeric, raw= True, errors= "coerce")
Project: NeoAnalysis | Author: neoanalysis
def to_numeric(self,columns):
        '''
        Args
            columns (string or list):
                column names needed to be converted
        Returns
            -
        '''
        if isinstance(columns,str):
            self.data_df[columns] = pd.to_numeric(self.data_df[columns],errors='coerce')
        elif isinstance(columns,list):
            for column in columns:
                self.data_df[column] = pd.to_numeric(self.data_df[column],errors='coerce')

    # rename certain columns
Project: NeoAnalysis | Author: neoanalysis
def to_numeric(self,columns):
        '''
        Args
            columns (string or list):
                column names needed to be converted
        Returns
            -
        '''
        if isinstance(columns,str):
            self.data_df[columns] = pd.to_numeric(self.data_df[columns],errors='coerce')
        elif isinstance(columns,list):
            for column in columns:
                self.data_df[column] = pd.to_numeric(self.data_df[column],errors='coerce')

    # rename certain columns
Project: ssbio | Author: SBRG
def df_coach_bsites(self):
        df_cols = ['site_num', 'c_score', 'cluster_size', 'algorithm',
                   'pdb_template_id', 'pdb_template_chain', 'pdb_ligand',
                   'binding_location_coords', 'c_score_method', 'binding_residues',
                   'ligand_cluster_counts']

        bsites_inf_df = pd.DataFrame.from_records(self.coach_bsites, columns=df_cols).drop_duplicates().reset_index(drop=True)

        if bsites_inf_df.empty:
            log.warning('Empty dataframe')
            return bsites_inf_df
        else:
            bsites_inf_df['c_score'] = pd.to_numeric(bsites_inf_df.c_score, errors='coerce')
            bsites_inf_df['cluster_size'] = pd.to_numeric(bsites_inf_df.cluster_size, errors='coerce')
            return ssbio.utils.clean_df(bsites_inf_df)
Project: ssbio | Author: SBRG
def df_coach_go(self):
        cols = ['go_id', 'go_term', 'c_score']

        go_all_df = pd.DataFrame()

        for go_list in [self.coach_go_mf, self.coach_go_cc, self.coach_go_bp]:
            go_df = pd.DataFrame.from_records(go_list, columns=cols).drop_duplicates().reset_index(drop=True)
            go_df['c_score'] = pd.to_numeric(go_df.c_score, errors='coerce')

            if go_all_df.empty:
                go_all_df = go_df
            else:
                go_all_df.append(go_df)

        return go_all_df
Project: ssbio | Author: SBRG
def parse_coach_ec_df(infile):
    """Parse the EC.dat output file of COACH and return a dataframe of results

    EC.dat contains the predicted EC number and active residues.
        The columns are: PDB_ID, TM-score, RMSD, Sequence identity,
        Coverage, Confidence score, EC number, and Active site residues

    Args:
        infile (str): Path to EC.dat

    Returns:
        DataFrame: Pandas DataFrame summarizing EC number predictions

    """

    ec_df = pd.read_table(infile, delim_whitespace=True,
                          names=['pdb_template', 'tm_score', 'rmsd', 'seq_ident', 'seq_coverage',
                                 'c_score', 'ec_number', 'binding_residues'])

    ec_df['pdb_template_id'] = ec_df['pdb_template'].apply(lambda x: x[:4])
    ec_df['pdb_template_chain'] = ec_df['pdb_template'].apply(lambda x: x[4])

    ec_df = ec_df[['pdb_template_id', 'pdb_template_chain', 'tm_score', 'rmsd',
                   'seq_ident', 'seq_coverage', 'c_score', 'ec_number', 'binding_residues']]
    ec_df['c_score'] = pd.to_numeric(ec_df.c_score, errors='coerce')

    return ec_df
Project: pydov | Author: DOV-Vlaanderen
def _get_peilmetingen_df(self):
        """"""
        doc_df = pd.DataFrame(list(self.get_peilmetingen()),
                              columns=["grondwaterlocatie",
                                       "filternummer",
                                       "datum",
                                       "diepte",
                                       "methode",
                                       "betrouwbaarheid"])
        doc_df["datum"] = pd.to_datetime(doc_df["datum"])
        doc_df["diepte"] = pd.to_numeric(doc_df["diepte"])
        doc_df = doc_df.set_index("datum")
        return doc_df
Project: pydov | Author: DOV-Vlaanderen
def _get_observaties_df(self):
        """"""
        doc_df = pd.DataFrame(list(self.get_observaties()),
                              columns=["grondwaterlocatie",
                                       "filternummer",
                                       "monsternummer",
                                       "datum",
                                       "parameter",
                                       "waarde",
                                       "eenheid",
                                       "betrouwbaarheid"])
        doc_df["datum"] = pd.to_datetime(doc_df["datum"])
        doc_df["waarde"] = pd.to_numeric(doc_df["waarde"])
        return doc_df
Project: pauvre | Author: conchoecia
def filter_fastq_length_meanqual(df, min_len, max_len,
                                 min_mqual, max_mqual):
    querystring = "length >= {0} and meanQual >= {1}".format(min_len, min_mqual)
    if max_len != None:
        querystring += " and length <= {}".format(max_len)
    if max_mqual != None:
        querystring += " and meanQual <= {}".format(max_mqual)
    print("Keeping reads that satisfy: {}".format(querystring), file=stderr)
    filtdf = df.query(querystring)
    #filtdf["length"] = pd.to_numeric(filtdf["length"], errors='coerce')
    #filtdf["meanQual"] = pd.to_numeric(filtdf["meanQual"], errors='coerce')
    return filtdf
Project: q2-diversity | Author: qiime2
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # convert metadata to numeric values where applicable, drop the non-numeric
    # values, and then drop samples that contain NaNs
    df = metadata.to_dataframe()
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))

    # filter categorical columns
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'filtered_categorical_cols': ', '.join(filtered_categorical_cols),
        'filtered_zero_variance_cols': ', '.join(filtered_zero_variance_cols),
        'result': result})
Project: quantrocket-client | Author: quantrocket-llc
def from_csv(cls, filepath_or_buffer):

        # Import pandas lazily since it can take a moment to import
        try:
            import pandas as pd
        except ImportError:
            raise ImportError("pandas must be installed to use ZiplineBacktestResult")

        zipline_result = cls()

        results = pd.read_csv(
            filepath_or_buffer,
            parse_dates=["date"],
            index_col=["dataframe", "index", "date", "column"])["value"]

        # Extract returns
        returns = results.loc["returns"].unstack()
        returns.index = returns.index.droplevel(0).tz_localize("UTC")
        zipline_result.returns = returns["returns"].astype(float)

        # Extract positions
        positions = results.loc["positions"].unstack()
        positions.index = positions.index.droplevel(0).tz_localize("UTC")
        zipline_result.positions = positions.astype(float)

        # Extract transactions
        transactions = results.loc["transactions"].unstack()
        transactions.index = transactions.index.droplevel(0).tz_localize("UTC")
        zipline_result.transactions = transactions.apply(pd.to_numeric, errors='ignore')

        # Extract benchmark returns
        benchmark_returns = results.loc["benchmark"].unstack()
        benchmark_returns.index = benchmark_returns.index.droplevel(0).tz_localize("UTC")
        zipline_result.benchmark_returns = benchmark_returns["benchmark"].astype(float)

        # Extract performance dataframe
        perf = results.loc["perf"].unstack()
        perf.index = perf.index.droplevel(0).tz_localize("UTC")
        zipline_result.perf = perf.apply(pd.to_numeric, errors='ignore')

        return zipline_result
Project: Python-Scripts-Repo-on-Data-Science | Author: qalhata
def check_null_or_valid(row_data):
    """Function that takes a row of data,
    drops all missing values,
    and checks if all remaining values are greater than or equal to 0
    """
    no_na = row_data.dropna()[1:-1]
    numeric = pd.to_numeric(no_na)
    ge0 = numeric >= 0
    return ge0

# Check whether the first column is 'Life expectancy'
Project: desert-mirage | Author: valentour
def eliminate_invalids(df, cols):
    """Eliminate invalid data in ``cols`` of ``df``."""
    numdf = df.drop(cols, axis=1).join(df[cols].apply(pd.to_numeric,
                                                      errors='coerce'))
    numdf = numdf[~numdf[cols].isnull().apply(np.any, axis=1)]
    return numdf
Project: desert-mirage | Author: valentour
def partial_convert_only_numerics(df):
    """Convert ``df`` numeric cols and try to coerce any errors encountered."""
    col_dict = df_cols_by_type(df)
    partial_convert = partial(pd.to_numeric, errors='coerce')
    df[col_dict['numeric']].apply(partial_convert)
    return df

# Useful one-liners.
# df.select_dtypes(include=['bool'])
# list(df.select_dtypes(include=['bool']).columns)
Project: PyOnSSET | Author: KTH-dESA
def condition_df(self):
        """
        Do any initial data conditioning that may be required.
        """

        logging.info('Ensure that columns that are supposed to be numeric are numeric')
        self.df[SET_GHI] = pd.to_numeric(self.df[SET_GHI], errors='coerce')
        self.df[SET_WINDVEL] = pd.to_numeric(self.df[SET_WINDVEL], errors='coerce')
        self.df[SET_NIGHT_LIGHTS] = pd.to_numeric(self.df[SET_NIGHT_LIGHTS], errors='coerce')
        self.df[SET_ELEVATION] = pd.to_numeric(self.df[SET_ELEVATION], errors='coerce')
        self.df[SET_SLOPE] = pd.to_numeric(self.df[SET_SLOPE], errors='coerce')
        self.df[SET_LAND_COVER] = pd.to_numeric(self.df[SET_LAND_COVER], errors='coerce')
        self.df[SET_GRID_DIST_CURRENT] = pd.to_numeric(self.df[SET_GRID_DIST_CURRENT], errors='coerce')
        self.df[SET_GRID_DIST_PLANNED] = pd.to_numeric(self.df[SET_GRID_DIST_PLANNED], errors='coerce')
        self.df[SET_SUBSTATION_DIST] = pd.to_numeric(self.df[SET_SUBSTATION_DIST], errors='coerce')
        self.df[SET_ROAD_DIST] = pd.to_numeric(self.df[SET_ROAD_DIST], errors='coerce')
        self.df[SET_HYDRO_DIST] = pd.to_numeric(self.df[SET_HYDRO_DIST], errors='coerce')
        self.df[SET_HYDRO] = pd.to_numeric(self.df[SET_HYDRO], errors='coerce')
        self.df[SET_SOLAR_RESTRICTION] = pd.to_numeric(self.df[SET_SOLAR_RESTRICTION], errors='coerce')

        logging.info('Replace null values with zero')
        self.df.fillna(0, inplace=True)

        logging.info('Sort by country, Y and X')
        self.df.sort_values(by=[SET_COUNTRY, SET_Y, SET_X], inplace=True)

        logging.info('Add columns with location in degrees')
        project = Proj('+proj=merc +lon_0=0 +k=1 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs')

        def get_x(row):
            x, y = project(row[SET_X] * 1000, row[SET_Y] * 1000, inverse=True)
            return x

        def get_y(row):
            x, y = project(row[SET_X] * 1000, row[SET_Y] * 1000, inverse=True)
            return y

        self.df[SET_X_DEG] = self.df.apply(get_x, axis=1)
        self.df[SET_Y_DEG] = self.df.apply(get_y, axis=1)
Project: PyOnSSET | Author: KTH-dESA
def condition_df(self):
        """
        Do any initial data conditioning that may be required.
        """

        logging.info('Ensure that columns that are supposed to be numeric are numeric')
        self.df[SET_GHI] = pd.to_numeric(self.df[SET_GHI], errors='coerce')
        self.df[SET_WINDVEL] = pd.to_numeric(self.df[SET_WINDVEL], errors='coerce')
        self.df[SET_NIGHT_LIGHTS] = pd.to_numeric(self.df[SET_NIGHT_LIGHTS], errors='coerce')
        self.df[SET_ELEVATION] = pd.to_numeric(self.df[SET_ELEVATION], errors='coerce')
        self.df[SET_SLOPE] = pd.to_numeric(self.df[SET_SLOPE], errors='coerce')
        self.df[SET_LAND_COVER] = pd.to_numeric(self.df[SET_LAND_COVER], errors='coerce')
        self.df[SET_GRID_DIST_CURRENT] = pd.to_numeric(self.df[SET_GRID_DIST_CURRENT], errors='coerce')
        self.df[SET_GRID_DIST_PLANNED] = pd.to_numeric(self.df[SET_GRID_DIST_PLANNED], errors='coerce')
        self.df[SET_SUBSTATION_DIST] = pd.to_numeric(self.df[SET_SUBSTATION_DIST], errors='coerce')
        self.df[SET_ROAD_DIST] = pd.to_numeric(self.df[SET_ROAD_DIST], errors='coerce')
        self.df[SET_HYDRO_DIST] = pd.to_numeric(self.df[SET_HYDRO_DIST], errors='coerce')
        self.df[SET_HYDRO] = pd.to_numeric(self.df[SET_HYDRO], errors='coerce')
        self.df[SET_SOLAR_RESTRICTION] = pd.to_numeric(self.df[SET_SOLAR_RESTRICTION], errors='coerce')

        logging.info('Replace null values with zero')
        self.df.fillna(0, inplace=True)

        logging.info('Sort by country, Y and X')
        self.df.sort_values(by=[SET_COUNTRY, SET_Y, SET_X], inplace=True)

        logging.info('Add columns with location in degrees')
        project = Proj('+proj=merc +lon_0=0 +k=1 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs')

        def get_x(row):
            x, y = project(row[SET_X] * 1000, row[SET_Y] * 1000, inverse=True)
            return x

        def get_y(row):
            x, y = project(row[SET_X] * 1000, row[SET_Y] * 1000, inverse=True)
            return y

        self.df[SET_X_DEG] = self.df.apply(get_x, axis=1)
        self.df[SET_Y_DEG] = self.df.apply(get_y, axis=1)
Project: VC3D | Author: AlexanderWard1
def saveSlice_CSV(self, outputFilename=outputFilename, xSlice=[], ySlice=[], zSlice=[]):
        """ Take a slice and save it to csv """
        outputFilename += '_slice.csv'

#        # This defines how 'narrow' slice we want. Why am I writing this if ParaView will do it fark
#        tol = 1e-2
#        
#        # Pre allocate empty DF here?
#        slicedData = pd.DataFrame()
#        
#        if not xSlice:
#            # We have some slices along x to make
#            for point in xSlice:
#                # we want to slice at all of these points
#                > xSlice[point] - tol
#            self.flowData.transpose().loc[(self.flowData.transpose()["x"] > 0.599 & self.flowData.transpose()["x"] < 0.601 &  self.flowData.transpose()["z"] == 0), "cf"]
#        elif not ySlice:
#            # Slices along y to take
#        elif not zSlice:
#            # And slices along z

        flowData = self.flowData.apply(pd.to_numeric, errors='ignore')

        slicedData_indices = (flowData["z"] > -0.01) & (flowData["z"] < 0.01)

        slicedData = flowData.loc[slicedData_indices]

        slicedData.to_csv(outputFilename, sep=',', index=0, index_label=0)

        print "Slices saved in", outputFilename
Project: VC3D | Author: AlexanderWard1
def saveSlice_CSV(self, outputFilename=outputFilename, xSlice=[], ySlice=[], zSlice=[]):
        """ Take a slice and save it to csv """
        outputFilename += '_slice.csv'

#        # This defines how 'narrow' slice we want. Why am I writing this if ParaView will do it fark
#        tol = 1e-2
#        
#        # Pre allocate empty DF here?
#        slicedData = pd.DataFrame()
#        
#        if not xSlice:
#            # We have some slices along x to make
#            for point in xSlice:
#                # we want to slice at all of these points
#                > xSlice[point] - tol
#            self.flowData.transpose().loc[(self.flowData.transpose()["x"] > 0.599 & self.flowData.transpose()["x"] < 0.601 &  self.flowData.transpose()["z"] == 0), "cf"]
#        elif not ySlice:
#            # Slices along y to take
#        elif not zSlice:
#            # And slices along z

        flowData = self.flowData.apply(pd.to_numeric, errors='ignore')

        slicedData_indices = (flowData["y"] > 0.598) & (flowData["y"] < 0.602) & (flowData["z"] == 0)

        slicedData = flowData.loc[slicedData_indices]

        slicedData.to_csv(outputFilename, sep=',', index=0, index_label=0)

        print "Slices saved in", outputFilename
Project: singlecell-dash | Author: czbiohub
def maybe_to_numeric(series):
    try:
        return pd.to_numeric(series)
    except ValueError:
        return series
Project: SSieve | Author: davidimprovz
def createPriceHistoryReport(self, stock):
        """
        Calls get10YrPriceHistory() to package a price history report into a PANDAS dataframe, then cleans and returns the data.

        This function will acquire a price history for the provided symbol, which must be a string and a valid stock symbol
        along with the symbol's exchange, e.g., ('MMM', 'NYSE'). The get10YrPriceHistory() function requires the exchange.

        After the data is loaded, the function adds a Symbol field to the price history for tracking in the database, reindexes 
        and renames some fields, properly formats the dates into datetime fields, and converts prices from strings to floats.

        Returns the report as a PANDAS dataframe if successful, otherwise a tuple (False, error message).

        Example Usage: createPriceHistoryReport(('MMM', 'NYSE'))
        """
        try:
            # get the raw data from morningstar    
            price_history = self.get10YrPriceHistory(stock)

            if isinstance(price_history, pd.DataFrame): # the price_history has to exist, or else return the err msg of the function called

                price_history['Symbol'] = stock[0]
                # reorganize header order
                price_history = price_history.reindex(columns=['Symbol','Date','Open','High','Low','Close','Volume'])
                # rename the Date column for easier processing through SQLite's Date functionality
                price_history.rename(columns={'Date':'Reference'}, inplace=True)
                # convert all dates to ISO formatted yyyy-mm-dd strings
                price_history['Reference'] = price_history['Reference'].apply(lambda x: time.strftime("%Y-%m-%d", time.strptime(x, "%m/%d/%Y")))

                # convert volumes to integers # unicode err on ??? value for some volumes goes to NaN

                price_history['Volume'] = pd.to_numeric(price_history['Volume'].str.replace(',',''), errors='coerce')
                # set index b/f db commit so no duplicate numeric index columns
                price_history.set_index(['Symbol'], inplace=True)

            return price_history

        except Exception as e:
            return (False, e)

    # get10YrPriceHistory
    # ******************* #
Project: actions-for-actions | Author: gsig
def load_groundtruth(self):
        gt_labels = pd.read_csv(self.data_path)
        if self.subset is not None:
            mask = [True if x in self.subset else False for x in gt_labels['id'].values]
            gt_labels = gt_labels[mask]
            assert np.any(np.array(mask))
        gt_labels['length'] = pd.to_numeric(gt_labels['length'])
        gt_labels['actions'].fillna('', inplace=True)
        self.gt_labels = gt_labels
Project: georges | Author: chernals
def read_madx_tracking(file):
    """Read a MAD-X Tracking onetable=true file to a dataframe."""
    column_names = ['ID', 'TURN', 'X', 'PX', 'Y', 'PY', 'T', 'PT', 'S', 'E']
    data = pd.read_csv(file, skiprows=MADX_TRACKING_SKIP_ROWS, delim_whitespace=True, names=column_names)
    return data.apply(pd.to_numeric, errors="ignore").dropna()
Project: qiime2 | Author: qiime2
def to_dataframe(self, cast_numeric=False):
        df = self._dataframe.copy()

        if cast_numeric:
            df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))

        return df