Python pandas 模块,Series() 实例源码


项目:zipline-chinese    作者:zhanghan1990    | 项目源码 | 文件源码
def aggregate_ohlcv_panel(self,
        Convert an OHLCV Panel into a DataFrame by aggregating each field's
        frame into a Series.
        vals = ohlcv_panel
        if isinstance(ohlcv_panel, pd.Panel):
            vals = ohlcv_panel.values
            items = ohlcv_panel.items
            minor_axis = ohlcv_panel.minor_axis

        data = [
            for field in fields
        return np.array(data)
项目:py-hadoop-tutorial    作者:hougs    | 项目源码 | 文件源码
def to_series(tuples):
    """Transform a list of (date, count) tuples into a pandas Series.

    Pairs whose date is ``pd.NaT`` or whose count is ``None`` are dropped.

    :param tuples: iterable of ``(date, count)`` pairs
    :return: a Series of counts indexed by date and sorted by that index,
        or ``None`` when no valid pair remains
    """
    cleaned_time_val_tuples = [
        pair for pair in tuples
        if not (pair[0] is pd.NaT or pair[1] is None)
    ]
    if not cleaned_time_val_tuples:
        return None
    # Turn [(a1, b1), (a2, b2), ...] into ([a1, a2, ...], [b1, b2, ...]).
    # Unpacking works on Python 3, where zip() returns a non-subscriptable
    # iterator.
    timestamps, counts = zip(*cleaned_time_val_tuples)
    # Create the series with a sorted index.
    return pd.Series(list(counts), index=list(timestamps)).sort_index()

# In[ ]:
项目:xpandas    作者:alan-turing-institute    | 项目源码 | 文件源码
def __init__(self, *args, **kwargs):
        """
        The same arguments as for pandas.Series

        In order to create XSeries of any data_type, data argument must be a pythons list.
        For example, to create XSeries of pandas.Series, pass data should be
        data = [s_1, s2, ..., s3] where s_i is a instance of pandas.Series.

        :raises ValueError: if the elements of ``data`` do not all share one type
        """
        super(XSeries, self).__init__(*args, **kwargs)

        # ``data`` may be passed by keyword or as the first positional argument.
        data = kwargs.get('data')
        if data is None:
            data = args[0]

        check_result, data_type = _check_all_elements_have_the_same_property(data, type)
        if not check_result:
            raise ValueError('Not all elements the same type')

        if data_type is not None:
            self._data_type = data_type
        else:
            # The helper reported no type; fall back to the type of the first
            # stored value.
            self._data_type = type(data._values[0])
项目:xpandas    作者:alan-turing-institute    | 项目源码 | 文件源码
def __init__(self, dictionary=None, **kwargs):
        :param dictionary: custom dictionary to count against. if None, calculate dictionary from dataset
        self.dictionary = dictionary

        accepted_types = [
            pd.Series, list, np.array, tuple

        def bag_of_words_transform_function(corpus):
            counter = Counter(corpus)
            for el in self.dictionary:
                if counter.get(el) is None:
                    counter[el] = 0
            return counter

        super(BagOfWordsTransformer, self).__init__(data_types=accepted_types,
项目:xpandas    作者:alan-turing-institute    | 项目源码 | 文件源码
def test_mean_transformer():
    s1 = XSeries([
    s2 = XSeries([

    tr = MeanSeriesTransformer()
    tr =

    transformed_s = tr.transform(s2)

    assert transformed_s.shape[0] == 3
    assert type(transformed_s) == XSeries
项目:xpandas    作者:alan-turing-institute    | 项目源码 | 文件源码
def test_mean_transformer_data_frame():
    s1 = XSeries([
    s2 = XSeries([

    df = XDataFrame({
        's1': s1,
        's2': s2

    tr = MeanSeriesTransformer()
        tr =
        assert False
        assert True
项目:xpandas    作者:alan-turing-institute    | 项目源码 | 文件源码
def test_dataframe_data_types():
    """Each XDataFrame column reports the element type of the XSeries behind it."""
    s1 = XSeries([pd.Series([1, 2, 3], index=['a', 'b', 'c']),
                  pd.Series([4, 5, 6], index=['d', 'e', 'g'])])
    s2 = XSeries([1, 2, 3])
    s3 = XSeries([{"k1": "v1"}, {"k2": 'v2'}])
    s4 = XSeries(['f', 's', 't'])

    df = XDataFrame({
        'first_col': s1,
        'second_col': s2,
        'third_col': s3,
        'fourth_col': s4
    })

    assert df['first_col'].data_type == pd.Series
    assert df['second_col'].data_type == np.int64
    assert df['third_col'].data_type == dict
    assert df['fourth_col'].data_type == str

    # Column selection with a list must keep the XDataFrame type.
    assert type(df[['first_col']]) == XDataFrame
    assert type(df[['first_col', 'second_col']]) == XDataFrame
项目:xpandas    作者:alan-turing-institute    | 项目源码 | 文件源码
def test_dataframe_sub_frame_data_types():
    """A ``.loc`` slice of an XDataFrame keeps its type and per-column data types."""
    s1 = XSeries([pd.Series([1, 2, 3], index=['a', 'b', 'c']),
                  pd.Series([4, 5, 6], index=['d', 'e', 'g'])])
    s2 = XSeries([1, 2, 3])
    s3 = XSeries([{"k1": "v1"}, {"k2": 'v2'}])
    s4 = XSeries(['f', 's', 't'])

    df = XDataFrame({
        'first_col': s1,
        'second_col': s2,
        'third_col': s3,
        'fourth_col': s4
    })

    sub_df = df.loc[:2]

    assert type(sub_df) == XDataFrame
    assert sub_df['first_col'].data_type == pd.Series
    assert sub_df['second_col'].data_type == np.int64
    assert sub_df['third_col'].data_type == dict
    assert sub_df['fourth_col'].data_type == str

    # Column selection on the slice must keep the XDataFrame type as well.
    assert type(sub_df[['first_col']]) == XDataFrame
    assert type(sub_df[['first_col', 'second_col']]) == XDataFrame
项目:xpandas    作者:alan-turing-institute    | 项目源码 | 文件源码
def test_series_replace_element():
    s = XSeries([
        pd.Series([1, 2, 3], index=['a', 'b', 'c']),
        pd.Series([4, 5, 6], index=['d', 'e', 'g'])
    ], name='MySuperSeries')

        s[0] = 111
        assert False
        assert True

        s[0] = pd.Series(np.random.normal(size=100))
        assert True
        assert False
项目:xpandas    作者:alan-turing-institute    | 项目源码 | 文件源码
def test_naming():
    """Transformed column names start with ``<column>_<TransformerName>``."""
    X = XSeries([
        pd.Series(np.random.normal(0, 1, 100), name='X')
    ])
    df = XDataFrame({
        'X': X
    })

    dataframe_transformer = XDataFrameTransformer({
        'X': [TimeSeriesTransformer()]
    })
    transformed_df = dataframe_transformer.transform(df)

    for col_name in transformed_df.columns:
        assert col_name.startswith('X_TimeSeriesTransformer')
项目:xpandas    作者:alan-turing-institute    | 项目源码 | 文件源码
def test_multiple_transformers_for_one_column():
    X = XSeries([
        pd.Series(np.random.normal(0, 1, 100), name='X')
    df = XDataFrame({
        'X': X

    dataframe_transformer = XDataFrameTransformer({
        'X': [TimeSeriesTransformer(), IdentityTransformer(), MeanSeriesTransformer()]
    transformed_df = dataframe_transformer.transform(df)

    for col_name in transformed_df.columns:
        assert col_name.startswith('X_TimeSeriesTransformer') or \
               col_name.startswith('X_IdentityTransformer') or \
项目:xpandas    作者:alan-turing-institute    | 项目源码 | 文件源码
def test_ts_fresh_chain():
    """A two-step PipeLineChain over an XSeries of series yields an XDataFrame."""
    s1 = XSeries([
        pd.Series(np.random.normal(0, 1, 20))
        for _ in range(10)
    ], name='X')

    pipe = PipeLineChain([
        ('mean shift', TimeSeriesWindowTransformer()),
        ('ts fresh step', TsFreshSeriesTransformer())
    ])
    transformed_df = pipe.transform(s1)

    assert type(transformed_df) == XDataFrame
项目:zipline-chinese    作者:zhanghan1990    | 项目源码 | 文件源码
def test_bfill(self):
        """Check ``bfill`` against pandas' own back-fill on 1-D and 2-D data."""
        size = 100

        # One-dimensional case: a Series with ten randomly placed NaNs.
        series = pd.Series(np.random.randn(size))
        nan_positions = random.sample(range(size), 10)
        series.iloc[nan_positions] = np.nan
        expected_1d = series.bfill().values
        actual_1d = bfill(series.values)
        assert_almost_equal(expected_1d, actual_1d)

        # Two-dimensional case: the same positions blank out whole rows.
        frame = pd.DataFrame(np.random.randn(size, size))
        frame.iloc[nan_positions] = np.nan
        expected_2d = frame.bfill().values
        actual_2d = bfill(frame.values)
        assert_almost_equal(expected_2d, actual_2d)
项目:zipline-chinese    作者:zhanghan1990    | 项目源码 | 文件源码
def test_ffill(self):
        """Check ``ffill`` against pandas' own forward-fill on 1-D and 2-D data."""
        size = 100

        # One-dimensional case: a Series with ten randomly placed NaNs.
        series = pd.Series(np.random.randn(size))
        nan_positions = random.sample(range(size), 10)
        series.iloc[nan_positions] = np.nan
        expected_1d = series.ffill().values
        actual_1d = ffill(series.values)
        assert_almost_equal(expected_1d, actual_1d)

        # Two-dimensional case: the same positions blank out whole rows.
        frame = pd.DataFrame(np.random.randn(size, size))
        frame.iloc[nan_positions] = np.nan
        expected_2d = frame.ffill().values
        actual_2d = ffill(frame.values)
        assert_almost_equal(expected_2d, actual_2d)
项目:zipline-chinese    作者:zhanghan1990    | 项目源码 | 文件源码
def test_conversion_to_df(self, df, infer_timestamps):

        events_by_sid = {0: df}
        loader = EventDataSetLoader(

        if infer_timestamps:
            expected = pd.Series(index=[dtx[0]] * 10, data=dtx,
            expected = pd.Series(index=dtx, data=dtx,
        # Check that index by first given date has been added
项目:scikit-dataaccess    作者:MITHaystack    | 项目源码 | 文件源码
def getAntennaLogs():
        """
        Retrieve information about antenna changes

        @return dictionary of antenna changes
        """
        store_location = data_util.getDataLocation('ngl_gps')
        # Use the store as a context manager so the HDF5 file handle is
        # released even if the lookup raises.
        with pd.HDFStore(store_location, 'r') as store:
            logs_df = store['ngl_steps']

        metadata = DataFetcher.getStationMetadata()

        # One entry per station, in metadata order: the unique index values
        # of that station's rows in the step log.
        logs_dict = OrderedDict()
        for station in metadata.index:
            offset_dates = logs_df[logs_df['Station']==station].index.unique()
            logs_dict[station] = pd.Series(offset_dates)

        return logs_dict
项目:didi_competition    作者:Heipiao    | 项目源码 | 文件源码
def remove_error_poi_each_line(line_data):
    """Drop malformed POI entries from one row.

    ``line_data[0]`` is the sequence of entries for the row. A 32-character
    entry is taken to be the district hash and is always kept. Every other
    entry is kept only if it starts with the ``<digits>#<digits>:<digits>``
    pattern.

    :param line_data: row whose element ``0`` holds the entries
    :return: a one-element ``pd.Series`` wrapping the cleaned list
    """
    standard_style = re.compile(r"\d+#\d+:\d+")

    # Build a new list rather than calling remove() while iterating, which
    # would skip the element after each removal.
    cleaned = [
        poi_in_line for poi_in_line in line_data[0]
        if len(poi_in_line) == 32  # this is the district hash
        or standard_style.match(poi_in_line)
    ]
    return pd.Series([cleaned])

# the input line_data is a Series!!
项目:dask_gdf    作者:gpuopenanalytics    | 项目源码 | 文件源码
def test_series_append():
    """Appending the ``x`` column fragment by fragment reproduces the full column."""
    n = 1000
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=n),
                       'y': np.random.normal(size=n)})

    gdf = gd.DataFrame.from_pandas(df)
    # Keep only the 'x' column of each of the 13 fragments.
    x_frags = [piece.x for piece in _fragmented_gdf(gdf, nsplit=13)]

    head, tail = x_frags[0], x_frags[1:]
    appending = dgd.from_pygdf(head, npartitions=1)
    for piece in tail:
        appending = appending.append(piece)

    appended = appending.compute().to_pandas()
    assert isinstance(appended, pd.Series)
    np.testing.assert_array_equal(appended, df.x)
项目:dask_gdf    作者:gpuopenanalytics    | 项目源码 | 文件源码
def test_take(nelem, nparts):
    """``take`` with a Series of positions matches ``DataFrame.take`` in pandas."""
    # Draw order is kept (x, y, selected) so the random stream is unchanged.
    x = np.random.randint(0, nelem, size=nelem)
    y = np.random.random(nelem)
    selected = np.random.randint(0, nelem - 1, size=nelem // 2)

    df = pd.DataFrame({'x': x, 'y': y})
    dgdf = dgd.from_dask_dataframe(dd.from_pandas(df, npartitions=nparts))

    out = dgdf.take(gd.Series(selected), npartitions=5)
    got = out.compute().to_pandas()
    expect = df.take(selected)

    assert 1 < out.npartitions <= 5
    # The result carries a fresh 0..len-1 index and the selected values.
    np.testing.assert_array_equal(got.index, np.arange(len(got)))
    np.testing.assert_array_equal(got.x, expect.x)
    np.testing.assert_array_equal(got.y, expect.y)
项目:dask_gdf    作者:gpuopenanalytics    | 项目源码 | 文件源码
def set_index(self, index, drop=True, sorted=False):
        """Set new index.

        Parameters
        ----------
        index : str or Series
            If a ``str`` is provided, it is used as the name of the
            column to be made into the index.
            If a ``Series`` is provided, it is used as the new index
        drop : bool
            Whether the first original index column is dropped.
        sorted : bool
            Whether the new index column is already sorted.

        Raises
        ------
        NotImplementedError
            If ``drop`` is false.
        TypeError
            If ``index`` is neither a ``str`` nor a ``Series``.
        """
        if not drop:
            raise NotImplementedError('drop=False not supported yet')

        if isinstance(index, str):
            return self._set_index_raw(index, drop=drop, sorted=sorted)
        elif isinstance(index, Series):
            # Attach the series under a reserved column name, then index by
            # that column.
            indexname = '__dask_gdf.index'
            df = self.assign(**{indexname: index})
            return df._set_index_raw(indexname, drop=drop, sorted=sorted)
        else:
            raise TypeError('cannot set_index from {}'.format(type(index)))
项目:saapy    作者:ashapochka    | 项目源码 | 文件源码
def connect_actors(actor_frame, connectivity_sets, connectivity_column):
    """
    Write, for every actor, the id of the connectivity set it belongs to.

    :param actor_frame: frame that receives the new column (modified in place)
    :param connectivity_sets: mapping of set id to an iterable of actor keys
    :param connectivity_column: name of the column to write
    :return: ``actor_frame``

    Example::

        same_actors = {
            'ccason': [3, 14, 15], 'clipka': [4, 5, 13],
            'wfpokorny': [11, 17], 'anshuarya': [0],
            'bentsm': [1], 'cbarton': [2], 'dbodor': [6],
            'jlecher': [7], 'jgrimbert': [8], 'nalvarez': [9],
            'selvik': [10], 'wverhelst': [12], 'gryken': [16],
            'github': [18]}
        actor_frame = connect_actors(actor_frame, same_actors, 'actor_id')
    """
    # Invert {set_id: [actors]} into {actor: set_id}. An actor listed in
    # several sets keeps the id of the last set iterated.
    connectivity = {
        actor: actor_id
        for actor_id, connectivity_set in connectivity_sets.items()
        for actor in connectivity_set
    }
    # NOTE(review): su.categorize is defined outside this snippet; presumably
    # it converts the series to a categorical - confirm.
    actor_frame[connectivity_column] = su.categorize(pd.Series(connectivity))
    return actor_frame
项目:saapy    作者:ashapochka    | 项目源码 | 文件源码
def _compute_author_similarity(self, paired_authors):
        def row_similarity(row):
            same_email = row.author_email == row.author_email_other
            name_similarity = fuzz.token_set_ratio(row.author_name,
            email_name_similarity = fuzz.ratio(row.email_name,
            name_to_email_similarity = fuzz.token_set_ratio(row.author_name,
            return pd.Series(
                [same_email, name_similarity, email_name_similarity,

        newcols = paired_authors.apply(row_similarity, axis=1)
        newcols.columns = ['same_email', 'name_similarity',
                           'email_name_similarity', 'name_to_email_similarity']
        newdf = paired_authors.join(newcols)
        return newdf
项目:QUANTAXIS    作者:yutiansut    | 项目源码 | 文件源码
def SMA(Series, N, M=1):
    """Smoothed moving average: ``Y = (M * X + (N - M) * Y') / N``.

    :param Series: indexable sequence of numbers, read as ``Series[i]``
    :param N: smoothing period
    :param M: weight given to the current value (default 1)
    :return: ``pd.Series`` holding the seed value followed by one smoothed
        value for each position from the seed onwards; empty if no usable
        seed is found

    The scan for the seed starts at position 1, as in the original code, so
    position 0 never takes part in the computation.
    """
    length = len(Series)
    # Skip the leading NaN values of X. Combining the bounds check with the
    # NaN check lets the loop stop at the first valid value.
    i = 1
    while i < length and np.isnan(Series[i]):
        i += 1
    if i >= length:
        # Nothing to seed the recursion with.
        return pd.Series([], dtype=float)

    preY = Series[i]  # Y'
    ret = [preY]
    while i < length:
        Y = (M * Series[i] + (N - M) * preY) / float(N)
        ret.append(Y)
        preY = Y
        i += 1
    return pd.Series(ret)
项目:QUANTAXIS    作者:yutiansut    | 项目源码 | 文件源码
def QA_indicator_dpo(data, N=20, M=6):
    """Compute the detrended price oscillator and its moving average.

    :param data: values accepted by ``pd.Series``
    :param N: base period; the detrending window is ``N // 2 + 1`` samples
    :param M: window of the moving average applied to the oscillator
    :return: tuple ``(dpo, madpo)`` of Series
    """
    prices = pd.Series(data)
    # rolling() needs an integer window; ``N / 2 + 1`` is a float on
    # Python 3 and is rejected.
    _dpo = prices - prices.rolling(N // 2 + 1).mean()
    _madpo = _dpo.rolling(M).mean()
    return _dpo, _madpo
项目:NeoAnalysis    作者:neoanalysis    | 项目源码 | 文件源码
def stats_desc(self,store_key,cond):
            store_key (string):
                define which data to be analyzed in the workspace
            cond (string):
                sample observation
            descriptive statistics
        datas = list()
        for ite_file in
        datas = pd.Series(datas)
        return datas.describe()

    # one way ANOVA
    # for scalar value usage only
项目:NeoAnalysis    作者:neoanalysis    | 项目源码 | 文件源码
def df_add(self,column,added_info):
        """
        Add a value to ``self.data_df[column]`` and store the result in that column.

        Args
            column (string):
                the column name to be played with
            added_info (string, int, float, pandas.Series or pandas.DataFrame):
                The information to be added to the selected column. A string
                is used as the name of another column of ``self.data_df``.

        Any other type of ``added_info`` leaves the table unchanged.
        """
        if isinstance(added_info,str):
            operand = self.data_df[added_info]
        elif isinstance(added_info,(int,float,pd.Series,pd.DataFrame)):
            operand = added_info
        else:
            return
        self.data_df[column] = self.data_df[column] + operand

    # This function performs minus to a given column
项目:NeoAnalysis    作者:neoanalysis    | 项目源码 | 文件源码
def df_minus(self,column,minus_info):
        """
        Subtract a value from ``self.data_df[column]`` and store the result in that column.

        Args
            column (string):
                the column name to be played with
            minus_info (string, int, float, pandas.Series or pandas.DataFrame):
                information to be subtracted from the selected column. A
                string is used as the name of another column of
                ``self.data_df``.

        Any other type of ``minus_info`` leaves the table unchanged.
        """
        if isinstance(minus_info,str):
            operand = self.data_df[minus_info]
        elif isinstance(minus_info,(int,float,pd.Series,pd.DataFrame)):
            # The Series/DataFrame branch used to read the undefined name
            # ``added_info`` and raised NameError.
            operand = minus_info
        else:
            return
        self.data_df[column] = self.data_df[column] - operand

    # This function multiplys the selected column with certain factor
项目:NeoAnalysis    作者:neoanalysis    | 项目源码 | 文件源码
def df_multiply(self,column,multiply_info):
        """
        Multiply ``self.data_df[column]`` by a value and store the result in that column.

        Args
            column (string):
                the column name to be played with
            multiply_info (string, int, float, pandas.Series or pandas.DataFrame):
                information to be used for multiplying. A string is used as
                the name of another column of ``self.data_df``.

        Any other type of ``multiply_info`` leaves the table unchanged.
        """
        if isinstance(multiply_info,str):
            operand = self.data_df[multiply_info]
        elif isinstance(multiply_info,(int,float,pd.Series,pd.DataFrame)):
            # The Series/DataFrame branch used to read the undefined name
            # ``added_info`` and raised NameError.
            operand = multiply_info
        else:
            return
        self.data_df[column] = self.data_df[column] * operand

    # This function divides the selected column by certain factor
项目:NeoAnalysis    作者:neoanalysis    | 项目源码 | 文件源码
def df_division(self,column,division_info):
        """
        Divide ``self.data_df[column]`` by a value and store the result in that column.

        Args
            column (string):
                the column name to be played with
            division_info (string, int, float, pandas.Series or pandas.DataFrame):
                information to be used for dividing. A string is used as the
                name of another column of ``self.data_df``.

        Any other type of ``division_info`` leaves the table unchanged.
        """
        if isinstance(division_info,str):
            operand = self.data_df[division_info]
        elif isinstance(division_info,(int,float)):
            # float() keeps true division on interpreters where int / int
            # truncates.
            operand = float(division_info)
        elif isinstance(division_info,(pd.Series,pd.DataFrame)):
            # This branch used to read the undefined name ``added_info`` and
            # raised NameError.
            operand = division_info
        else:
            return
        self.data_df[column] = self.data_df[column] / operand

    # delete certain trials in the data table
项目:NeoAnalysis    作者:neoanalysis    | 项目源码 | 文件源码
def stats_desc(self,store_key,cond):
            store_key (string):
                define which data to be analyzed in the workspace
            cond (string):
                sample observation
            descriptive statistics
        datas = list()
        for ite_file in list(
        datas = pd.Series(datas)
        return datas.describe()

    # one way ANOVA
    # for scalar value usage only
项目:NeoAnalysis    作者:neoanalysis    | 项目源码 | 文件源码
def df_add(self,column,added_info):
        """
        Add a value to ``self.data_df[column]`` and store the result in that column.

        Args
            column (string):
                the column name to be played with
            added_info (string, int, float, pandas.Series or pandas.DataFrame):
                The information to be added to the selected column. A string
                is used as the name of another column of ``self.data_df``.

        Any other type of ``added_info`` leaves the table unchanged.
        """
        if isinstance(added_info,str):
            operand = self.data_df[added_info]
        elif isinstance(added_info,(int,float,pd.Series,pd.DataFrame)):
            operand = added_info
        else:
            return
        self.data_df[column] = self.data_df[column] + operand

    # This function performs minus to a given column
项目:NeoAnalysis    作者:neoanalysis    | 项目源码 | 文件源码
def df_minus(self,column,minus_info):
        """
        Subtract a value from ``self.data_df[column]`` and store the result in that column.

        Args
            column (string):
                the column name to be played with
            minus_info (string, int, float, pandas.Series or pandas.DataFrame):
                information to be subtracted from the selected column. A
                string is used as the name of another column of
                ``self.data_df``.

        Any other type of ``minus_info`` leaves the table unchanged.
        """
        if isinstance(minus_info,str):
            operand = self.data_df[minus_info]
        elif isinstance(minus_info,(int,float,pd.Series,pd.DataFrame)):
            # The Series/DataFrame branch used to read the undefined name
            # ``added_info`` and raised NameError.
            operand = minus_info
        else:
            return
        self.data_df[column] = self.data_df[column] - operand

    # This function multiplys the selected column with certain factor
项目:NeoAnalysis    作者:neoanalysis    | 项目源码 | 文件源码
def df_multiply(self,column,multiply_info):
        """
        Multiply ``self.data_df[column]`` by a value and store the result in that column.

        Args
            column (string):
                the column name to be played with
            multiply_info (string, int, float, pandas.Series or pandas.DataFrame):
                information to be used for multiplying. A string is used as
                the name of another column of ``self.data_df``.

        Any other type of ``multiply_info`` leaves the table unchanged.
        """
        if isinstance(multiply_info,str):
            operand = self.data_df[multiply_info]
        elif isinstance(multiply_info,(int,float,pd.Series,pd.DataFrame)):
            # The Series/DataFrame branch used to read the undefined name
            # ``added_info`` and raised NameError.
            operand = multiply_info
        else:
            return
        self.data_df[column] = self.data_df[column] * operand

    # This function divides the selected column by certain factor
项目:NeoAnalysis    作者:neoanalysis    | 项目源码 | 文件源码
def df_division(self,column,division_info):
        """
        Divide ``self.data_df[column]`` by a value and store the result in that column.

        Args
            column (string):
                the column name to be played with
            division_info (string, int, float, pandas.Series or pandas.DataFrame):
                information to be used for dividing. A string is used as the
                name of another column of ``self.data_df``.

        Any other type of ``division_info`` leaves the table unchanged.
        """
        if isinstance(division_info,str):
            operand = self.data_df[division_info]
        elif isinstance(division_info,(int,float,pd.Series,pd.DataFrame)):
            # The Series/DataFrame branch used to read the undefined name
            # ``added_info`` and raised NameError.
            operand = division_info
        else:
            return
        self.data_df[column] = self.data_df[column] / operand

    # delete certain trials in the data table
项目:tdx_formula    作者:woodylee1974    | 项目源码 | 文件源码
def test_ABS():
    text = """
    param1 = {
        'X': pd.Series([-2, -1, -0.5, 9.8]),
        'RESULT': pd.Series([2, 1, 0.5, 9.8])

    param2 = {
        'X': pd.Series([-2, -1, 0, 9]),
        'RESULT': pd.Series([2, 1, 0, 9])

    params = [param1, param2]
    testfunc(text, params)
项目:tdx_formula    作者:woodylee1974    | 项目源码 | 文件源码
def test_SMA():
    text = """
       SMA(X, M, N);

    param1 = {
        'X': pd.Series([10.2, 30.9, 30.48, 39.34, 43.3, 45.9, 30.48, 39.34, 45.9, 30.48, 39.34]),
        'M': 5,
        'N': 3,
        'RESULT': pd.Series(
            [10.2, 24.985714, 28.507692, 35.177833, 40.101552, 43.594930, 35.713058, 37.890650, 42.697520, 35.366239,

    params = [param1]
    testfunc(text, params, True, True)
项目:tdx_formula    作者:woodylee1974    | 项目源码 | 文件源码
def CROSS(self, param):
        """Flag where ``param[0]`` crosses above ``param[1]``.

        A crossing is marked where the previous value of the first operand
        was at or below the previous value of the second, and the current
        value of the first is above the current value of the second. A
        scalar operand is its own "previous" value.

        Returns ``False`` (after printing a message) when neither operand is
        a Series; otherwise the element-wise boolean result.
        """
        series_type = pd.core.series.Series
        left, right = param[0], param[1]
        left_is_series = isinstance(left, series_type)
        right_is_series = isinstance(right, series_type)

        if not (left_is_series or right_is_series):
            print('Invalid data type is detected.')
            return False

        # Only Series operands have a shifted "previous" view.
        prev_left = left.shift(1) if left_is_series else left
        prev_right = right.shift(1) if right_is_series else right

        return (prev_left <= prev_right) & (left > right)
项目:tdx_formula    作者:woodylee1974    | 项目源码 | 文件源码
def MAX(self, param):
        """Return the element-wise maximum of ``param[0]`` and ``param[1]``.

        If at least one operand is a Series, the result is a Series on that
        operand's index (the first operand wins when both are Series) and a
        scalar operand is broadcast. If neither is a Series, the result is
        ``np.max(param)``.

        Where the comparison ``A >= B`` is false, including when either
        side is NaN, the value from ``param[1]`` is used.
        """
        first, second = param[0], param[1]
        if isinstance(first, pd.core.series.Series):
            index = first.index
        elif isinstance(second, pd.core.series.Series):
            index = second.index
        else:
            return np.max(param)

        # Align both operands on the chosen index before comparing.
        df = pd.DataFrame(index=index)
        df['A'] = first
        df['B'] = second
        # Vectorised form of "A if A >= B else B". It also avoids
        # ``apply(reduce=True)``, which pandas 1.0 removed.
        result = df['A'].where(df['A'] >= df['B'], df['B'])
        result.name = None
        return result
项目:tdx_formula    作者:woodylee1974    | 项目源码 | 文件源码
def MIN(self, param):
        """Return the element-wise minimum of ``param[0]`` and ``param[1]``.

        If at least one operand is a Series, the result is a Series on that
        operand's index (the first operand wins when both are Series) and a
        scalar operand is broadcast. If neither is a Series, the result is
        ``np.min(param)``.

        Where the comparison ``A <= B`` is false, including when either
        side is NaN, the value from ``param[1]`` is used.
        """
        first, second = param[0], param[1]
        if isinstance(first, pd.core.series.Series):
            index = first.index
        elif isinstance(second, pd.core.series.Series):
            index = second.index
        else:
            # The scalar path used to return np.max here.
            return np.min(param)

        # Align both operands on the chosen index before comparing.
        df = pd.DataFrame(index=index)
        df['A'] = first
        df['B'] = second
        # Vectorised form of "A if A <= B else B". It also avoids
        # ``apply(reduce=True)``, which pandas 1.0 removed.
        result = df['A'].where(df['A'] <= df['B'], df['B'])
        result.name = None
        return result
项目:hidi    作者:VEVO    | 项目源码 | 文件源码
def setUp(self):
        """Build an 8-row item/link/score frame and run it through SparseTransform."""
        scores = pd.Series(np.ones(8), dtype=np.float32)
        np_data = np.array([
            [1, 'a'],
            [2, 'b'],
            [4, 'a'],
            [3, 'c'],
            [3, 'b'],
            [5, 'c'],
            [4, 'c'],
            [1, 'b'],
        ])
        col_labels = ['item_id', 'link_id']
        self.input_df = pd.DataFrame(data=np_data, columns=col_labels)
        self.input_df['score'] = scores
        self.sparse = SparseTransform()
        self.out = self.sparse.transform(self.input_df)
项目:powerAI    作者:dreameng28    | 项目源码 | 文件源码
def rise_rate(df):
    """Compute each user's relative change in mean consumption versus one month earlier.

    The current window runs from the first day of the month of the latest
    record date up to that date. The previous window is the same span
    shifted back one month.

    :param df: frame with the ``record_date``, ``user_id`` and
        ``power_consumption`` columns
    :return: frame of user ids joined with a ``user_rise_rate`` column
    """
    from dateutil.relativedelta import relativedelta

    date1_2 = df[record_date].map(str2time).max()
    date1_1 = datetime.datetime(date1_2.year, date1_2.month, 1).date()
    grouped1 = DataView(df).filter_by_record_date2(date1_1, date1_2)[[user_id, power_consumption]].groupby([user_id], as_index=False).mean()

    date2_1 = date1_1 - relativedelta(months=+1)
    date2_2 = date1_2 - relativedelta(months=+1)
    grouped2 = DataView(df).filter_by_record_date2(date2_1, date2_2)[[user_id, power_consumption]].groupby([user_id], as_index=False).mean()
    print(date1_1,date1_2, date2_1, date2_2)

    # NOTE(review): the two groups are paired by position, which assumes
    # both windows contain the same users in the same order - confirm.
    user_rise_rate = pd.Series(
        [float(current - previous) / previous
         for current, previous in zip(grouped1[power_consumption],
                                      grouped2[power_consumption])],
        name='user_rise_rate')
    return grouped1[[user_id]].join(user_rise_rate)

# ?????
项目:GOS    作者:crcresearch    | 项目源码 | 文件源码
def create_agents(self, generator):
        Given information on a set of countries and a generator function,
        generate the agents and assign the results to ``self.agents``.

        :type generator: DataFrame, str, int
        :param generator: A function which generates the agents.
        self.generator = generator
        country_array = pd.concat([pd.Series([c] * k["Population"]) for c, k in self.df.iterrows()])
        country_array.index = range(len(country_array))
        # Garbage collect before creating new processes.
        self.agents = pd.concat(
                           np.array_split(country_array, self.processes * self.splits))
        self.agents.index = range(len(self.agents))
项目:GOS    作者:crcresearch    | 项目源码 | 文件源码
def create_agents(self, generator):
        Given information on a set of countries and a generator function,
        generate the agents and assign the results to ``self.agents``.

        :type generator: DataFrame, str, int
        :param generator: A function which generates the agents.
        self.generator = generator
        country_array = pd.concat([pd.Series([c] * k["Population"]) for c, k in self.df.iterrows()])
        country_array.index = range(len(country_array))
        # Garbage collect before creating new processes.
        self.agents = pd.concat(
                           np.array_split(country_array, self.processes * self.splits))
        self.agents.index = range(len(self.agents))
项目:PyGPS    作者:gregstarr    | 项目源码 | 文件源码
def minScalErr(stec,el,z,thisBias):
    this determines the slope of the vTEC vs. Elevation line, which
    should be minimized in the minimum scalloping technique for
    receiver bias removal
        stec - time indexed Series of slant TEC values
        el - corresponding elevation values, also Series
        z - mapping function values to convert to vTEC from entire file, may
            contain nans, Series
        thisBias - the bias to be tested and minimized

    intel=np.asarray(el[stec.index],int) # bin the elevation values into int
    zmap = z[stec.index]
                              /zmap[intel==i])) for i in np.unique(intel) if i>30])

    return np.polyfit(c[:,0],c[:,1],1)[0]
项目:scheduled-bots    作者:SuLab    | 项目源码 | 文件源码
def generate_summary(df):
    """Summarise a log frame into four count Series, each named ``"Count"``.

    :param df: frame with ``Level``, ``Message`` and ``Msg Type`` columns
    :return: tuple ``(level_counts, info_counts, warning_counts, error_counts)``:
        counts per level under HTML anchor labels, counts of INFO messages
        per action, and ``Msg Type`` value counts for WARNING and ERROR rows
    """
    per_level = df.Level.value_counts().to_dict()
    level_counts = pd.Series(
        [per_level.get('INFO', 0),
         per_level.get('WARNING', 0),
         per_level.get('ERROR', 0)],
        index=['<a href="#info">Items Processed Succesfully</a>',
               '<a href="#warning">Items Skipped Due to a Warning</a>',
               '<a href="#error">Items Skipped Due to an Error</a>'],
        name="Count")

    per_action = df.query("Level == 'INFO'").Message.value_counts().to_dict()
    info_counts = pd.Series(
        [per_action.get('SKIP', 0),
         per_action.get('UPDATE', 0),
         per_action.get('CREATE', 0)],
        index=['No Action', 'Update', 'Create'],
        name="Count")

    warning_counts = df.query("Level == 'WARNING'")['Msg Type'].value_counts().rename("Count")
    error_counts = df.query("Level == 'ERROR'")['Msg Type'].value_counts().rename("Count")
    return level_counts, info_counts, warning_counts, error_counts
项目:micom    作者:resendislab    | 项目源码 | 文件源码
def _format_min_growth(min_growth, species):
    """Format min_growth into a pandas series.

    Arguments
    ---------
    min_growth : positive float or array-like object.
        The minimum growth rate for each individual in the community. Either
        a single value applied to all individuals or one value for each.
    species : array-like
        The ID for each individual model in the community.

    Returns
    -------
    pandas.Series
        A pandas Series mapping each individual to its minimum growth rate.

    Raises
    ------
    ValueError
        If ``min_growth`` is not a single number and its length differs
        from the length of ``species``.
    """
    try:
        # A single number is broadcast to every species by pd.Series below.
        min_growth = float(min_growth)
    except (TypeError, ValueError):
        if len(min_growth) != len(species):
            raise ValueError(
                "min_growth must be single value or an array-like "
                "object with an entry for each species in the model.")
    return pd.Series(min_growth, species)
项目:PortfolioTimeSeriesAnalysis    作者:MizioAnd    | 项目源码 | 文件源码
def clean_data(self, df, is_with_MICE=0):
        """Return a cleaned copy of ``df``; the input frame is not modified.

        Cleaning happens only when the copy contains nulls and
        ``is_with_MICE`` is truthy. In every case the class attribute
        ``TwoSigmaFinModTools._numerical_feature_names`` is refreshed from
        the returned frame.

        NOTE(review): as written there is no branch for
        ``is_with_MICE == 0``, and the row drop is immediately followed by a
        column drop. This looks like ``else:`` lines went missing - confirm
        against the upstream project before relying on this flow.
        """
        df = df.copy()
        if df.isnull().sum().sum() > 0:
            if is_with_MICE:
                # Imputation using MICE
                numerical_features_names = self.extract_numerical_features(df)
                df.loc[:, tuple(numerical_features_names)] = self.estimate_by_mice(df[numerical_features_names])
                # True when a column named 'y' is present.
                if any(tuple(df.columns == 'y')):
                    # Drop rows that still contain nulls ...
                    df = df.dropna()
                    # ... then drop along axis 1 (columns); the axis is passed positionally.
                    df = df.dropna(1)
                    # Keep only the recorded numeric feature names that are still columns of df.
                    TwoSigmaFinModTools._feature_names_num = pd.Series(data=np.intersect1d(
                        TwoSigmaFinModTools._feature_names_num.values, df.columns), dtype=object)
        TwoSigmaFinModTools._numerical_feature_names = TwoSigmaFinModTools.extract_numerical_features(df)
        return df
项目:linkedin_recommend    作者:duggalr2    | 项目源码 | 文件源码
def predict_job(job_list):
    """Classify each job title in ``job_list``.

    A bag-of-words naive Bayes model is fitted on the module-level training
    data ``X`` / ``y`` and then applied to the given titles.

    :param job_list: iterable of iterables of title strings, e.g.
        ``[('Founder',), ('Senior Engineer',)]``; it is flattened first
    :return: the predicted labels, one per flattened title
    """
    # TODO: Add case where len is 1 or 0....
    job_list = [job for j in job_list for job in j]
    # Tokenise and stem each title, then re-join it into one string.
    new_job_list = [' '.join(regex.tokenize_and_stem(title)) for title in job_list]

    vect = CountVectorizer()
    X_train_dtm = vect.fit_transform(pd.Series(X))
    y_train = pd.Series(y)
    job_list_dtm = vect.transform(pd.Series(new_job_list))

    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    return nb.predict(job_list_dtm)

# print(predict_job([('Founder',), ('Founder',), ('Architect & Full-stack developer',), ('Senior Engineer',), ('Technical Consultant',)]))
项目:inqbus.rainflow    作者:Inqbus    | 项目源码 | 文件源码
def count_pairs(data):
    """Count how often each (start, target) pair occurs.

    :param data: two-column array-like of pairs
    :return: 2-D array with one row per distinct pair, in order of first
        appearance, holding start, target and the occurrence count
    """
    frame = pd.DataFrame(data)
    start, target = frame.columns.tolist()

    # Size of each (start, target) group; a Series indexed by the pair.
    pair_counts = frame.groupby([start, target]).size()

    # One row per distinct pair, indexed by the pair values so the counts
    # line up by index.
    unique_pairs = frame.drop_duplicates().set_index([0, 1], drop=False)

    # Attach the counts as a third column.
    unique_pairs[2] = pd.Series(pair_counts.values, index=pair_counts.index)

    # Back to a plain 2-D numpy array for the following steps.
    return unique_pairs.values
项目:bambi    作者:bambinos    | 项目源码 | 文件源码
def _hpd_interval(self, x, width):
        Code adapted from pymc3.stats.calc_min_interval:
        x = np.sort(x)
        n = len(x)

        interval_idx_inc = int(np.floor(width * n))
        n_intervals = n - interval_idx_inc
        interval_width = x[interval_idx_inc:] - x[:n_intervals]

        if len(interval_width) == 0:
            raise ValueError('Too few elements for interval calculation')

        min_idx = np.argmin(interval_width)
        hdi_min = x[min_idx]
        hdi_max = x[min_idx + interval_idx_inc]

        index = ['hpd{}_{}'.format(width, x) for x in ['lower', 'upper']]
        return pd.Series([hdi_min, hdi_max], index=index)