Python pandas module: Categorical() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use pandas.Categorical().
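
A quick orientation before the project-specific snippets: the following is a minimal, self-contained sketch (not taken from any of the projects below; variable names are illustrative) of the core pd.Categorical() behaviour these examples rely on, namely inferred versus explicit categories, ordering, the integer codes, and Categorical.from_codes.

import numpy as np
import pandas as pd

# Categories are inferred and unordered when not given explicitly.
grades = pd.Categorical(['a', 'b', 'b', 'a', 'c'])
print(grades.categories)          # Index(['a', 'b', 'c'], dtype='object')
print(grades.codes)               # [0 1 1 0 2], stored as small integers

# An explicit category set plus ordered=True enables comparisons, min/max and sorting.
sizes = pd.Categorical(['small', 'large', 'medium'],
                       categories=['small', 'medium', 'large'],
                       ordered=True)
print(sizes.min(), sizes.max())   # small large

# Rebuild a categorical from integer codes; -1 marks a missing value.
codes = np.array([0, 1, -1, 0])
cat = pd.Categorical.from_codes(codes, categories=['x', 'y'])
print(cat)                        # [x, y, NaN, x]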

Project: plotnine    Author: has2k1    | Project source | File source
def break_info(self, range=None):
        if range is None:
            range = self.dimension()
        # for discrete, limits != range
        limits = self.limits
        major = self.get_breaks(limits)
        minor = []
        if major is None:
            major = labels = []
        else:
            labels = self.get_labels(major)
            major = pd.Categorical(major.keys())
            major = self.map(major)
        return {'range': range,
                'labels': labels,
                'major': major,
                'minor': minor}
Project: plotnine    Author: has2k1    | Project source | File source
def map(self, data, layout):
        if not len(data):
            data['PANEL'] = pd.Categorical(
                [],
                categories=layout['PANEL'].cat.categories,
                ordered=True)
            return data

        facet_vals = eval_facet_vars(data, self.vars, self.plot.environment)
        data, facet_vals = add_missing_facets(data, layout,
                                              self.vars, facet_vals)

        # assign each point to a panel
        keys = join_keys(facet_vals, layout, self.vars)
        data['PANEL'] = match(keys['x'], keys['y'], start=1)
        data = data.sort_values('PANEL', kind='mergesort')

        # matching dtype
        data['PANEL'] = pd.Categorical(
            data['PANEL'],
            categories=layout['PANEL'].cat.categories,
            ordered=True)

        data.reset_index(drop=True, inplace=True)
        return data
Project: catalyst    Author: enigmampc    | Project source | File source
def from_categorical(cls, categorical, missing_value=None):
        """
        Create a LabelArray from a pandas categorical.

        Parameters
        ----------
        categorical : pd.Categorical
            The categorical object to convert.
        missing_value : bytes, unicode, or None, optional
            The missing value to use for this LabelArray.

        Returns
        -------
        la : LabelArray
            The LabelArray representation of this categorical.
        """
        return LabelArray(
            categorical,
            missing_value,
            categorical.categories,
        )
Project: catalyst    Author: enigmampc    | Project source | File source
def as_categorical(self, name=None):
        """
        Coerce self into a pandas categorical.

        This is only defined on 1D arrays, since that's all pandas supports.
        """
        if len(self.shape) > 1:
            raise ValueError("Can't convert a 2D array to a categorical.")

        with ignore_pandas_nan_categorical_warning():
            return pd.Categorical.from_codes(
                self.as_int_array(),
                # We need to make a copy because pandas >= 0.17 fails if this
                # buffer isn't writeable.
                self.categories.copy(),
                ordered=False,
                name=name,
            )
Project: sktransformers    Author: TomAugspurger    | Project source | File source
def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        is_dask = isinstance(X, dd.DataFrame)
        if is_dask:
            X = X.categorize()

        X = X.copy() if hasattr(X, 'copy') else X
        categories = self.cat_cols_
        for k in categories:
            cat = (categories.get(k, None)
                   if hasattr(categories, 'get')
                   else None)
            ordered = self.ordered.get(k, False)
            # can't use Categorical constructor since dask compat
            if not is_dask:
                X[k] = pd.Categorical(X[k])
            if cat:
                X[k] = X[k].cat.set_categories(cat)
            if ordered:
                X[k] = X[k].cat.as_ordered()
        return X
Project: sktransformers    Author: TomAugspurger    | Project source | File source
def inverse_transform(self, X):
        non_cat = pd.DataFrame(X[:, :len(self.non_cat_columns_)],
                               columns=self.non_cat_columns_)
        cats = []
        for col in self.cat_columns_:
            slice_ = self.cat_blocks_[col]
            categories = self.categories_map_[col]
            ordered = self.ordered_map_[col]

            codes = X[:, slice_].argmax(1)
            series = pd.Series(pd.Categorical.from_codes(
                codes, categories, ordered=ordered
            ), name=col)
            cats.append(series)
        df = pd.concat([non_cat] + cats, axis=1)[self.columns_]
        return df
Project: atropos    Author: jdidion    | Project source | File source
def _get_table(self, column, is_size=True):
        cols = list(range(5))
        cols.append(self.header.index(column))
        header = [self.header[c] for c in cols]
        rows = [
            [row[c] for c in cols]
            for row in self.rows
        ]
        if is_size:
            for row in rows:
                row[5] = parse_size(row[5])
        table = pd.DataFrame.from_records(rows, columns=header)
        table = table.rename(columns={ 
            'prog' : 'Program',
            'prog2' : 'Program2',
            'threads' : 'Threads',
            'dataset' : 'Dataset',
            'qcut' : 'Quality',
        })
        table['Threads'] = pd.to_numeric(table['Threads'])
        table['Dataset'] = pd.Categorical(table['Dataset'])
        table['Program'] = pd.Categorical(table['Program'])
        table['Program2'] = pd.Categorical(table['Program2'])
        return table
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_reindex_dtype(self):
        res, indexer = CategoricalIndex(['a', 'b', 'c', 'a']).reindex(['a', 'c'
                                                                       ])
        tm.assert_index_equal(res, Index(['a', 'a', 'c']), exact=True)
        tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2]))

        res, indexer = CategoricalIndex(['a', 'b', 'c', 'a']).reindex(
            Categorical(['a', 'c']))
        tm.assert_index_equal(res, CategoricalIndex(
            ['a', 'a', 'c'], categories=['a', 'c']), exact=True)
        tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2]))

        res, indexer = CategoricalIndex(
            ['a', 'b', 'c', 'a'
             ], categories=['a', 'b', 'c', 'd']).reindex(['a', 'c'])
        tm.assert_index_equal(res, Index(
            ['a', 'a', 'c'], dtype='object'), exact=True)
        tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2]))

        res, indexer = CategoricalIndex(
            ['a', 'b', 'c', 'a'],
            categories=['a', 'b', 'c', 'd']).reindex(Categorical(['a', 'c']))
        tm.assert_index_equal(res, CategoricalIndex(
            ['a', 'a', 'c'], categories=['a', 'c']), exact=True)
        tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2]))
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_categorical(self):

        # GH 8974
        from pandas import Categorical, Series
        arr = Categorical(list('abc'))
        result = lib.infer_dtype(arr)
        self.assertEqual(result, 'categorical')

        result = lib.infer_dtype(Series(arr))
        self.assertEqual(result, 'categorical')

        arr = Categorical(list('abc'), categories=['cegfab'], ordered=True)
        result = lib.infer_dtype(arr)
        self.assertEqual(result, 'categorical')

        result = lib.infer_dtype(Series(arr))
        self.assertEqual(result, 'categorical')
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def setUp(self):
        np.random.seed(24)
        self.s = DataFrame({'A': np.random.permutation(range(6))})
        self.df = DataFrame({'A': [0, 1], 'B': np.random.randn(2)})
        self.f = lambda x: x
        self.g = lambda x: x

        def h(x, foo='bar'):
            return pd.Series(['color: %s' % foo], index=x.index, name=x.name)

        self.h = h
        self.styler = Styler(self.df)
        self.attrs = pd.DataFrame({'A': ['color: red', 'color: blue']})
        self.dataframes = [
            self.df,
            pd.DataFrame({'f': [1., 2.], 'o': ['a', 'b'],
                          'c': pd.Categorical(['a', 'b'])})
        ]
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_to_csv_from_csv_categorical(self):

        # CSV with categoricals should result in the same output as when one
        # would add a "normal" Series/DataFrame.
        s = Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
        s2 = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
        res = StringIO()
        s.to_csv(res)
        exp = StringIO()
        s2.to_csv(exp)
        self.assertEqual(res.getvalue(), exp.getvalue())

        df = DataFrame({"s": s})
        df2 = DataFrame({"s": s2})
        res = StringIO()
        df.to_csv(res)
        exp = StringIO()
        df2.to_csv(exp)
        self.assertEqual(res.getvalue(), exp.getvalue())
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_dataframe_dummies_with_categorical(self):
        df = self.df
        df['cat'] = pd.Categorical(['x', 'y', 'y'])
        result = get_dummies(df, sparse=self.sparse)
        expected = DataFrame({'C': [1, 2, 3],
                              'A_a': [1., 0, 1],
                              'A_b': [0., 1, 0],
                              'B_b': [1., 1, 0],
                              'B_c': [0., 0, 1],
                              'cat_x': [1., 0, 0],
                              'cat_y': [0., 1, 1]})
        expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y'
                             ]]
        assert_frame_equal(result, expected)

    # GH12402 Add a new parameter `drop_first` to avoid collinearity
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_unexpected_keyword(self):  # GH8597
        df = DataFrame(np.random.randn(5, 2), columns=['jim', 'joe'])
        ca = pd.Categorical([0, 0, 2, 2, 3, np.nan])
        ts = df['joe'].copy()
        ts[2] = np.nan

        with assertRaisesRegexp(TypeError, 'unexpected keyword'):
            df.drop('joe', axis=1, in_place=True)

        with assertRaisesRegexp(TypeError, 'unexpected keyword'):
            df.reindex([1, 0], inplace=True)

        with assertRaisesRegexp(TypeError, 'unexpected keyword'):
            ca.fillna(0, inplace=True)

        with assertRaisesRegexp(TypeError, 'unexpected keyword'):
            ts.fillna(0, in_place=True)

    # See gh-12301
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_describe_typefiltering_category_bool(self):
        df = DataFrame({'A_cat': pd.Categorical(['foo', 'foo', 'bar'] * 8),
                        'B_str': ['a', 'b', 'c', 'd'] * 6,
                        'C_bool': [True] * 12 + [False] * 12,
                        'D_num': np.arange(24.) + .5,
                        'E_ts': tm.makeTimeSeries()[:24].index})

        desc = df.describe()
        expected_cols = ['D_num']
        expected = DataFrame(dict((k, df[k].describe())
                                  for k in expected_cols),
                             columns=expected_cols)
        assert_frame_equal(desc, expected)

        desc = df.describe(include=["category"])
        self.assertTrue(desc.columns.tolist() == ["A_cat"])

        # 'all' includes numpy-dtypes + category
        desc1 = df.describe(include="all")
        desc2 = df.describe(include=[np.generic, "category"])
        assert_frame_equal(desc1, desc2)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_setitem(self):

        # int/positional
        c = self.factor.copy()
        c[0] = 'b'
        self.assertEqual(c[0], 'b')
        c[-1] = 'a'
        self.assertEqual(c[-1], 'a')

        # boolean
        c = self.factor.copy()
        indexer = np.zeros(len(c), dtype='bool')
        indexer[0] = True
        indexer[-1] = True
        c[indexer] = 'c'
        expected = Categorical.from_array(['c', 'b', 'b', 'a',
                                           'a', 'c', 'c', 'c'], ordered=True)

        self.assert_categorical_equal(c, expected)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_constructor_unsortable(self):

        # it works!
        arr = np.array([1, 2, 3, datetime.now()], dtype='O')
        factor = Categorical.from_array(arr, ordered=False)
        self.assertFalse(factor.ordered)

        if compat.PY3:
            self.assertRaises(
                TypeError, lambda: Categorical.from_array(arr, ordered=True))
        else:
            # this however will raise as cannot be sorted (on PY3 or older
            # numpies)
            if LooseVersion(np.__version__) < "1.10":
                self.assertRaises(
                    TypeError,
                    lambda: Categorical.from_array(arr, ordered=True))
            else:
                Categorical.from_array(arr, ordered=True)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_is_equal_dtype(self):

        # test dtype comparisons between cats

        c1 = Categorical(list('aabca'), categories=list('abc'), ordered=False)
        c2 = Categorical(list('aabca'), categories=list('cab'), ordered=False)
        c3 = Categorical(list('aabca'), categories=list('cab'), ordered=True)
        self.assertTrue(c1.is_dtype_equal(c1))
        self.assertTrue(c2.is_dtype_equal(c2))
        self.assertTrue(c3.is_dtype_equal(c3))
        self.assertFalse(c1.is_dtype_equal(c2))
        self.assertFalse(c1.is_dtype_equal(c3))
        self.assertFalse(c1.is_dtype_equal(Index(list('aabca'))))
        self.assertFalse(c1.is_dtype_equal(c1.astype(object)))
        self.assertTrue(c1.is_dtype_equal(CategoricalIndex(c1)))
        self.assertFalse(c1.is_dtype_equal(
            CategoricalIndex(c1, categories=list('cab'))))
        self.assertFalse(c1.is_dtype_equal(CategoricalIndex(c1, ordered=True)))
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_constructor_with_generator(self):
        # This was raising an Error in isnull(single_val).any() because isnull
        # returned a scalar for a generator
        xrange = range

        exp = Categorical([0, 1, 2])
        cat = Categorical((x for x in [0, 1, 2]))
        self.assertTrue(cat.equals(exp))
        cat = Categorical(xrange(3))
        self.assertTrue(cat.equals(exp))

        # This uses xrange internally
        from pandas.core.index import MultiIndex
        MultiIndex.from_product([range(5), ['a', 'b', 'c']])

        # check that categories accept generators and sequences
        cat = pd.Categorical([0, 1, 2], categories=(x for x in [0, 1, 2]))
        self.assertTrue(cat.equals(exp))
        cat = pd.Categorical([0, 1, 2], categories=xrange(3))
        self.assertTrue(cat.equals(exp))
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_empty_print(self):
        factor = Categorical([], ["a", "b", "c"])
        expected = ("[], Categories (3, object): [a, b, c]")
        # hack because array_repr changed in numpy > 1.6.x
        actual = repr(factor)
        self.assertEqual(actual, expected)

        self.assertEqual(expected, actual)
        factor = Categorical([], ["a", "b", "c"], ordered=True)
        expected = ("[], Categories (3, object): [a < b < c]")
        actual = repr(factor)
        self.assertEqual(expected, actual)

        factor = Categorical([], [])
        expected = ("[], Categories (0, object): []")
        self.assertEqual(expected, repr(factor))
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_categories_assigments(self):
        s = pd.Categorical(["a", "b", "c", "a"])
        exp = np.array([1, 2, 3, 1])
        s.categories = [1, 2, 3]
        self.assert_numpy_array_equal(s.__array__(), exp)
        self.assert_numpy_array_equal(s.categories, np.array([1, 2, 3]))

        # lengthen
        def f():
            s.categories = [1, 2, 3, 4]

        self.assertRaises(ValueError, f)

        # shorten
        def f():
            s.categories = [1, 2]

        self.assertRaises(ValueError, f)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_ordered_api(self):
        # GH 9347
        cat1 = pd.Categorical(["a", "c", "b"], ordered=False)
        self.assertTrue(cat1.categories.equals(Index(['a', 'b', 'c'])))
        self.assertFalse(cat1.ordered)

        cat2 = pd.Categorical(["a", "c", "b"], categories=['b', 'c', 'a'],
                              ordered=False)
        self.assertTrue(cat2.categories.equals(Index(['b', 'c', 'a'])))
        self.assertFalse(cat2.ordered)

        cat3 = pd.Categorical(["a", "c", "b"], ordered=True)
        self.assertTrue(cat3.categories.equals(Index(['a', 'b', 'c'])))
        self.assertTrue(cat3.ordered)

        cat4 = pd.Categorical(["a", "c", "b"], categories=['b', 'c', 'a'],
                              ordered=True)
        self.assertTrue(cat4.categories.equals(Index(['b', 'c', 'a'])))
        self.assertTrue(cat4.ordered)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_remove_categories(self):
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        old = cat.copy()
        new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"],
                          ordered=True)

        # first inplace == False
        res = cat.remove_categories("c")
        self.assert_categorical_equal(cat, old)
        self.assert_categorical_equal(res, new)

        res = cat.remove_categories(["c"])
        self.assert_categorical_equal(cat, old)
        self.assert_categorical_equal(res, new)

        # inplace == True
        res = cat.remove_categories("c", inplace=True)
        self.assert_categorical_equal(cat, new)
        self.assertIsNone(res)

        # removal is not in categories
        def f():
            cat.remove_categories(["c"])

        self.assertRaises(ValueError, f)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_shift(self):
        # GH 9416
        cat = pd.Categorical(['a', 'b', 'c', 'd', 'a'])

        # shift forward
        sp1 = cat.shift(1)
        xp1 = pd.Categorical([np.nan, 'a', 'b', 'c', 'd'])
        self.assert_categorical_equal(sp1, xp1)
        self.assert_categorical_equal(cat[:-1], sp1[1:])

        # shift back
        sn2 = cat.shift(-2)
        xp2 = pd.Categorical(['c', 'd', 'a', np.nan, np.nan],
                             categories=['a', 'b', 'c', 'd'])
        self.assert_categorical_equal(sn2, xp2)
        self.assert_categorical_equal(cat[2:], sn2[:-2])

        # shift by zero
        self.assert_categorical_equal(cat, cat.shift(0))
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_dtypes(self):

        # GH8143
        index = ['cat', 'obj', 'num']
        cat = pd.Categorical(['a', 'b', 'c'])
        obj = pd.Series(['a', 'b', 'c'])
        num = pd.Series([1, 2, 3])
        df = pd.concat([pd.Series(cat), obj, num], axis=1, keys=index)

        result = df.dtypes == 'object'
        expected = Series([False, True, False], index=index)
        tm.assert_series_equal(result, expected)

        result = df.dtypes == 'int64'
        expected = Series([False, False, True], index=index)
        tm.assert_series_equal(result, expected)

        result = df.dtypes == 'category'
        expected = Series([True, False, False], index=index)
        tm.assert_series_equal(result, expected)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_codes_dtypes(self):

        # GH 8453
        result = Categorical(['foo', 'bar', 'baz'])
        self.assertTrue(result.codes.dtype == 'int8')

        result = Categorical(['foo%05d' % i for i in range(400)])
        self.assertTrue(result.codes.dtype == 'int16')

        result = Categorical(['foo%05d' % i for i in range(40000)])
        self.assertTrue(result.codes.dtype == 'int32')

        # adding cats
        result = Categorical(['foo', 'bar', 'baz'])
        self.assertTrue(result.codes.dtype == 'int8')
        result = result.add_categories(['foo%05d' % i for i in range(400)])
        self.assertTrue(result.codes.dtype == 'int16')

        # removing cats
        result = result.remove_categories(['foo%05d' % i for i in range(300)])
        self.assertTrue(result.codes.dtype == 'int8')
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_reshaping(self):

        p = tm.makePanel()
        p['str'] = 'foo'
        df = p.to_frame()
        df['category'] = df['str'].astype('category')
        result = df['category'].unstack()

        c = Categorical(['foo'] * len(p.major_axis))
        expected = DataFrame({'A': c.copy(),
                              'B': c.copy(),
                              'C': c.copy(),
                              'D': c.copy()},
                             columns=Index(list('ABCD'), name='minor'),
                             index=p.major_axis.set_names('major'))
        tm.assert_frame_equal(result, expected)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_reindex(self):

        index = pd.date_range('20000101', periods=3)

        # reindexing to an invalid Categorical
        s = Series(['a', 'b', 'c'], dtype='category')
        result = s.reindex(index)
        expected = Series(Categorical(values=[np.nan, np.nan, np.nan],
                                      categories=['a', 'b', 'c']))
        expected.index = index
        tm.assert_series_equal(result, expected)

        # partial reindexing
        expected = Series(Categorical(values=['b', 'c'], categories=['a', 'b',
                                                                     'c']))
        expected.index = [1, 2]
        result = s.reindex([1, 2])
        tm.assert_series_equal(result, expected)

        expected = Series(Categorical(
            values=['c', np.nan], categories=['a', 'b', 'c']))
        expected.index = [2, 3]
        result = s.reindex([2, 3])
        tm.assert_series_equal(result, expected)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_nan_handling(self):

        # Nans are represented as -1 in labels
        s = Series(Categorical(["a", "b", np.nan, "a"]))
        self.assert_numpy_array_equal(s.cat.categories, np.array(["a", "b"]))
        self.assert_numpy_array_equal(s.values.codes, np.array([0, 1, -1, 0]))

        # If categories have nan included, the label should point to that
        # instead
        with tm.assert_produces_warning(FutureWarning):
            s2 = Series(Categorical(
                ["a", "b", np.nan, "a"], categories=["a", "b", np.nan]))
        self.assert_numpy_array_equal(s2.cat.categories, np.array(
            ["a", "b", np.nan], dtype=np.object_))
        self.assert_numpy_array_equal(s2.values.codes, np.array([0, 1, 2, 0]))

        # Changing categories should also make the replaced category np.nan
        s3 = Series(Categorical(["a", "b", "c", "a"]))
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            s3.cat.categories = ["a", "b", np.nan]
        self.assert_numpy_array_equal(s3.cat.categories, np.array(
            ["a", "b", np.nan], dtype=np.object_))
        self.assert_numpy_array_equal(s3.values.codes, np.array([0, 1, 2, 0]))
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_sequence_like(self):

        # GH 7839
        # make sure can iterate
        df = DataFrame({"id": [1, 2, 3, 4, 5, 6],
                        "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
        df['grade'] = Categorical(df['raw_grade'])

        # basic sequencing testing
        result = list(df.grade.values)
        expected = np.array(df.grade.values).tolist()
        tm.assert_almost_equal(result, expected)

        # iteration
        for t in df.itertuples(index=False):
            str(t)

        for row, s in df.iterrows():
            str(s)

        for c, col in df.iteritems():
            str(s)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_describe(self):

        # Categoricals should not show up together with numerical columns
        result = self.cat.describe()
        self.assertEqual(len(result.columns), 1)

        # In a frame, describe() for the cat should be the same as for string
        # arrays (count, unique, top, freq)

        cat = Categorical(["a", "b", "b", "b"], categories=['a', 'b', 'c'],
                          ordered=True)
        s = Series(cat)
        result = s.describe()
        expected = Series([4, 2, "b", 3],
                          index=['count', 'unique', 'top', 'freq'])
        tm.assert_series_equal(result, expected)

        cat = pd.Series(pd.Categorical(["a", "b", "c", "c"]))
        df3 = pd.DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
        res = df3.describe()
        self.assert_numpy_array_equal(res["cat"].values, res["s"].values)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_repr(self):
        a = pd.Series(pd.Categorical([1, 2, 3, 4]))
        exp = u("0    1\n1    2\n2    3\n3    4\n" +
                "dtype: category\nCategories (4, int64): [1, 2, 3, 4]")

        self.assertEqual(exp, a.__unicode__())

        a = pd.Series(pd.Categorical(["a", "b"] * 25))
        exp = u("0     a\n1     b\n" + "     ..\n" + "48    a\n49    b\n" +
                "dtype: category\nCategories (2, object): [a, b]")
        with option_context("display.max_rows", 5):
            self.assertEqual(exp, repr(a))

        levs = list("abcdefghijklmnopqrstuvwxyz")
        a = pd.Series(pd.Categorical(
            ["a", "b"], categories=levs, ordered=True))
        exp = u("0    a\n1    b\n" + "dtype: category\n"
                "Categories (26, object): [a < b < c < d ... w < x < y < z]")
        self.assertEqual(exp, a.__unicode__())
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_categorical_series_repr(self):
        s = pd.Series(pd.Categorical([1, 2, 3]))
        exp = """0    1
1    2
2    3
dtype: category
Categories (3, int64): [1, 2, 3]"""

        self.assertEqual(repr(s), exp)

        s = pd.Series(pd.Categorical(np.arange(10)))
        exp = """0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: category
Categories (10, int64): [0, 1, 2, 3, ..., 6, 7, 8, 9]"""

        self.assertEqual(repr(s), exp)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_categorical_series_repr_ordered(self):
        s = pd.Series(pd.Categorical([1, 2, 3], ordered=True))
        exp = """0    1
1    2
2    3
dtype: category
Categories (3, int64): [1 < 2 < 3]"""

        self.assertEqual(repr(s), exp)

        s = pd.Series(pd.Categorical(np.arange(10), ordered=True))
        exp = """0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: category
Categories (10, int64): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]"""

        self.assertEqual(repr(s), exp)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_categorical_series_repr_period_ordered(self):
        idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5)
        s = pd.Series(pd.Categorical(idx, ordered=True))
        exp = """0   2011-01-01 09:00
1   2011-01-01 10:00
2   2011-01-01 11:00
3   2011-01-01 12:00
4   2011-01-01 13:00
dtype: category
Categories (5, period): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
                         2011-01-01 13:00]"""

        self.assertEqual(repr(s), exp)

        idx = pd.period_range('2011-01', freq='M', periods=5)
        s = pd.Series(pd.Categorical(idx, ordered=True))
        exp = """0   2011-01
1   2011-02
2   2011-03
3   2011-04
4   2011-05
dtype: category
Categories (5, period): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""

        self.assertEqual(repr(s), exp)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_mode(self):
        s = Series(Categorical([1, 1, 2, 4, 5, 5, 5],
                               categories=[5, 4, 3, 2, 1], ordered=True))
        res = s.mode()
        exp = Series(Categorical([5], categories=[
                     5, 4, 3, 2, 1], ordered=True))
        tm.assert_series_equal(res, exp)
        s = Series(Categorical([1, 1, 1, 4, 5, 5, 5],
                               categories=[5, 4, 3, 2, 1], ordered=True))
        res = s.mode()
        exp = Series(Categorical([5, 1], categories=[
                     5, 4, 3, 2, 1], ordered=True))
        tm.assert_series_equal(res, exp)
        s = Series(Categorical([1, 2, 3, 4, 5], categories=[5, 4, 3, 2, 1],
                               ordered=True))
        res = s.mode()
        exp = Series(Categorical([], categories=[5, 4, 3, 2, 1], ordered=True))
        tm.assert_series_equal(res, exp)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_slicing(self):
        cat = Series(Categorical([1, 2, 3, 4]))
        reversed = cat[::-1]
        exp = np.array([4, 3, 2, 1])
        self.assert_numpy_array_equal(reversed.__array__(), exp)

        df = DataFrame({'value': (np.arange(100) + 1).astype('int64')})
        df['D'] = pd.cut(df.value, bins=[0, 25, 50, 75, 100])

        expected = Series([11, '(0, 25]'], index=['value', 'D'], name=10)
        result = df.iloc[10]
        tm.assert_series_equal(result, expected)

        expected = DataFrame({'value': np.arange(11, 21).astype('int64')},
                             index=np.arange(10, 20).astype('int64'))
        expected['D'] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100])
        result = df.iloc[10:20]
        tm.assert_frame_equal(result, expected)

        expected = Series([9, '(0, 25]'], index=['value', 'D'], name=8)
        result = df.loc[8]
        tm.assert_series_equal(result, expected)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_append(self):
        cat = pd.Categorical(["a", "b"], categories=["a", "b"])
        vals = [1, 2]
        df = pd.DataFrame({"cats": cat, "vals": vals})
        cat2 = pd.Categorical(["a", "b", "a", "b"], categories=["a", "b"])
        vals2 = [1, 2, 1, 2]
        exp = pd.DataFrame({"cats": cat2,
                            "vals": vals2}, index=pd.Index([0, 1, 0, 1]))

        res = df.append(df)
        tm.assert_frame_equal(exp, res)

        # Concat should raise if the two categoricals do not have the same
        # categories
        cat3 = pd.Categorical(["a", "b"], categories=["a", "b", "c"])
        vals3 = [1, 2]
        df_wrong_categories = pd.DataFrame({"cats": cat3, "vals": vals3})

        def f():
            df.append(df_wrong_categories)

        self.assertRaises(ValueError, f)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_pickle_v0_14_1(self):

        # we have the name warning
        # 10482
        with tm.assert_produces_warning(UserWarning):
            cat = pd.Categorical(values=['a', 'b', 'c'],
                                 categories=['a', 'b', 'c', 'd'],
                                 name='foobar', ordered=False)
        pickle_path = os.path.join(tm.get_data_path(),
                                   'categorical_0_14_1.pickle')
        # This code was executed once on v0.14.1 to generate the pickle:
        #
        # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
        #                   name='foobar')
        # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
        #
        self.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_concat_categorical(self):
        # See GH 10177
        df1 = pd.DataFrame(
            np.arange(18, dtype='int64').reshape(6,
                                                 3), columns=["a", "b", "c"])

        df2 = pd.DataFrame(
            np.arange(14, dtype='int64').reshape(7, 2), columns=["a", "c"])
        df2['h'] = pd.Series(pd.Categorical(["one", "one", "two", "one", "two",
                                             "two", "one"]))

        df_concat = pd.concat((df1, df2), axis=0).reset_index(drop=True)

        df_expected = pd.DataFrame(
            {'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12],
             'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan, np.nan, np.nan,
                   np.nan, np.nan],
             'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13]})
        df_expected['h'] = pd.Series(pd.Categorical(
            [None, None, None, None, None, None, "one", "one", "two", "one",
             "two", "two", "one"]))

        tm.assert_frame_equal(df_expected, df_concat)
Project: linearmodels    Author: bashtage    | Project source | File source
def test_categorical(model_and_func):
    formula = 'y ~ 1 + d + x1'
    y = np.random.randn(1000)
    x1 = np.random.randn(1000)
    d = np.random.randint(0, 4, 1000)
    d = pd.Categorical(d)
    data = pd.DataFrame({'y': y, 'x1': x1, 'd': d})
    data['Intercept'] = 1.0
    model, func = model_and_func
    mod = model.from_formula(formula, data)
    res3 = mod.fit()
    res2 = func(formula, data).fit()
    res = model(data.y, data[['Intercept', 'x1', 'd']], None, None).fit()

    assert_allclose(res.rsquared, res2.rsquared)
    assert_allclose(res2.rsquared, res3.rsquared)
    assert mod.formula == formula
Project: linearmodels    Author: bashtage    | Project source | File source
def test_mixed_input(data):
    y = PanelData(data.y)
    nt = y.values2d.shape[0]
    effects = np.random.randint(0, 5, size=nt)
    prim = ['a', 'b', 'c', 'd', 'e']
    temp = {'effect.0': pd.Categorical(pd.Series(effects, index=y.index)),
            'effect.1': pd.Series(np.random.choice(prim, size=nt), index=y.index)}
    effects = pd.DataFrame(temp, index=y.index)
    mod = PanelOLS(data.y, data.x, other_effects=effects)
    mod.fit()

    clusters = np.random.randint(0, y.shape[2] // 2, size=(nt, 2))
    temp = {}
    prim = list(map(lambda s: ''.join(s), list(product(ascii_lowercase, ascii_lowercase))))
    temp['var.cluster.0'] = pd.Series(np.random.choice(prim, size=nt), index=y.index)
    temp['var.cluster.1'] = pd.Series(clusters[:, 1], index=y.index)
    clusters = pd.DataFrame(temp, index=y.index)
    mod.fit(cov_type='clustered', clusters=clusters)
Project: linearmodels    Author: bashtage    | Project source | File source
def test_general_demean_oneway(panel):
    y = PanelData(panel)
    dm1 = y.demean('entity')
    g = pd.DataFrame(y.entity_ids, index=y.index)
    dm2 = y.general_demean(g)
    assert_allclose(dm1.values2d, dm2.values2d)

    dm1 = y.demean('time')
    g = pd.DataFrame(y.time_ids, index=y.index)
    dm2 = y.general_demean(g)
    assert_allclose(dm1.values2d, dm2.values2d)

    g = pd.DataFrame(np.random.randint(0, 10, g.shape), index=y.index)
    dm2 = y.general_demean(g)
    g = pd.Categorical(g.iloc[:, 0])
    d = pd.get_dummies(g)
    dm1 = y.values2d - d @ np.linalg.lstsq(d, y.values2d)[0]
    assert_allclose(dm1, dm2.values2d)
Project: linearmodels    Author: bashtage    | Project source | File source
def test_general_demean_twoway(panel):
    y = PanelData(panel)
    dm1 = y.demean('both')
    g = pd.DataFrame(y.entity_ids, index=y.index)
    g['column2'] = pd.Series(y.time_ids.squeeze(), index=y.index)
    dm2 = y.general_demean(g)
    assert_allclose(dm1.values2d, dm2.values2d)

    g = pd.DataFrame(np.random.randint(0, 10, g.shape), index=y.index)
    dm2 = y.general_demean(g)
    g1 = pd.Categorical(g.iloc[:, 0])
    d1 = pd.get_dummies(g1)
    g2 = pd.Categorical(g.iloc[:, 1])
    d2 = pd.get_dummies(g2, drop_first=True)
    d = np.c_[d1, d2]
    dm1 = y.values2d - d @ np.linalg.lstsq(d, y.values2d)[0]
    assert_allclose(dm1 - dm2.values2d, np.zeros_like(dm2.values2d), atol=1e-7)
Project: linearmodels    Author: bashtage    | Project source | File source
def test_general_weighted_demean_oneway(panel):
    y = PanelData(panel)
    weights = pd.DataFrame(
        np.random.chisquare(10, (y.dataframe.shape[0], 1)) / 10, index=y.index)
    w = PanelData(weights)

    dm1 = y.demean('entity', weights=w)
    g = PanelData(pd.DataFrame(y.entity_ids, index=y.index))
    dm2 = y.general_demean(g, w)
    assert_allclose(dm1.values2d, dm2.values2d)

    dm1 = y.demean('time', weights=w)
    g = PanelData(pd.DataFrame(y.time_ids, index=y.index))
    dm2 = y.general_demean(g, w)
    assert_allclose(dm1.values2d, dm2.values2d)

    g = PanelData(pd.DataFrame(np.random.randint(0, 10, g.dataframe.shape),
                               index=y.index))
    dm2 = y.general_demean(g, w)
    g = pd.Categorical(g.dataframe.iloc[:, 0])
    d = pd.get_dummies(g)
    wd = np.sqrt(w.values2d) * d
    wy = np.sqrt(w.values2d) * y.values2d
    dm1 = wy - wd @ np.linalg.lstsq(wd, wy)[0]
    assert_allclose(dm1, dm2.values2d, atol=1e-14)
Project: linearmodels    Author: bashtage    | Project source | File source
def test_general_unit_weighted_demean_twoway(panel):
    np.random.seed(12345)
    y = PanelData(panel)
    weights = pd.DataFrame(
        np.random.chisquare(10, (y.dataframe.shape[0], 1)) / 10, index=y.index)
    w = PanelData(weights)

    dm1 = y.demean('both', weights=w)
    g = pd.DataFrame(y.entity_ids, index=y.index)
    g['column2'] = pd.Series(y.time_ids.squeeze(), index=y.index)
    dm2 = y.general_demean(g, weights=w)
    assert_allclose(dm1.values2d - dm2.values2d, np.zeros_like(dm2.values2d),
                    atol=1e-7)

    g = pd.DataFrame(np.random.randint(0, 10, g.shape), index=y.index)
    dm2 = y.general_demean(g, weights=w)
    g1 = pd.Categorical(g.iloc[:, 0])
    d1 = pd.get_dummies(g1)
    g2 = pd.Categorical(g.iloc[:, 1])
    d2 = pd.get_dummies(g2, drop_first=True)
    d = np.c_[d1, d2]
    wd = np.sqrt(w.values2d) * d
    wy = np.sqrt(w.values2d) * y.values2d
    dm1 = wy - wd @ np.linalg.lstsq(wd, wy)[0]
    assert_allclose(dm1 - dm2.values2d, np.zeros_like(dm2.values2d), atol=1e-7)
Project: IgDiscover    Author: NBISweden    | Project source | File source
def filtered_table(table,
        v_gene_coverage,  # at least
        j_gene_coverage,  # at least
        v_gene_evalue,  # at most
    ):
    """
    Discard the following rows in the table:
    - no J assigned
    - stop codon found
    - V gene coverage less than v_gene_coverage
    - J gene coverage less than j_gene_coverage
    - V gene E-value greater than v_gene_evalue

    Return the filtered table.
    """
    stats = FilteringStatistics()
    stats.n = len(table)
    # Both V and J must be assigned
    # (Note V_gene and J_gene columns use empty strings instead of NA)
    filtered = table[(table['V_gene'] != '') & (table['J_gene'] != '')][:]
    stats.vjassigned = len(filtered)
    filtered['V_gene'] = pd.Categorical(filtered['V_gene'])

    # Filter out sequences that have a stop codon
    filtered = filtered[filtered.stop == 'no']
    stats.stop = len(filtered)

    # Filter out sequences with a too low V gene hit E-value
    filtered = filtered[filtered.V_evalue <= v_gene_evalue]
    stats.v_evalue = len(filtered)

    # Filter out sequences with too low V gene coverage
    filtered = filtered[filtered.V_covered >= v_gene_coverage]
    stats.v_coverage = len(filtered)

    # Filter out sequences with too low J gene coverage
    filtered = filtered[filtered.J_covered >= j_gene_coverage]
    stats.j_coverage = len(filtered)

    return filtered, stats
Project: GOS    Author: crcresearch    | Project source | File source
def generate_agents(df, country, population):
    """
    Generate a dataframe of agents for a country where population
    is the number of agents to be created.
    """
    def max_value(attribute):
        return df[attribute].max()
    # Turn this on for truly random output from each process.
    # pid = mp.current_process()._identity[0]
    rand = np.random.mtrand.RandomState(0)
    country_data = df[df.index == country].to_dict("records")[0]
    gdp = country_data["GDP"]
    income_array = gdp / 10 * rand.chisquare(10, population).astype('float32')
    unemployment_rate = float(country_data["Unemployment"] / 100.0)
    employment_array = rand.choice([True, False], population,
                                   p=[1 - unemployment_rate, unemployment_rate])
    attachment_array = (country_data["Fertility"] *
                        rand.triangular(0.0, 0.5, 1.0, population) /
                        max_value("Fertility")).astype('float32')
    frame = pd.DataFrame({
        "Country": pd.Categorical([country] * population, list(df.index)),
        "Income": income_array,
        "Employed": employment_array.astype('bool'),
        "Attachment": attachment_array,
        "Location": pd.Categorical([country] * population, list(df.index)),
        "Migration": 0,
    }, columns=world_columns)
    return frame
Project: autonomio    Author: autonomio    | Project source | File source
def y_transform(Y, data, flatten):

    df_y = data[Y]

    # if user input 'int' then function will be "greater than value"
    # if user input 'float' then function will be IQR range

    # below is for case where prediction is true or false
    # but the y-feature is in different format (e.g continuous)

    if flatten == 'mean':
        df_y = pd.DataFrame(df_y >= df_y.mean())
    elif flatten == 'median':
        df_y = pd.DataFrame(df_y >= df_y.median())
    elif flatten == 'mode':
        df_y = pd.DataFrame(df_y >= df_y.mode()[0])
    elif type(flatten) == int:
        df_y = pd.DataFrame(df_y >= flatten)
    elif type(flatten) == float:
        df_y = pd.DataFrame(df_y >= df_y.quantile(flatten))

    # below is for case where the y-feature is converted in
    # to a categorical, either if it's a number or string.

    elif flatten == 'cat_string':
        df_y = pd.Categorical(df_y)
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)

    elif flatten == 'cat_numeric':
        df_y = pd.qcut(df_y, 5, duplicates='drop')
        df_y = pd.DataFrame(pd.Series(df_y).cat.codes)

    # for cases when y-feature is already in the format
    # where the prediction output will be.

    elif flatten == 'none':
        df_y = pd.DataFrame(df_y)

    return df_y
Project: plotnine    Author: has2k1    | Project source | File source
def get_scale(self, gg):
        """
        Create a scale
        """
        # This method does some introspection to save users from
        # scale mismatch error. This could happen when the
        # aesthetic is mapped to a categorical but the limits
        # are not provided in categorical form. We only handle
        # the case where the mapping uses an expression to
        # convert to categorical, e.g. `aes(color='factor(cyl)')`.
        # However if `'cyl'` column is a categorical and the
        # mapping is `aes(color='cyl')`, that will result in
        # an error. If the latter case proves common enough then we
        # could inspect the data and be clever based on that too!!
        ae = self.aesthetic
        series = pd.Series(self.limits)
        ae_values = []

        # Look through all the mappings for this aesthetic,
        # if we detect any factor stuff then we convert the
        # limits data to categorical so that the right scale
        # can be chosen. This should take care of the most
        # common use cases.
        for layer in gg.layers:
            with suppress(KeyError):
                value = layer.mapping[ae]
                if isinstance(value, six.string_types):
                    ae_values.append(value)

        for value in ae_values:
            if ('factor(' in value or
                    'Categorical(' in value):
                series = pd.Categorical(series)
                break
        return make_scale(self.aesthetic,
                          series,
                          limits=self.limits,
                          trans=self.trans)
Project: dask-ml    Author: dask    | Project source | File source
def test_inverse_transform(self):
        de = dpp.DummyEncoder()
        df = dd.from_pandas(pd.DataFrame({"A": np.arange(10),
                                          "B": pd.Categorical(['a'] * 4 +
                                                              ['b'] * 6)}),
                            npartitions=2)
        de.fit(df)
        assert_eq_df(df, de.inverse_transform(de.transform(df)))
        assert_eq_df(df, de.inverse_transform(de.transform(df).values))