针对pulse-transit的工具
This commit is contained in:
		
							
								
								
									
										365
									
								
								dist/client/pandas/tests/arrays/categorical/test_analytics.py
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										365
									
								
								dist/client/pandas/tests/arrays/categorical/test_analytics.py
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1,365 @@
 | 
			
		||||
import re
 | 
			
		||||
import sys
 | 
			
		||||
 | 
			
		||||
import numpy as np
 | 
			
		||||
import pytest
 | 
			
		||||
 | 
			
		||||
from pandas.compat import PYPY
 | 
			
		||||
 | 
			
		||||
from pandas import (
 | 
			
		||||
    Categorical,
 | 
			
		||||
    CategoricalDtype,
 | 
			
		||||
    Index,
 | 
			
		||||
    NaT,
 | 
			
		||||
    Series,
 | 
			
		||||
    date_range,
 | 
			
		||||
)
 | 
			
		||||
import pandas._testing as tm
 | 
			
		||||
from pandas.api.types import is_scalar
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TestCategoricalAnalytics:
 | 
			
		||||
    @pytest.mark.parametrize("aggregation", ["min", "max"])
    def test_min_max_not_ordered_raises(self, aggregation):
        """min/max of an unordered Categorical raise TypeError."""
        unordered = Categorical(["a", "b", "c", "d"], ordered=False)
        msg = f"Categorical is not ordered for operation {aggregation}"

        # Calling the reduction method on the Categorical itself.
        bound_reduction = getattr(unordered, aggregation)
        with pytest.raises(TypeError, match=msg):
            bound_reduction()

        # Reducing with the matching NumPy ufunc is expected to fail the same way.
        ufunc_for = {"min": np.minimum, "max": np.maximum}
        with pytest.raises(TypeError, match=msg):
            ufunc_for[aggregation].reduce(unordered)
 | 
			
		||||
 | 
			
		||||
    def test_min_max_ordered(self, index_or_series_or_array):
 | 
			
		||||
        cat = Categorical(["a", "b", "c", "d"], ordered=True)
 | 
			
		||||
        obj = index_or_series_or_array(cat)
 | 
			
		||||
        _min = obj.min()
 | 
			
		||||
        _max = obj.max()
 | 
			
		||||
        assert _min == "a"
 | 
			
		||||
        assert _max == "d"
 | 
			
		||||
 | 
			
		||||
        assert np.minimum.reduce(obj) == "a"
 | 
			
		||||
        assert np.maximum.reduce(obj) == "d"
 | 
			
		||||
        # TODO: raises if we pass axis=0  (on Index and Categorical, not Series)
 | 
			
		||||
 | 
			
		||||
        cat = Categorical(
 | 
			
		||||
            ["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True
 | 
			
		||||
        )
 | 
			
		||||
        obj = index_or_series_or_array(cat)
 | 
			
		||||
        _min = obj.min()
 | 
			
		||||
        _max = obj.max()
 | 
			
		||||
        assert _min == "d"
 | 
			
		||||
        assert _max == "a"
 | 
			
		||||
        assert np.minimum.reduce(obj) == "d"
 | 
			
		||||
        assert np.maximum.reduce(obj) == "a"
 | 
			
		||||
 | 
			
		||||
    @pytest.mark.parametrize(
        "categories,expected",
        [
            # ``np.nan`` rather than ``np.NaN``: the capitalised alias was removed
            # in NumPy 2.0. On older NumPy both names are the same object, so the
            # identity assertion below is unaffected.
            (list("ABC"), np.nan),
            ([1, 2, 3], np.nan),
            pytest.param(
                Series(date_range("2020-01-01", periods=3), dtype="category"),
                NaT,
                marks=pytest.mark.xfail(
                    reason="https://github.com/pandas-dev/pandas/issues/29962"
                ),
            ),
        ],
    )
    @pytest.mark.parametrize("aggregation", ["min", "max"])
    def test_min_max_ordered_empty(self, categories, expected, aggregation):
        """min/max of an empty ordered Categorical return the expected missing value.

        GH 30227. The datetime-categories case is marked xfail; see the issue
        linked in its ``reason``.
        """
        cat = Categorical([], categories=categories, ordered=True)

        agg_func = getattr(cat, aggregation)
        result = agg_func()
        # Identity, not equality: NaN does not compare equal to itself.
        assert result is expected
 | 
			
		||||
 | 
			
		||||
    @pytest.mark.parametrize(
        "values, categories",
        [(["a", "b", "c", np.nan], list("cba")), ([1, 2, 3, np.nan], [3, 2, 1])],
    )
    @pytest.mark.parametrize("skipna", [True, False])
    @pytest.mark.parametrize("function", ["min", "max"])
    def test_min_max_with_nan(self, values, categories, function, skipna):
        """min/max on data containing NaN, with and without ``skipna``."""
        # GH 25303
        cat = Categorical(values, categories=categories, ordered=True)
        reducer = getattr(cat, function)
        result = reducer(skipna=skipna)

        if skipna is False:
            # NaN is not skipped, so it is the expected result.
            assert result is np.nan
            return

        # With NaN skipped: min is the first listed category, max the third.
        position = 0 if function == "min" else 2
        assert result == categories[position]
 | 
			
		||||
 | 
			
		||||
    @pytest.mark.parametrize("function", ["min", "max"])
    @pytest.mark.parametrize("skipna", [True, False])
    def test_min_max_only_nan(self, function, skipna):
        """min/max of a Categorical holding only NaN is NaN for either ``skipna``."""
        # https://github.com/pandas-dev/pandas/issues/33450
        all_missing = Categorical([np.nan], categories=[1, 2], ordered=True)
        reducer = getattr(all_missing, function)
        outcome = reducer(skipna=skipna)
        assert outcome is np.nan
 | 
			
		||||
 | 
			
		||||
    @pytest.mark.parametrize("method", ["min", "max"])
    def test_deprecate_numeric_only_min_max(self, method):
        """Passing ``numeric_only`` to min/max emits a FutureWarning."""
        # GH 25303
        data = [np.nan, 1, 2, np.nan]
        descending = [5, 4, 3, 2, 1]
        cat = Categorical(data, categories=descending, ordered=True)

        with tm.assert_produces_warning(expected_warning=FutureWarning):
            getattr(cat, method)(numeric_only=True)
 | 
			
		||||
 | 
			
		||||
    @pytest.mark.parametrize("method", ["min", "max"])
    def test_numpy_min_max_raises(self, method):
        """np.min/np.max on an unordered Categorical raise TypeError."""
        unordered = Categorical(["a", "b", "c", "b"], ordered=False)
        msg = (
            f"Categorical is not ordered for operation {method}\n"
            "you can use .as_ordered() to change the Categorical to an ordered one"
        )
        # The message contains regex metacharacters, so match it literally.
        literal_pattern = re.escape(msg)

        # Look up the NumPy function under a separate name instead of rebinding
        # the ``method`` parameter.
        np_func = getattr(np, method)
        with pytest.raises(TypeError, match=literal_pattern):
            np_func(unordered)
 | 
			
		||||
 | 
			
		||||
    @pytest.mark.parametrize("kwarg", ["axis", "out", "keepdims"])
    @pytest.mark.parametrize("method", ["min", "max"])
    def test_numpy_min_max_unsupported_kwargs_raises(self, method, kwarg):
        """np.min/np.max reject ``axis=42``, ``out`` and ``keepdims`` with ValueError."""
        cat = Categorical(["a", "b", "c", "b"], ordered=True)

        if kwarg == "axis":
            # ``axis`` has its own message, different from the generic one.
            msg = r"`axis` must be fewer than the number of dimensions \(1\)"
        else:
            msg = (
                f"the '{kwarg}' parameter is not supported in the pandas implementation "
                f"of {method}"
            )

        np_func = getattr(np, method)
        with pytest.raises(ValueError, match=msg):
            np_func(cat, **{kwarg: 42})
 | 
			
		||||
 | 
			
		||||
    @pytest.mark.parametrize("method, expected", [("min", "a"), ("max", "c")])
    def test_numpy_min_max_axis_equals_none(self, method, expected):
        """np.min/np.max accept ``axis=None`` on an ordered Categorical."""
        ordered_cat = Categorical(["a", "b", "c", "b"], ordered=True)
        np_func = getattr(np, method)
        assert np_func(ordered_cat, axis=None) == expected
 | 
			
		||||
 | 
			
		||||
    @pytest.mark.parametrize(
        "values,categories,exp_mode",
        [
            ([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]),
            ([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]),
            ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]),
            ([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]),
            ([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
            ([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
        ],
    )
    def test_mode(self, values, categories, exp_mode):
        """Categorical.mode warns with FutureWarning and returns the expected modes."""
        cat = Categorical(values, categories=categories, ordered=True)
        expected = Categorical(exp_mode, categories=categories, ordered=True)

        deprecation_msg = "Use Series.mode instead"
        with tm.assert_produces_warning(FutureWarning, match=deprecation_msg):
            result = cat.mode()

        tm.assert_categorical_equal(result, expected)
 | 
			
		||||
 | 
			
		||||
    def test_searchsorted(self, ordered):
        """searchsorted on a Categorical and on the Series wrapping it.

        ``ordered`` is a fixture value passed straight to the Categorical
        constructor.
        """
        # https://github.com/pandas-dev/pandas/issues/8420
        # https://github.com/pandas-dev/pandas/issues/14522
        cat = Categorical(
            ["cheese", "milk", "apple", "bread", "bread"],
            categories=["cheese", "milk", "apple", "bread"],
            ordered=ordered,
        )
        ser = Series(cat)
        # Each scenario below is checked on both objects, Categorical first.
        targets = (cat, ser)

        # Single scalar argument, side='left' (default): scalar result.
        for obj in targets:
            position = obj.searchsorted("apple")
            assert position == 2
            assert is_scalar(position)

        # One-element list, side='left' (default): array result.
        exp = np.array([3], dtype=np.intp)
        for obj in targets:
            tm.assert_numpy_array_equal(obj.searchsorted(["bread"]), exp)

        # Several items, side='right'.
        exp = np.array([3, 5], dtype=np.intp)
        for obj in targets:
            found = obj.searchsorted(["apple", "bread"], side="right")
            tm.assert_numpy_array_equal(found, exp)

        # A single value that is not one of the categories.
        for obj in targets:
            with pytest.raises(TypeError, match="cucumber"):
                obj.searchsorted("cucumber")

        # Several values, one of which is not one of the categories.
        msg = (
            "Cannot setitem on a Categorical with a new category, "
            "set the categories first"
        )
        for obj in targets:
            with pytest.raises(TypeError, match=msg):
                obj.searchsorted(["bread", "cucumber"])
 | 
			
		||||
 | 
			
		||||
    def test_unique(self, ordered):
 | 
			
		||||
        # GH38140
 | 
			
		||||
        dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered)
 | 
			
		||||
 | 
			
		||||
        # categories are reordered based on value when ordered=False
 | 
			
		||||
        cat = Categorical(["a", "b", "c"], dtype=dtype)
 | 
			
		||||
        res = cat.unique()
 | 
			
		||||
        tm.assert_categorical_equal(res, cat)
 | 
			
		||||
 | 
			
		||||
        cat = Categorical(["a", "b", "a", "a"], dtype=dtype)
 | 
			
		||||
        res = cat.unique()
 | 
			
		||||
        tm.assert_categorical_equal(res, Categorical(["a", "b"], dtype=dtype))
 | 
			
		||||
 | 
			
		||||
        cat = Categorical(["c", "a", "b", "a", "a"], dtype=dtype)
 | 
			
		||||
        res = cat.unique()
 | 
			
		||||
        exp_cat = Categorical(["c", "a", "b"], dtype=dtype)
 | 
			
		||||
        tm.assert_categorical_equal(res, exp_cat)
 | 
			
		||||
 | 
			
		||||
        # nan must be removed
 | 
			
		||||
        cat = Categorical(["b", np.nan, "b", np.nan, "a"], dtype=dtype)
 | 
			
		||||
        res = cat.unique()
 | 
			
		||||
        exp_cat = Categorical(["b", np.nan, "a"], dtype=dtype)
 | 
			
		||||
        tm.assert_categorical_equal(res, exp_cat)
 | 
			
		||||
 | 
			
		||||
    def test_unique_index_series(self, ordered):
 | 
			
		||||
        # GH38140
 | 
			
		||||
        dtype = CategoricalDtype([3, 2, 1], ordered=ordered)
 | 
			
		||||
 | 
			
		||||
        c = Categorical([3, 1, 2, 2, 1], dtype=dtype)
 | 
			
		||||
        # Categorical.unique sorts categories by appearance order
 | 
			
		||||
        # if ordered=False
 | 
			
		||||
        exp = Categorical([3, 1, 2], dtype=dtype)
 | 
			
		||||
        tm.assert_categorical_equal(c.unique(), exp)
 | 
			
		||||
 | 
			
		||||
        tm.assert_index_equal(Index(c).unique(), Index(exp))
 | 
			
		||||
        tm.assert_categorical_equal(Series(c).unique(), exp)
 | 
			
		||||
 | 
			
		||||
        c = Categorical([1, 1, 2, 2], dtype=dtype)
 | 
			
		||||
        exp = Categorical([1, 2], dtype=dtype)
 | 
			
		||||
        tm.assert_categorical_equal(c.unique(), exp)
 | 
			
		||||
        tm.assert_index_equal(Index(c).unique(), Index(exp))
 | 
			
		||||
        tm.assert_categorical_equal(Series(c).unique(), exp)
 | 
			
		||||
 | 
			
		||||
    def test_shift(self):
 | 
			
		||||
        # GH 9416
 | 
			
		||||
        cat = Categorical(["a", "b", "c", "d", "a"])
 | 
			
		||||
 | 
			
		||||
        # shift forward
 | 
			
		||||
        sp1 = cat.shift(1)
 | 
			
		||||
        xp1 = Categorical([np.nan, "a", "b", "c", "d"])
 | 
			
		||||
        tm.assert_categorical_equal(sp1, xp1)
 | 
			
		||||
        tm.assert_categorical_equal(cat[:-1], sp1[1:])
 | 
			
		||||
 | 
			
		||||
        # shift back
 | 
			
		||||
        sn2 = cat.shift(-2)
 | 
			
		||||
        xp2 = Categorical(
 | 
			
		||||
            ["c", "d", "a", np.nan, np.nan], categories=["a", "b", "c", "d"]
 | 
			
		||||
        )
 | 
			
		||||
        tm.assert_categorical_equal(sn2, xp2)
 | 
			
		||||
        tm.assert_categorical_equal(cat[2:], sn2[:-2])
 | 
			
		||||
 | 
			
		||||
        # shift by zero
 | 
			
		||||
        tm.assert_categorical_equal(cat, cat.shift(0))
 | 
			
		||||
 | 
			
		||||
    def test_nbytes(self):
 | 
			
		||||
        cat = Categorical([1, 2, 3])
 | 
			
		||||
        exp = 3 + 3 * 8  # 3 int8s for values + 3 int64s for categories
 | 
			
		||||
        assert cat.nbytes == exp
 | 
			
		||||
 | 
			
		||||
    def test_memory_usage(self):
 | 
			
		||||
        cat = Categorical([1, 2, 3])
 | 
			
		||||
 | 
			
		||||
        # .categories is an index, so we include the hashtable
 | 
			
		||||
        assert 0 < cat.nbytes <= cat.memory_usage()
 | 
			
		||||
        assert 0 < cat.nbytes <= cat.memory_usage(deep=True)
 | 
			
		||||
 | 
			
		||||
        cat = Categorical(["foo", "foo", "bar"])
 | 
			
		||||
        assert cat.memory_usage(deep=True) > cat.nbytes
 | 
			
		||||
 | 
			
		||||
        if not PYPY:
 | 
			
		||||
            # sys.getsizeof will call the .memory_usage with
 | 
			
		||||
            # deep=True, and add on some GC overhead
 | 
			
		||||
            diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
 | 
			
		||||
            assert abs(diff) < 100
 | 
			
		||||
 | 
			
		||||
    def test_map(self):
 | 
			
		||||
        c = Categorical(list("ABABC"), categories=list("CBA"), ordered=True)
 | 
			
		||||
        result = c.map(lambda x: x.lower())
 | 
			
		||||
        exp = Categorical(list("ababc"), categories=list("cba"), ordered=True)
 | 
			
		||||
        tm.assert_categorical_equal(result, exp)
 | 
			
		||||
 | 
			
		||||
        c = Categorical(list("ABABC"), categories=list("ABC"), ordered=False)
 | 
			
		||||
        result = c.map(lambda x: x.lower())
 | 
			
		||||
        exp = Categorical(list("ababc"), categories=list("abc"), ordered=False)
 | 
			
		||||
        tm.assert_categorical_equal(result, exp)
 | 
			
		||||
 | 
			
		||||
        result = c.map(lambda x: 1)
 | 
			
		||||
        # GH 12766: Return an index not an array
 | 
			
		||||
        tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64)))
 | 
			
		||||
 | 
			
		||||
    @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0])
    def test_validate_inplace_raises(self, value):
        """A non-bool ``inplace`` argument is rejected with ValueError."""
        cat = Categorical(["A", "B", "B", "C", "A"])
        msg = (
            'For argument "inplace" expected type bool, '
            f"received type {type(value).__name__}"
        )
        xyz = ["X", "Y", "Z"]
        new = ["D", "E", "F"]

        # (inplace_deprecated, call), exercised in this order. Calls flagged True
        # run inside a FutureWarning check (issue #37643 inplace kwarg deprecated).
        attempts = [
            (False, lambda: cat.set_ordered(value=True, inplace=value)),
            (False, lambda: cat.as_ordered(inplace=value)),
            (False, lambda: cat.as_unordered(inplace=value)),
            (True, lambda: cat.set_categories(xyz, rename=True, inplace=value)),
            (True, lambda: cat.rename_categories(xyz, inplace=value)),
            (True, lambda: cat.reorder_categories(xyz, ordered=True, inplace=value)),
            (True, lambda: cat.add_categories(new_categories=new, inplace=value)),
            (True, lambda: cat.remove_categories(removals=new, inplace=value)),
            (True, lambda: cat.remove_unused_categories(inplace=value)),
            (False, lambda: cat.sort_values(inplace=value)),
        ]

        for inplace_deprecated, attempt in attempts:
            with pytest.raises(ValueError, match=msg):
                if inplace_deprecated:
                    with tm.assert_produces_warning(FutureWarning):
                        attempt()
                else:
                    attempt()
 | 
			
		||||
		Reference in New Issue
	
	Block a user