"""Tests for missing-value (NaN / NA) handling in ``Categorical``."""

import collections

import numpy as np
import pytest

from pandas.core.dtypes.dtypes import CategoricalDtype

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    Index,
    Series,
    isna,
)
import pandas._testing as tm


class TestCategoricalMissing:
    """Tests for how ``Categorical`` represents and fills missing values.

    Covers ``isna``, assigning NaN, ``fillna`` with valid and invalid
    arguments, the ``mode.use_inf_as_na`` option, comparisons between
    categorical Series containing NaN, and construction from all-missing
    input.
    """

    # Shared cases for the ``mode.use_inf_as_na`` tests: the input values and
    # the missing-value mask expected while the option is enabled.
    _INF_AS_NA_CASES = [
        ([1, 2, 3], np.array([False, False, False])),
        ([1, 2, np.nan], np.array([False, False, True])),
        ([1, 2, np.inf], np.array([False, False, True])),
        ([1, 2, pd.NA], np.array([False, False, True])),
    ]

    def test_isna(self):
        """``Categorical.isna`` flags only the NaN entry."""
        cat = Categorical(["a", "b", np.nan])
        result = cat.isna()

        expected = np.array([False, False, True])
        tm.assert_numpy_array_equal(result, expected)

    def test_na_flags_int_categories(self):
        """Entries with code ``-1`` are missing when categories are integers."""
        # #1457
        categories = list(range(10))
        # A seeded generator keeps the test reproducible from run to run.
        codes = np.random.default_rng(2).integers(0, 10, 20)
        codes[::5] = -1

        # ``from_codes`` is the public constructor for raw codes; the
        # ``fastpath`` keyword of ``Categorical(...)`` is deprecated.
        cat = Categorical.from_codes(codes, categories)
        repr(cat)  # smoke check: building the repr must not raise

        tm.assert_numpy_array_equal(isna(cat), codes == -1)

    def test_nan_handling(self):
        """NaN is stored as code ``-1`` and never becomes a category."""
        # NaNs are represented as -1 in codes
        c = Categorical(["a", "b", np.nan, "a"])
        tm.assert_index_equal(c.categories, Index(["a", "b"]))
        tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8))

        # Assigning NaN changes the code only; the categories are untouched.
        c[1] = np.nan
        tm.assert_index_equal(c.categories, Index(["a", "b"]))
        tm.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0], dtype=np.int8))

        # Rebuilding from the same values gives the original encoding again:
        # NaN is still not a category and still maps to code -1.
        c = Categorical(["a", "b", np.nan, "a"])
        tm.assert_index_equal(c.categories, Index(["a", "b"]))
        tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8))

    def test_set_dtype_nans(self):
        """``_set_dtype`` maps NaN and dropped categories to code ``-1``."""
        c = Categorical(["a", "b", np.nan])
        # "b" is absent from the new categories, so it becomes -1 like the NaN.
        result = c._set_dtype(CategoricalDtype(["a", "c"]))
        tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1], dtype="int8"))

    def test_set_item_nan(self):
        """Assigning NaN to an element keeps the existing categories."""
        cat = Categorical([1, 2, 3])
        cat[1] = np.nan

        exp = Categorical([1, np.nan, 3], categories=[1, 2, 3])
        tm.assert_categorical_equal(cat, exp)

    @pytest.mark.parametrize(
        "fillna_kwargs, msg",
        [
            (
                {"value": 1, "method": "ffill"},
                "Cannot specify both 'value' and 'method'.",
            ),
            ({}, "Must specify a fill 'value' or 'method'."),
            ({"method": "bad"}, "Invalid fill method. Expecting .* bad"),
            (
                {"value": Series([1, 2, 3, 4, "a"])},
                "Cannot setitem on a Categorical with a new category",
            ),
        ],
    )
    def test_fillna_raises(self, fillna_kwargs, msg):
        """Invalid ``fillna`` arguments raise with the expected message."""
        # https://github.com/pandas-dev/pandas/issues/19682
        # https://github.com/pandas-dev/pandas/issues/13628
        cat = Categorical([1, 2, 3, None, None])

        # A lone ``value`` that holds an unknown category is a TypeError;
        # every other invalid combination here is a ValueError.
        only_value = len(fillna_kwargs) == 1 and "value" in fillna_kwargs
        err = TypeError if only_value else ValueError

        with pytest.raises(err, match=msg):
            cat.fillna(**fillna_kwargs)

    @pytest.mark.parametrize("named", [True, False])
    def test_fillna_iterable_category(self, named):
        """``fillna`` accepts a tuple-like fill value that is a category."""
        # https://github.com/pandas-dev/pandas/issues/21097
        if named:
            Point = collections.namedtuple("Point", "x y")
        else:

            def Point(*args):
                # plain tuple
                return args

        cat = Categorical(np.array([Point(0, 0), Point(0, 1), None], dtype=object))
        result = cat.fillna(Point(0, 0))
        expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)])

        tm.assert_categorical_equal(result, expected)

        # Case where the Point is not among our categories; we want TypeError,
        #  not NotImplementedError GH#41914
        cat = Categorical(np.array([Point(1, 0), Point(0, 1), None], dtype=object))
        msg = "Cannot setitem on a Categorical with a new category"
        with pytest.raises(TypeError, match=msg):
            cat.fillna(Point(0, 0))

    def test_fillna_array(self):
        """``fillna`` takes a Categorical or ndarray of valid categories."""
        # accept Categorical or ndarray value if it holds appropriate values
        cat = Categorical(["A", "B", "C", None, None])

        other = cat.fillna("C")
        result = cat.fillna(other)
        tm.assert_categorical_equal(result, other)
        assert isna(cat[-1])  # didn't modify original inplace

        other = np.array(["A", "B", "C", "B", "A"])
        result = cat.fillna(other)
        expected = Categorical(["A", "B", "C", "B", "A"], dtype=cat.dtype)
        tm.assert_categorical_equal(result, expected)
        assert isna(cat[-1])  # didn't modify original inplace

    @pytest.mark.parametrize("values, expected", _INF_AS_NA_CASES)
    def test_use_inf_as_na(self, values, expected):
        """With ``mode.use_inf_as_na`` on, ``.isna()`` treats inf as missing."""
        # https://github.com/pandas-dev/pandas/issues/33594
        with pd.option_context("mode.use_inf_as_na", True):
            cat = Categorical(values)
            result = cat.isna()
            tm.assert_numpy_array_equal(result, expected)

            result = Series(cat).isna()
            expected_ser = Series(expected)
            tm.assert_series_equal(result, expected_ser)

            result = DataFrame(cat).isna()
            expected_df = DataFrame(expected_ser)
            tm.assert_frame_equal(result, expected_df)

    @pytest.mark.parametrize("values, expected", _INF_AS_NA_CASES)
    def test_use_inf_as_na_outside_context(self, values, expected):
        """Top-level ``isna`` honours the option for a pre-built Categorical."""
        # https://github.com/pandas-dev/pandas/issues/33594
        # Using isna directly for Categorical will fail in general here
        cat = Categorical(values)  # built before the option is switched on

        with pd.option_context("mode.use_inf_as_na", True):
            result = isna(cat)
            tm.assert_numpy_array_equal(result, expected)

            result = isna(Series(cat))
            expected_ser = Series(expected)
            tm.assert_series_equal(result, expected_ser)

            result = isna(DataFrame(cat))
            expected_df = DataFrame(expected_ser)
            tm.assert_frame_equal(result, expected_df)

    @pytest.mark.parametrize(
        "a1, a2, categories",
        [
            (["a", "b", "c"], [np.nan, "a", "b"], ["a", "b", "c"]),
            ([1, 2, 3], [np.nan, 1, 2], [1, 2, 3]),
        ],
    )
    def test_compare_categorical_with_missing(self, a1, a2, categories):
        """``==`` / ``!=`` on categorical Series match the plain Series result."""
        # GH 28384
        cat_type = CategoricalDtype(categories)

        # !=
        result = Series(a1, dtype=cat_type) != Series(a2, dtype=cat_type)
        expected = Series(a1) != Series(a2)
        tm.assert_series_equal(result, expected)

        # ==
        result = Series(a1, dtype=cat_type) == Series(a2, dtype=cat_type)
        expected = Series(a1) == Series(a2)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "na_value, dtype",
        [
            (pd.NaT, "datetime64[ns]"),
            (None, "float64"),
            (np.nan, "float64"),
            (pd.NA, "float64"),
        ],
    )
    def test_categorical_only_missing_values_no_cast(self, na_value, dtype):
        """All-missing input gives empty categories of the expected dtype."""
        # GH#44900
        result = Categorical([na_value, na_value])
        tm.assert_index_equal(result.categories, Index([], dtype=dtype))