# Viewer metadata: 453 lines, 16 KiB, Python
"""Tests dealing with the ``flags.allows_duplicate_labels`` setting."""
import operator

import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm

# Shared marker: the decorated test (or parameter set) is expected to fail
# because the behavior it checks is not implemented.
not_implemented = pytest.mark.xfail(reason="Not implemented.")
 | 
						|
 | 
						|
# ----------------------------------------------------------------------------
 | 
						|
# Preservation
 | 
						|
 | 
						|
 | 
						|
class TestPreserves:
    """Checks that ``allows_duplicate_labels=False`` survives operations.

    Tests or parameter sets decorated with ``not_implemented`` are expected
    failures.
    """

    @pytest.mark.parametrize(
        "cls, data",
        [
            (pd.Series, np.array([])),
            (pd.Series, [1, 2]),
            (pd.DataFrame, {}),
            (pd.DataFrame, {"A": [1, 2]}),
        ],
    )
    def test_construction_ok(self, cls, data):
        """New objects allow duplicates; ``set_flags`` can turn that off."""
        result = cls(data)
        assert result.flags.allows_duplicate_labels is True

        result = cls(data).set_flags(allows_duplicate_labels=False)
        assert result.flags.allows_duplicate_labels is False

    @pytest.mark.parametrize(
        "func",
        [
            operator.itemgetter(["a"]),
            operator.methodcaller("add", 1),
            operator.methodcaller("rename", str.upper),
            operator.methodcaller("rename", "name"),
            operator.methodcaller("abs"),
            np.abs,
        ],
    )
    def test_preserved_series(self, func):
        """Each ``func`` applied to a flagged Series keeps the flag False."""
        s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
        assert func(s).flags.allows_duplicate_labels is False

    @pytest.mark.parametrize(
        "other", [pd.Series(0, index=["a", "b", "c"]), pd.Series(0, index=["a", "b"])]
    )
    # TODO: frame
    @not_implemented
    def test_align(self, other):
        """Both outputs of ``Series.align`` should have the flag set to False."""
        s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
        a, b = s.align(other)
        assert a.flags.allows_duplicate_labels is False
        assert b.flags.allows_duplicate_labels is False

    def test_preserved_frame(self):
        """``.loc`` selections with list indexers keep the flag False."""
        df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags(
            allows_duplicate_labels=False
        )
        assert df.loc[["a"]].flags.allows_duplicate_labels is False
        assert df.loc[:, ["A", "B"]].flags.allows_duplicate_labels is False

    @not_implemented
    def test_to_frame(self):
        """``Series.to_frame`` should carry the flag over to the frame."""
        s = pd.Series(dtype=float).set_flags(allows_duplicate_labels=False)
        assert s.to_frame().flags.allows_duplicate_labels is False

    @pytest.mark.parametrize("func", ["add", "sub"])
    @pytest.mark.parametrize(
        "frame", [False, pytest.param(True, marks=not_implemented)]
    )
    @pytest.mark.parametrize("other", [1, pd.Series([1, 2], name="A")])
    def test_binops(self, func, other, frame):
        """Binary ops keep the flag False; the frame variant is an xfail."""
        # ``obj`` is a Series unless ``frame`` is set, in which case both
        # operands are converted to DataFrames.
        obj = pd.Series([1, 2], name="A", index=["a", "b"]).set_flags(
            allows_duplicate_labels=False
        )
        if frame:
            obj = obj.to_frame()
        if isinstance(other, pd.Series) and frame:
            other = other.to_frame()
        func = operator.methodcaller(func, other)
        assert obj.flags.allows_duplicate_labels is False
        assert func(obj).flags.allows_duplicate_labels is False

    @not_implemented
    def test_preserve_getitem(self):
        """``[]`` and ``.loc`` selections should keep the flag False."""
        df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False)
        assert df[["A"]].flags.allows_duplicate_labels is False
        assert df["A"].flags.allows_duplicate_labels is False
        assert df.loc[0].flags.allows_duplicate_labels is False
        assert df.loc[[0]].flags.allows_duplicate_labels is False
        assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False

    @pytest.mark.xfail(reason="Unclear behavior.")
    def test_ndframe_getitem_caching_issue(self):
        """A column fetched after flipping the parent's flag should follow it."""
        # NDFrame.__getitem__ will cache the first df['A']. May need to
        # invalidate that cache? Update the cached entries?
        df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False)
        assert df["A"].flags.allows_duplicate_labels is False
        df.flags.allows_duplicate_labels = True
        assert df["A"].flags.allows_duplicate_labels is True

    @pytest.mark.parametrize(
        "objs, kwargs",
        [
            # Series
            (
                [
                    pd.Series(1, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                    pd.Series(2, index=["c", "d"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                ],
                {},
            ),
            (
                [
                    pd.Series(1, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                    pd.Series(2, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                ],
                {"ignore_index": True},
            ),
            (
                [
                    pd.Series(1, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                    pd.Series(2, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                ],
                {"axis": 1},
            ),
            # Frame
            (
                [
                    pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                    pd.DataFrame({"A": [1, 2]}, index=["c", "d"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                ],
                {},
            ),
            (
                [
                    pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                    pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                ],
                {"ignore_index": True},
            ),
            (
                [
                    pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                    pd.DataFrame({"B": [1, 2]}, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                ],
                {"axis": 1},
            ),
            # Series / Frame
            (
                [
                    pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                    pd.Series([1, 2], index=["a", "b"], name="B",).set_flags(
                        allows_duplicate_labels=False,
                    ),
                ],
                {"axis": 1},
            ),
        ],
    )
    def test_concat(self, objs, kwargs):
        """``pd.concat`` of flagged inputs yields a result with the flag False."""
        result = pd.concat(objs, **kwargs)
        assert result.flags.allows_duplicate_labels is False

    @pytest.mark.parametrize(
        "left, right, kwargs, expected",
        [
            # false false false
            pytest.param(
                pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags(
                    allows_duplicate_labels=False
                ),
                pd.DataFrame({"B": [0, 1]}, index=["a", "d"]).set_flags(
                    allows_duplicate_labels=False
                ),
                {"left_index": True, "right_index": True},
                False,
                marks=not_implemented,
            ),
            # false true false
            pytest.param(
                pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags(
                    allows_duplicate_labels=False
                ),
                pd.DataFrame({"B": [0, 1]}, index=["a", "d"]),
                {"left_index": True, "right_index": True},
                False,
                marks=not_implemented,
            ),
            # true true true
            (
                pd.DataFrame({"A": [0, 1]}, index=["a", "b"]),
                pd.DataFrame({"B": [0, 1]}, index=["a", "d"]),
                {"left_index": True, "right_index": True},
                True,
            ),
        ],
    )
    def test_merge(self, left, right, kwargs, expected):
        """``pd.merge`` result flag equals ``expected`` for each left/right mix.

        The comment above each parameter set reads: left flag, right flag,
        expected result flag.
        """
        result = pd.merge(left, right, **kwargs)
        assert result.flags.allows_duplicate_labels is expected

    @not_implemented
    def test_groupby(self):
        """A groupby aggregation should keep the flag False."""
        # XXX: This is under tested
        # TODO:
        #  - apply
        #  - transform
        #  - Should passing a grouper that disallows duplicates propagate?
        df = pd.DataFrame({"A": [1, 2, 3]}).set_flags(allows_duplicate_labels=False)
        result = df.groupby([0, 0, 1]).agg("count")
        assert result.flags.allows_duplicate_labels is False

    @pytest.mark.parametrize("frame", [True, False])
    @not_implemented
    def test_window(self, frame):
        """Rolling, ewm and expanding means should keep the flag False."""
        # Disable duplicate labels through ``set_flags`` like every other
        # test in this class, so that the expected failure comes from the
        # window operations rather than from building the input object.
        df = pd.Series(
            1,
            index=pd.date_range("2000", periods=12),
            name="A",
        ).set_flags(allows_duplicate_labels=False)
        if frame:
            df = df.to_frame()
        assert df.rolling(3).mean().flags.allows_duplicate_labels is False
        assert df.ewm(3).mean().flags.allows_duplicate_labels is False
        assert df.expanding(3).mean().flags.allows_duplicate_labels is False
 | 
						|
 | 
						|
 | 
						|
# ----------------------------------------------------------------------------
 | 
						|
# Raises
 | 
						|
 | 
						|
 | 
						|
class TestRaises:
    """Checks that duplicate labels raise once the flag is set to False.

    Unless noted otherwise, the expected error is
    ``pd.errors.DuplicateLabelError`` with a message matching
    "Index has duplicates.".
    """

    @pytest.mark.parametrize(
        "cls, axes",
        [
            (pd.Series, {"index": ["a", "a"], "dtype": float}),
            (pd.DataFrame, {"index": ["a", "a"]}),
            (pd.DataFrame, {"index": ["a", "a"], "columns": ["b", "b"]}),
            (pd.DataFrame, {"columns": ["b", "b"]}),
        ],
    )
    def test_set_flags_with_duplicates(self, cls, axes):
        """``set_flags(allows_duplicate_labels=False)`` raises on duplicate axes."""
        result = cls(**axes)
        assert result.flags.allows_duplicate_labels is True

        msg = "Index has duplicates."
        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
            cls(**axes).set_flags(allows_duplicate_labels=False)

    @pytest.mark.parametrize(
        "data",
        [
            pd.Series(index=[0, 0], dtype=float),
            pd.DataFrame(index=[0, 0]),
            pd.DataFrame(columns=[0, 0]),
        ],
    )
    def test_setting_allows_duplicate_labels_raises(self, data):
        """Assigning False to the flag raises and leaves the flag True."""
        msg = "Index has duplicates."
        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
            data.flags.allows_duplicate_labels = False

        # The failed assignment must not have changed the flag.
        assert data.flags.allows_duplicate_labels is True

    def test_series_raises(self):
        """Concatenating overlapping Series raises when one disallows duplicates."""
        a = pd.Series(0, index=["a", "b"])
        b = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
        msg = "Index has duplicates."
        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
            pd.concat([a, b])

    @pytest.mark.parametrize(
        "getter, target",
        [
            (operator.itemgetter(["A", "A"]), None),
            # loc
            (operator.itemgetter(["a", "a"]), "loc"),
            pytest.param(
                operator.itemgetter(("a", ["A", "A"])), "loc", marks=not_implemented
            ),
            (operator.itemgetter((["a", "a"], "A")), "loc"),
            # iloc
            (operator.itemgetter([0, 0]), "iloc"),
            pytest.param(
                operator.itemgetter((0, [0, 0])), "iloc", marks=not_implemented
            ),
            pytest.param(operator.itemgetter(([0, 0], 0)), "iloc"),
        ],
    )
    def test_getitem_raises(self, getter, target):
        """Selecting the same label or position twice raises.

        ``target`` names the indexer attribute (``"loc"`` / ``"iloc"``) to
        apply ``getter`` to; ``None`` applies it to the frame itself.
        """
        df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags(
            allows_duplicate_labels=False
        )
        if target:
            # df, df.loc, or df.iloc
            target = getattr(df, target)
        else:
            target = df

        msg = "Index has duplicates."
        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
            getter(target)

    @pytest.mark.parametrize(
        "objs, kwargs",
        [
            (
                [
                    pd.Series(1, index=[0, 1], name="a").set_flags(
                        allows_duplicate_labels=False
                    ),
                    pd.Series(2, index=[0, 1], name="a").set_flags(
                        allows_duplicate_labels=False
                    ),
                ],
                {"axis": 1},
            )
        ],
    )
    def test_concat_raises(self, objs, kwargs):
        """Column-wise concat of two flagged Series sharing a name raises."""
        msg = "Index has duplicates."
        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
            pd.concat(objs, **kwargs)

    @not_implemented
    def test_merge_raises(self):
        """Merging a flagged frame with one whose index repeats should raise."""
        a = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "b", "c"]).set_flags(
            allows_duplicate_labels=False
        )
        b = pd.DataFrame({"B": [0, 1, 2]}, index=["a", "b", "b"])
        msg = "Index has duplicates."
        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
            pd.merge(a, b, left_index=True, right_index=True)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
    "idx",
    [
        pd.Index([1, 1]),
        pd.Index(["a", "a"]),
        pd.Index([1.1, 1.1]),
        pd.PeriodIndex([pd.Period("2000", "D")] * 2),
        pd.DatetimeIndex([pd.Timestamp("2000")] * 2),
        pd.TimedeltaIndex([pd.Timedelta("1D")] * 2),
        pd.CategoricalIndex(["a", "a"]),
        pd.IntervalIndex([pd.Interval(0, 1)] * 2),
        pd.MultiIndex.from_tuples([("a", 1), ("a", 1)]),
    ],
    ids=lambda x: type(x).__name__,
)
def test_raises_basic(idx):
    """Disallowing duplicates raises for each index type holding a repeat.

    ``idx`` is used as a Series index, a DataFrame index and DataFrame
    columns; ``set_flags(allows_duplicate_labels=False)`` must raise
    ``DuplicateLabelError`` in all three positions.
    """
    msg = "Index has duplicates."
    with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
        pd.Series(1, index=idx).set_flags(allows_duplicate_labels=False)

    with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
        pd.DataFrame({"A": [1, 1]}, index=idx).set_flags(allows_duplicate_labels=False)

    with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
        pd.DataFrame([[1, 2]], columns=idx).set_flags(allows_duplicate_labels=False)
 | 
						|
 | 
						|
 | 
						|
def test_format_duplicate_labels_message():
    """``Index._format_duplicate_message`` lists positions per repeated label.

    The expected frame has one row for each of the repeated labels "a" and
    "b" (the unique label "c" is absent), a "positions" column holding the
    integer positions of each repeat, and an index named "label".
    """
    idx = pd.Index(["a", "b", "a", "b", "c"])
    result = idx._format_duplicate_message()
    expected = pd.DataFrame(
        {"positions": [[0, 2], [1, 3]]}, index=pd.Index(["a", "b"], name="label")
    )
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_format_duplicate_labels_message_multi():
    """``_format_duplicate_message`` on a MultiIndex keeps a MultiIndex result.

    The expected frame is indexed by the repeated tuples ("A", "a") and
    ("A", "b") and holds their positions in a "positions" column.
    """
    idx = pd.MultiIndex.from_product([["A"], ["a", "b", "a", "b", "c"]])
    result = idx._format_duplicate_message()
    expected = pd.DataFrame(
        {"positions": [[0, 2], [1, 3]]},
        index=pd.MultiIndex.from_product([["A"], ["a", "b"]]),
    )
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_dataframe_insert_raises():
    """``insert(..., allow_duplicates=True)`` raises on a flagged frame.

    The expected error is a ``ValueError`` whose message matches
    "Cannot specify".
    """
    df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False)
    msg = "Cannot specify"
    with pytest.raises(ValueError, match=msg):
        df.insert(0, "A", [3, 4], allow_duplicates=True)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
    "method, frame_only",
    [
        (operator.methodcaller("set_index", "A", inplace=True), True),
        (operator.methodcaller("set_axis", ["A", "B"], inplace=True), False),
        (operator.methodcaller("reset_index", inplace=True), True),
        (operator.methodcaller("rename", lambda x: x, inplace=True), False),
    ],
)
def test_inplace_raises(method, frame_only):
    """In-place label-changing methods raise on flagged objects.

    ``method`` is applied to a flagged DataFrame and, unless ``frame_only``
    is true, to a flagged Series as well. Each call must raise a
    ``ValueError`` whose message matches "Cannot specify".
    """
    df = pd.DataFrame({"A": [0, 0], "B": [1, 2]}).set_flags(
        allows_duplicate_labels=False
    )
    s = df["A"]
    # Set the flag on the extracted column explicitly instead of relying on
    # the column selection to carry it over.
    s.flags.allows_duplicate_labels = False
    msg = "Cannot specify"

    with pytest.raises(ValueError, match=msg):
        method(df)
    if not frame_only:
        with pytest.raises(ValueError, match=msg):
            method(s)
 | 
						|
 | 
						|
 | 
						|
def test_pickle():
    """Flagged Series and DataFrame compare equal after a pickle round trip."""
    a = pd.Series([1, 2]).set_flags(allows_duplicate_labels=False)
    b = tm.round_trip_pickle(a)
    tm.assert_series_equal(a, b)

    a = pd.DataFrame({"A": []}).set_flags(allows_duplicate_labels=False)
    b = tm.round_trip_pickle(a)
    tm.assert_frame_equal(a, b)
 |