# Python source file: 340 lines, 11 KiB
from itertools import product
 | 
						|
 | 
						|
import numpy as np
 | 
						|
import pytest
 | 
						|
 | 
						|
from pandas._libs import hashtable
 | 
						|
 | 
						|
from pandas import (
 | 
						|
    DatetimeIndex,
 | 
						|
    MultiIndex,
 | 
						|
    Series,
 | 
						|
)
 | 
						|
import pandas._testing as tm
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize("names", [None, ["first", "second"]])
def test_unique(names):
    """MultiIndex.unique drops repeated tuples and keeps the level names.

    Runs once without names and once with ``["first", "second"]``.
    """
    # integer levels: the repeated (1, 1) tuple is dropped, first-seen order kept
    mi = MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]], names=names)

    res = mi.unique()
    exp = MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]], names=mi.names)
    tm.assert_index_equal(res, exp)

    # string levels: only two distinct tuples remain
    mi = MultiIndex.from_arrays([list("aaaa"), list("abab")], names=names)
    res = mi.unique()
    exp = MultiIndex.from_arrays([list("aa"), list("ab")], names=mi.names)
    tm.assert_index_equal(res, exp)

    # every tuple identical: collapses to a single entry
    mi = MultiIndex.from_arrays([list("aaaa"), list("aaaa")], names=names)
    res = mi.unique()
    exp = MultiIndex.from_arrays([["a"], ["a"]], names=mi.names)
    tm.assert_index_equal(res, exp)

    # GH #20568 - empty MI
    mi = MultiIndex.from_arrays([[], []], names=names)
    res = mi.unique()
    tm.assert_index_equal(mi, res)
 | 
						|
 | 
						|
 | 
						|
def test_unique_datetimelike():
    """MultiIndex.unique on a naive and a tz-aware datetime level, both with NaT."""
    idx1 = DatetimeIndex(
        ["2015-01-01", "2015-01-01", "2015-01-01", "2015-01-01", "NaT", "NaT"]
    )
    idx2 = DatetimeIndex(
        ["2015-01-01", "2015-01-01", "2015-01-02", "2015-01-02", "NaT", "2015-01-01"],
        tz="Asia/Tokyo",
    )
    result = MultiIndex.from_arrays([idx1, idx2]).unique()

    # expected: the four distinct (idx1, idx2) pairs in first-seen order;
    # (NaT, NaT) and (NaT, 2015-01-01) are kept as separate entries
    eidx1 = DatetimeIndex(["2015-01-01", "2015-01-01", "NaT", "NaT"])
    eidx2 = DatetimeIndex(
        ["2015-01-01", "2015-01-02", "NaT", "2015-01-01"], tz="Asia/Tokyo"
    )
    exp = MultiIndex.from_arrays([eidx1, eidx2])
    tm.assert_index_equal(result, exp)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize("level", [0, "first", 1, "second"])
def test_unique_level(idx, level):
    """``unique(level=...)`` matches ``get_level_values(level).unique()``.

    ``level`` is given both by position and by name.  ``idx`` is a fixture
    defined outside this file (presumably a MultiIndex whose levels are named
    "first" and "second" - confirm in conftest).
    """
    # GH #17896 - with level= argument
    result = idx.unique(level=level)
    expected = idx.get_level_values(level).unique()
    tm.assert_index_equal(result, expected)

    # With already unique level
    mi = MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]], names=["first", "second"])
    result = mi.unique(level=level)
    expected = mi.get_level_values(level)
    tm.assert_index_equal(result, expected)

    # With empty MI
    mi = MultiIndex.from_arrays([[], []], names=["first", "second"])
    result = mi.unique(level=level)
    expected = mi.get_level_values(level)
    tm.assert_index_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_duplicate_multiindex_codes():
    """Duplicate values inside a single level raise ValueError."""
    # GH 17464
    # Make sure that a MultiIndex with duplicate levels throws a ValueError
    msg = r"Level values must be unique: \[[A', ]+\] on level 0"
    with pytest.raises(ValueError, match=msg):
        # the constructor is expected to raise, so the result is not bound
        MultiIndex([["A"] * 10, range(10)], [[0] * 10, range(10)])

    # And that using set_levels with duplicate levels fails
    mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]])
    msg = r"Level values must be unique: \[[AB', ]+\] on level 0"
    with pytest.raises(ValueError, match=msg):
        # this call is also asserted to emit a FutureWarning
        with tm.assert_produces_warning(FutureWarning):
            mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]], inplace=True)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize("names", [["a", "b", "a"], [1, 1, 2], [1, "a", 1]])
def test_duplicate_level_names(names):
    """Repeated level names are accepted by the constructor and by rename."""
    # GH18872, GH19029
    mi = MultiIndex.from_product([[0, 1]] * 3, names=names)
    assert mi.names == names

    # With .rename()
    mi = MultiIndex.from_product([[0, 1]] * 3)
    mi = mi.rename(names)
    assert mi.names == names

    # With .rename(., level=)
    # exercise both the in-place form and the returning form
    mi.rename(names[1], level=1, inplace=True)
    mi = mi.rename([names[0], names[2]], level=[0, 2])
    assert mi.names == names
 | 
						|
 | 
						|
 | 
						|
def test_duplicate_meta_data():
    """drop_duplicates keeps the level names, whether unset, partial or full."""
    # GH 10115
    mi = MultiIndex(
        levels=[[0, 1], [0, 1, 2]], codes=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]]
    )

    name_variants = [
        mi,
        mi.set_names([None, None]),
        mi.set_names([None, "Num"]),
        mi.set_names(["Upper", "Num"]),
    ]
    for index in name_variants:
        # the codes repeat the (0, 0) pair, so every variant has duplicates
        assert index.has_duplicates
        assert index.drop_duplicates().names == index.names
 | 
						|
 | 
						|
 | 
						|
def test_has_duplicates(idx, idx_dup):
    """``is_unique`` and ``has_duplicates`` are exact, opposite booleans.

    ``idx`` and ``idx_dup`` are fixtures defined outside this file; the
    assertions below expect the former to be unique and the latter not.
    """
    # see fixtures
    assert idx.is_unique is True
    assert idx.has_duplicates is False
    assert idx_dup.is_unique is False
    assert idx_dup.has_duplicates is True

    # the (0, 0) code pair appears twice
    mi = MultiIndex(
        levels=[[0, 1], [0, 1, 2]], codes=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]]
    )
    assert mi.is_unique is False
    assert mi.has_duplicates is True

    # single instance of NaN
    mi_nan = MultiIndex(
        levels=[["a", "b"], [0, 1]], codes=[[-1, 0, 0, 1, 1], [-1, 0, 1, 0, 1]]
    )
    assert mi_nan.is_unique is True
    assert mi_nan.has_duplicates is False

    # multiple instances of NaN
    mi_nan_dup = MultiIndex(
        levels=[["a", "b"], [0, 1]], codes=[[-1, -1, 0, 0, 1, 1], [-1, -1, 0, 1, 0, 1]]
    )
    assert mi_nan_dup.is_unique is False
    assert mi_nan_dup.has_duplicates is True
 | 
						|
 | 
						|
 | 
						|
def test_has_duplicates_from_tuples():
    """Eight-level tuples that differ only in two positions have no duplicates."""
    # GH 9075
    tuples = [
        ("x", "out", "z", 5, "y", "in", "z", 169),
        ("x", "out", "z", 7, "y", "in", "z", 119),
        ("x", "out", "z", 9, "y", "in", "z", 135),
        ("x", "out", "z", 13, "y", "in", "z", 145),
        ("x", "out", "z", 14, "y", "in", "z", 158),
        ("x", "out", "z", 16, "y", "in", "z", 122),
        ("x", "out", "z", 17, "y", "in", "z", 160),
        ("x", "out", "z", 18, "y", "in", "z", 180),
        ("x", "out", "z", 20, "y", "in", "z", 143),
        ("x", "out", "z", 21, "y", "in", "z", 128),
        ("x", "out", "z", 22, "y", "in", "z", 129),
        ("x", "out", "z", 25, "y", "in", "z", 111),
        ("x", "out", "z", 28, "y", "in", "z", 114),
        ("x", "out", "z", 29, "y", "in", "z", 121),
        ("x", "out", "z", 31, "y", "in", "z", 126),
        ("x", "out", "z", 32, "y", "in", "z", 155),
        ("x", "out", "z", 33, "y", "in", "z", 123),
        ("x", "out", "z", 12, "y", "in", "z", 144),
    ]

    mi = MultiIndex.from_tuples(tuples)
    assert not mi.has_duplicates
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize("nlevels", [4, 8])
@pytest.mark.parametrize("with_nulls", [True, False])
def test_has_duplicates_overflow(nlevels, with_nulls):
    """has_duplicates is correct for many wide levels, with and without nulls.

    Builds ``nlevels`` levels of 500 values each plus one two-valued level,
    checks the index has no duplicates, then adds a repeat of the first row
    and checks the duplicate is detected.
    """
    # handle int64 overflow if possible
    # no overflow with 4
    # overflow possible with 8
    codes = np.tile(np.arange(500), 2)
    level = np.arange(500)

    if with_nulls:  # inject some null values
        codes[500] = -1  # common nan value
        codes = [codes.copy() for _ in range(nlevels)]
        for i in range(nlevels):
            # a different extra null position for each level
            codes[i][500 + i - nlevels // 2] = -1

        codes += [np.array([-1, 1]).repeat(500)]
    else:
        codes = [codes] * nlevels + [np.arange(2).repeat(500)]

    levels = [level] * nlevels + [[0, 1]]

    # no dups
    mi = MultiIndex(levels=levels, codes=codes)
    assert not mi.has_duplicates

    # with a dup
    if with_nulls:
        # each code array has 1000 entries, so inserting at position 1000
        # appends a repeat of the first row
        codes = [np.insert(arr, 1000, arr[0]) for arr in codes]
        mi = MultiIndex(levels=levels, codes=codes)
    else:
        values = mi.values.tolist()
        mi = MultiIndex.from_tuples(values + [values[0]])

    assert mi.has_duplicates
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", np.array([False, False, False, True, True, False])),
        ("last", np.array([False, True, True, False, False, False])),
        (False, np.array([False, True, True, True, True, False])),
    ],
)
def test_duplicated(idx_dup, keep, expected):
    """``duplicated(keep=...)`` returns the expected mask for each ``keep``.

    ``idx_dup`` is a fixture defined outside this file; the expected masks
    assume it has six entries.
    """
    result = idx_dup.duplicated(keep=keep)
    tm.assert_numpy_array_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.arm_slow
def test_duplicated_large(keep):
    """MultiIndex.duplicated agrees with hashtable.duplicated on a large index.

    ``keep`` is supplied from outside this file (presumably a fixture covering
    the accepted ``keep`` values - confirm in conftest).
    """
    # GH 9125
    n, k = 200, 5000
    levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
    # one random code array of length k * n per level
    codes = [np.random.choice(n, k * n) for _ in levels]
    mi = MultiIndex(levels=levels, codes=codes)

    result = mi.duplicated(keep=keep)
    expected = hashtable.duplicated(mi.values, keep=keep)
    tm.assert_numpy_array_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_duplicated2():
    """No duplicates are reported when every tuple is distinct, NaN included."""
    # TODO: more informative test name
    # GH5873
    for a in [101, 102]:
        mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
        assert not mi.has_duplicates

        tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(2, dtype="bool"))

    for n in range(1, 6):  # 1st level shape
        for m in range(1, 5):  # 2nd level shape
            # all possible unique combinations, including nan
            codes = product(range(-1, n), range(-1, m))
            mi = MultiIndex(
                levels=[list("abcde")[:n], list("WXYZ")[:m]],
                # shuffle the code pairs, then transpose to one row per level
                codes=np.random.permutation(list(codes)).T,
            )
            assert len(mi) == (n + 1) * (m + 1)
            assert not mi.has_duplicates

            tm.assert_numpy_array_equal(
                mi.duplicated(), np.zeros(len(mi), dtype="bool")
            )
 | 
						|
 | 
						|
 | 
						|
def test_duplicated_drop_duplicates():
    """duplicated and drop_duplicates agree for keep="first", "last" and False."""
    # GH#4060
    # the only repeated tuple is (1, 1), at positions 0 and 3
    idx = MultiIndex.from_arrays(([1, 2, 3, 1, 2, 3], [1, 1, 1, 1, 2, 2]))

    # default keep: the later occurrence is flagged and dropped
    expected = np.array([False, False, False, True, False, False], dtype=bool)
    duplicated = idx.duplicated()
    tm.assert_numpy_array_equal(duplicated, expected)
    assert duplicated.dtype == bool
    expected = MultiIndex.from_arrays(([1, 2, 3, 2, 3], [1, 1, 1, 2, 2]))
    tm.assert_index_equal(idx.drop_duplicates(), expected)

    # keep="last": the earlier occurrence is flagged and dropped
    expected = np.array([True, False, False, False, False, False])
    duplicated = idx.duplicated(keep="last")
    tm.assert_numpy_array_equal(duplicated, expected)
    assert duplicated.dtype == bool
    expected = MultiIndex.from_arrays(([2, 3, 1, 2, 3], [1, 1, 1, 2, 2]))
    tm.assert_index_equal(idx.drop_duplicates(keep="last"), expected)

    # keep=False: both occurrences are flagged and dropped
    expected = np.array([True, False, False, True, False, False])
    duplicated = idx.duplicated(keep=False)
    tm.assert_numpy_array_equal(duplicated, expected)
    assert duplicated.dtype == bool
    expected = MultiIndex.from_arrays(([2, 3, 2, 3], [1, 1, 2, 2]))
    tm.assert_index_equal(idx.drop_duplicates(keep=False), expected)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
    "dtype",
    [
        np.complex64,
        np.complex128,
    ],
)
def test_duplicated_series_complex_numbers(dtype):
    """Series.duplicated flags repeated complex values, NaN entries included."""
    # GH 17927
    expected = Series(
        [False, False, False, True, False, False, False, True, False, True],
        dtype=bool,
    )
    result = Series(
        [
            np.nan + np.nan * 1j,
            0,
            1j,
            1j,
            1,
            1 + 1j,
            1 + 2j,
            1 + 1j,
            np.nan,
            np.nan + np.nan * 1j,
        ],
        dtype=dtype,
    ).duplicated()
    tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_multi_drop_duplicates_pos_args_deprecation():
 | 
						|
    # GH#41485
 | 
						|
    idx = MultiIndex.from_arrays([[1, 2, 3, 1], [1, 2, 3, 1]])
 | 
						|
    msg = (
 | 
						|
        "In a future version of pandas all arguments of "
 | 
						|
        "MultiIndex.drop_duplicates will be keyword-only"
 | 
						|
    )
 | 
						|
    with tm.assert_produces_warning(FutureWarning, match=msg):
 | 
						|
        result = idx.drop_duplicates("last")
 | 
						|
    expected = MultiIndex.from_arrays([[2, 3, 1], [2, 3, 1]])
 | 
						|
    tm.assert_index_equal(expected, result)
 |