371 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			371 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import itertools
 | 
						|
 | 
						|
import numpy as np
 | 
						|
import pytest
 | 
						|
 | 
						|
import pandas as pd
 | 
						|
from pandas.api.extensions import ExtensionArray
 | 
						|
from pandas.core.internals.blocks import EABackedBlock
 | 
						|
from pandas.tests.extension.base.base import BaseExtensionTests
 | 
						|
 | 
						|
 | 
						|
class BaseReshapingTests(BaseExtensionTests):
 | 
						|
    """Tests for reshaping and concatenation."""
 | 
						|
 | 
						|
    @pytest.mark.parametrize("in_frame", [True, False])
 | 
						|
    def test_concat(self, data, in_frame):
 | 
						|
        wrapped = pd.Series(data)
 | 
						|
        if in_frame:
 | 
						|
            wrapped = pd.DataFrame(wrapped)
 | 
						|
        result = pd.concat([wrapped, wrapped], ignore_index=True)
 | 
						|
 | 
						|
        assert len(result) == len(data) * 2
 | 
						|
 | 
						|
        if in_frame:
 | 
						|
            dtype = result.dtypes[0]
 | 
						|
        else:
 | 
						|
            dtype = result.dtype
 | 
						|
 | 
						|
        assert dtype == data.dtype
 | 
						|
        if hasattr(result._mgr, "blocks"):
 | 
						|
            assert isinstance(result._mgr.blocks[0], EABackedBlock)
 | 
						|
        assert isinstance(result._mgr.arrays[0], ExtensionArray)
 | 
						|
 | 
						|
    @pytest.mark.parametrize("in_frame", [True, False])
    def test_concat_all_na_block(self, data_missing, in_frame):
        """Concatenate a block built from position 1 with one built from position 0.

        The pieces are either Series or single-column ("a") DataFrames,
        depending on ``in_frame``; the result is compared against an object
        built directly from ``data_missing.take([1, 1, 0, 0])``.
        """
        # presumably position 1 of ``data_missing`` is valid and position 0 is
        # missing (that is how the original names them) -- confirm in fixture.
        valid = pd.Series(data_missing.take([1, 1]), index=[0, 1])
        missing = pd.Series(data_missing.take([0, 0]), index=[2, 3])

        if in_frame:
            pieces = [pd.DataFrame({"a": valid}), pd.DataFrame({"a": missing})]
        else:
            pieces = [valid, missing]
        result = pd.concat(pieces)

        expected_values = data_missing.take([1, 1, 0, 0])
        if in_frame:
            self.assert_frame_equal(result, pd.DataFrame({"a": expected_values}))
        else:
            self.assert_series_equal(result, pd.Series(expected_values))
 | 
						|
 | 
						|
    def test_concat_mixed_dtypes(self, data):
        """Concatenate the extension dtype together with int and category data.

        In every case the result is compared with the concatenation of the
        same inputs after casting each of them to object.
        """
        # https://github.com/pandas-dev/pandas/issues/20762
        ext_frame = pd.DataFrame({"A": data[:3]})
        int_frame = pd.DataFrame({"A": [1, 2, 3]})
        cat_frame = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category")
        frames = [ext_frame, int_frame, cat_frame]

        # dataframes
        result = pd.concat(frames)
        expected = pd.concat([frame.astype(object) for frame in frames])
        self.assert_frame_equal(result, expected)

        # series
        result = pd.concat([frame["A"] for frame in frames])
        expected = pd.concat([frame["A"].astype(object) for frame in frames])
        self.assert_series_equal(result, expected)

        # simple test for just EA and one other
        result = pd.concat([ext_frame, int_frame.astype(object)])
        expected = pd.concat(
            [ext_frame.astype("object"), int_frame.astype("object")]
        )
        self.assert_frame_equal(result, expected)

        ext_column = ext_frame["A"]
        int_column = int_frame["A"]
        result = pd.concat([ext_column, int_column.astype(object)])
        expected = pd.concat(
            [ext_column.astype("object"), int_column.astype("object")]
        )
        self.assert_series_equal(result, expected)
 | 
						|
 | 
						|
    def test_concat_columns(self, data, na_value):
        """Concatenate an extension column and an int column along ``axis=1``.

        Both the frame inputs and the bare columns are checked, first with
        matching indexes and then with the int column on index ``[1, 2, 3]``.
        """
        ext_frame = pd.DataFrame({"A": data[:3]})
        int_frame = pd.DataFrame({"B": [1, 2, 3]})

        expected = pd.DataFrame({"A": data[:3], "B": [1, 2, 3]})
        self.assert_frame_equal(pd.concat([ext_frame, int_frame], axis=1), expected)
        columns_only = pd.concat([ext_frame["A"], int_frame["B"]], axis=1)
        self.assert_frame_equal(columns_only, expected)

        # non-aligned
        int_frame = pd.DataFrame({"B": [1, 2, 3]}, index=[1, 2, 3])
        padded_ext = data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype)
        expected = pd.DataFrame({"A": padded_ext, "B": [np.nan, 1, 2, 3]})

        self.assert_frame_equal(pd.concat([ext_frame, int_frame], axis=1), expected)
        columns_only = pd.concat([ext_frame["A"], int_frame["B"]], axis=1)
        self.assert_frame_equal(columns_only, expected)
 | 
						|
 | 
						|
    def test_concat_extension_arrays_copy_false(self, data, na_value):
        """Concatenate two extension-backed frames column-wise with ``copy=False``."""
        # GH 20756
        left = pd.DataFrame({"A": data[:3]})
        right = pd.DataFrame({"B": data[3:7]})

        # The expected "A" column is ``data[:3]`` followed by one ``na_value``.
        padded_a = data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype)
        expected = pd.DataFrame({"A": padded_a, "B": data[3:7]})

        result = pd.concat([left, right], axis=1, copy=False)
        self.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_concat_with_reindex(self, data):
        """Concatenate row-wise two frames that do not share a column name."""
        # GH-33027
        left = pd.DataFrame({"a": data[:5]})
        right = pd.DataFrame({"b": data[:5]})
        result = pd.concat([left, right], ignore_index=True)

        # Each expected column is the first five values plus five ``-1``
        # positions taken with ``allow_fill=True``.
        positions = list(range(5))
        fill_positions = [-1] * 5
        expected = pd.DataFrame(
            {
                "a": data.take(positions + fill_positions, allow_fill=True),
                "b": data.take(fill_positions + positions, allow_fill=True),
            }
        )
        self.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_align(self, data, na_value):
        """Align two Series whose indexes only partly overlap."""
        left_values = data[:3]
        right_values = data[2:5]
        left = pd.Series(left_values)
        right = pd.Series(right_values, index=[1, 2, 3])
        aligned_left, aligned_right = left.align(right)

        # Assumes that the ctor can take a list of scalars of the type
        expected_left = pd.Series(
            data._from_sequence(list(left_values) + [na_value], dtype=data.dtype)
        )
        expected_right = pd.Series(
            data._from_sequence([na_value] + list(right_values), dtype=data.dtype)
        )
        self.assert_series_equal(aligned_left, expected_left)
        self.assert_series_equal(aligned_right, expected_right)
 | 
						|
 | 
						|
    def test_align_frame(self, data, na_value):
        """Align two single-column frames whose indexes only partly overlap."""
        left_values = data[:3]
        right_values = data[2:5]
        left = pd.DataFrame({"A": left_values})
        right = pd.DataFrame({"A": right_values}, index=[1, 2, 3])
        aligned_left, aligned_right = left.align(right)

        # Assumes that the ctor can take a list of scalars of the type
        padded_left = data._from_sequence(
            list(left_values) + [na_value], dtype=data.dtype
        )
        padded_right = data._from_sequence(
            [na_value] + list(right_values), dtype=data.dtype
        )
        self.assert_frame_equal(aligned_left, pd.DataFrame({"A": padded_left}))
        self.assert_frame_equal(aligned_right, pd.DataFrame({"A": padded_right}))
 | 
						|
 | 
						|
    def test_align_series_frame(self, data, na_value):
        """Align an extension Series with a frame that has one more row."""
        # https://github.com/pandas-dev/pandas/issues/20576
        series = pd.Series(data, name="a")
        frame = pd.DataFrame({"col": np.arange(len(series) + 1)})
        aligned_series, aligned_frame = series.align(frame)

        # The expected Series is all of ``data`` followed by one ``na_value``.
        padded = data._from_sequence(list(data) + [na_value], dtype=data.dtype)
        expected_series = pd.Series(padded, name=series.name)

        self.assert_series_equal(aligned_series, expected_series)
        self.assert_frame_equal(aligned_frame, frame)
 | 
						|
 | 
						|
    def test_set_frame_expand_regular_with_extension(self, data):
        """Add an extension column "B" to a frame that has an int column "A"."""
        n_rows = len(data)
        frame = pd.DataFrame({"A": [1] * n_rows})
        frame["B"] = data

        expected = pd.DataFrame({"A": [1] * n_rows, "B": data})
        self.assert_frame_equal(frame, expected)
 | 
						|
 | 
						|
    def test_set_frame_expand_extension_with_regular(self, data):
        """Add an int column "B" to a frame that has an extension column "A"."""
        n_rows = len(data)
        frame = pd.DataFrame({"A": data})
        frame["B"] = [1] * n_rows

        expected = pd.DataFrame({"A": data, "B": [1] * n_rows})
        self.assert_frame_equal(frame, expected)
 | 
						|
 | 
						|
    def test_set_frame_overwrite_object(self, data):
 | 
						|
        # https://github.com/pandas-dev/pandas/issues/20555
 | 
						|
        df = pd.DataFrame({"A": [1] * len(data)}, dtype=object)
 | 
						|
        df["A"] = data
 | 
						|
        assert df.dtypes["A"] == data.dtype
 | 
						|
 | 
						|
    def test_merge(self, data, na_value):
        """Merge a frame holding an extension column, with default and outer ``how``."""
        # GH-20743
        left = pd.DataFrame({"ext": data[:3], "int1": [1, 2, 3], "key": [0, 1, 2]})
        right = pd.DataFrame({"int2": [1, 2, 3, 4], "key": [0, 0, 1, 3]})
        # Column order the expected frames are rearranged into before comparing.
        column_order = ["ext", "int1", "key", "int2"]

        merged = pd.merge(left, right)
        expected_ext = data._from_sequence(
            [data[0], data[0], data[1]], dtype=data.dtype
        )
        expected = pd.DataFrame(
            {
                "int1": [1, 1, 2],
                "int2": [1, 2, 3],
                "key": [0, 0, 1],
                "ext": expected_ext,
            }
        )
        self.assert_frame_equal(merged, expected[column_order])

        merged = pd.merge(left, right, how="outer")
        expected_ext = data._from_sequence(
            [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype
        )
        expected = pd.DataFrame(
            {
                "int1": [1, 1, 2, 3, np.nan],
                "int2": [1, 2, 3, np.nan, 4],
                "key": [0, 0, 1, 2, 3],
                "ext": expected_ext,
            }
        )
        self.assert_frame_equal(merged, expected[column_order])
 | 
						|
 | 
						|
    def test_merge_on_extension_array(self, data):
        """Merge a frame with itself on a key column built from ``data[:2]``."""
        # GH 23020
        first, second = data[:2]
        key = type(data)._from_sequence([first, second], dtype=data.dtype)
        frame = pd.DataFrame({"key": key, "val": [1, 2]})

        merged = pd.merge(frame, frame, on="key")
        expected = pd.DataFrame({"key": key, "val_x": [1, 2], "val_y": [1, 2]})
        self.assert_frame_equal(merged, expected)

        # order
        reordered = frame.iloc[[1, 0]]
        merged = pd.merge(reordered, frame, on="key")
        expected = expected.iloc[[1, 0]].reset_index(drop=True)
        self.assert_frame_equal(merged, expected)
 | 
						|
 | 
						|
    def test_merge_on_extension_array_duplicates(self, data):
        """Merge on an extension key column in which one value appears twice."""
        # GH 23020
        first, second = data[:2]
        key = type(data)._from_sequence([first, second, first], dtype=data.dtype)
        left = pd.DataFrame({"key": key, "val": [1, 2, 3]})
        right = pd.DataFrame({"key": key, "val": [1, 2, 3]})

        merged = pd.merge(left, right, on="key")

        expected_columns = {
            "key": key.take([0, 0, 0, 0, 1]),
            "val_x": [1, 1, 3, 3, 2],
            "val_y": [1, 3, 1, 3, 2],
        }
        self.assert_frame_equal(merged, pd.DataFrame(expected_columns))
 | 
						|
 | 
						|
    @pytest.mark.parametrize(
        "columns",
        [
            ["A", "B"],
            pd.MultiIndex.from_tuples(
                [("A", "a"), ("A", "b")], names=["outer", "inner"]
            ),
        ],
    )
    def test_stack(self, data, columns):
        """Stack a two-column extension frame and compare with the object path.

        The stacked result must keep the dtype of the frame's first column;
        after casting to object it must equal stacking the object-cast frame.
        """
        frame = pd.DataFrame({"A": data[:5], "B": data[:5]})
        frame.columns = columns
        stacked = frame.stack()

        # we need a second astype(object), in case the constructor inferred
        # object -> specialized, as is done for period.
        expected = frame.astype(object).stack().astype(object)

        first_column_dtype = frame.iloc[:, 0].dtype
        if isinstance(expected, pd.Series):
            assert stacked.dtype == first_column_dtype
        else:
            assert all(stacked.dtypes == first_column_dtype)

        self.assert_equal(stacked.astype(object), expected)
 | 
						|
 | 
						|
    @pytest.mark.parametrize(
        "index",
        [
            # Two levels, uniform.
            pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]),
            # Two levels, non-uniform.
            pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]),
            # Three levels, full product.
            pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]),
            # Three levels, non-uniform.
            pd.MultiIndex.from_tuples(
                [
                    ("A", "a", 1),
                    ("A", "b", 0),
                    ("A", "a", 0),
                    ("B", "a", 0),
                    ("B", "c", 1),
                ]
            ),
        ],
    )
    @pytest.mark.parametrize("obj", ["series", "frame"])
    def test_unstack(self, data, index, obj):
        """Unstack a Series or two-column frame over every choice of levels.

        For each ordered selection of fewer than ``index.nlevels`` levels the
        unstacked columns must keep the array type of ``data``, and the result
        cast to object must equal unstacking the object-cast input.
        """
        values = data[: len(index)]
        is_series = obj == "series"
        if is_series:
            stacked = pd.Series(values, index=index)
        else:
            stacked = pd.DataFrame({"A": values, "B": values}, index=index)

        n_levels = index.nlevels
        # For three levels ([0, 1, 2]) this produces:
        # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
        level_choices = [
            choice
            for size in range(1, n_levels)
            for choice in itertools.permutations(range(n_levels), size)
        ]

        for level in level_choices:
            result = stacked.unstack(level=level)
            for col in result.columns:
                assert isinstance(result[col].array, type(values))

            if is_series:
                # We should get the same result with to_frame+unstack+droplevel
                via_frame = stacked.to_frame().unstack(level=level)
                self.assert_frame_equal(result, via_frame.droplevel(0, axis=1))

            expected = stacked.astype(object).unstack(
                level=level, fill_value=values.dtype.na_value
            )
            # ``.dtype`` is only read in the Series case (short-circuit ``and``).
            if is_series and not isinstance(stacked.dtype, pd.SparseDtype):
                # GH#34457 SparseArray.astype(object) gives Sparse[object]
                #  instead of np.dtype(object)
                assert (expected.dtypes == object).all()

            self.assert_frame_equal(result.astype(object), expected)
 | 
						|
 | 
						|
    def test_ravel(self, data):
 | 
						|
        # as long as EA is 1D-only, ravel is a no-op
 | 
						|
        result = data.ravel()
 | 
						|
        assert type(result) == type(data)
 | 
						|
 | 
						|
        # Check that we have a view, not a copy
 | 
						|
        result[0] = result[1]
 | 
						|
        assert data[0] == data[1]
 | 
						|
 | 
						|
    def test_transpose(self, data):
 | 
						|
        result = data.transpose()
 | 
						|
        assert type(result) == type(data)
 | 
						|
 | 
						|
        # check we get a new object
 | 
						|
        assert result is not data
 | 
						|
 | 
						|
        # If we ever _did_ support 2D, shape should be reversed
 | 
						|
        assert result.shape == data.shape[::-1]
 | 
						|
 | 
						|
        # Check that we have a view, not a copy
 | 
						|
        result[0] = result[1]
 | 
						|
        assert data[0] == data[1]
 | 
						|
 | 
						|
    def test_transpose_frame(self, data):
        """Transpose a two-column extension frame and round-trip via ``np.transpose``."""
        row_labels = ["a", "b", "c", "d"]
        frame = pd.DataFrame({"A": data[:4], "B": data[:4]}, index=row_labels)
        transposed = frame.T

        # Each original row becomes a column holding that row's value twice
        # (once per original column, "A" and "B").
        array_type = type(data)
        expected = pd.DataFrame(
            {
                label: array_type._from_sequence([data[pos]] * 2, dtype=data.dtype)
                for pos, label in enumerate(row_labels)
            },
            index=["A", "B"],
        )
        self.assert_frame_equal(transposed, expected)

        # Transposing twice must give back the original frame.
        self.assert_frame_equal(np.transpose(np.transpose(frame)), frame)
        single_column = frame[["A"]]
        self.assert_frame_equal(
            np.transpose(np.transpose(single_column)), single_column
        )
 |