# Python source, 241 lines, 8.1 KiB
import numpy as np
 | 
						|
 | 
						|
from pandas.core.dtypes.dtypes import CategoricalDtype
 | 
						|
 | 
						|
import pandas as pd
 | 
						|
from pandas import (
 | 
						|
    Categorical,
 | 
						|
    DataFrame,
 | 
						|
    Series,
 | 
						|
)
 | 
						|
import pandas._testing as tm
 | 
						|
 | 
						|
 | 
						|
class TestCategoricalConcat:
 | 
						|
    def test_categorical_concat(self, sort):
 | 
						|
        # See GH 10177
 | 
						|
        df1 = DataFrame(
 | 
						|
            np.arange(18, dtype="int64").reshape(6, 3), columns=["a", "b", "c"]
 | 
						|
        )
 | 
						|
 | 
						|
        df2 = DataFrame(np.arange(14, dtype="int64").reshape(7, 2), columns=["a", "c"])
 | 
						|
 | 
						|
        cat_values = ["one", "one", "two", "one", "two", "two", "one"]
 | 
						|
        df2["h"] = Series(Categorical(cat_values))
 | 
						|
 | 
						|
        res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort)
 | 
						|
        exp = DataFrame(
 | 
						|
            {
 | 
						|
                "a": [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12],
 | 
						|
                "b": [
 | 
						|
                    1,
 | 
						|
                    4,
 | 
						|
                    7,
 | 
						|
                    10,
 | 
						|
                    13,
 | 
						|
                    16,
 | 
						|
                    np.nan,
 | 
						|
                    np.nan,
 | 
						|
                    np.nan,
 | 
						|
                    np.nan,
 | 
						|
                    np.nan,
 | 
						|
                    np.nan,
 | 
						|
                    np.nan,
 | 
						|
                ],
 | 
						|
                "c": [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13],
 | 
						|
                "h": [None] * 6 + cat_values,
 | 
						|
            }
 | 
						|
        )
 | 
						|
        exp["h"] = exp["h"].astype(df2["h"].dtype)
 | 
						|
        tm.assert_frame_equal(res, exp)
 | 
						|
 | 
						|
    def test_categorical_concat_dtypes(self):
 | 
						|
 | 
						|
        # GH8143
 | 
						|
        index = ["cat", "obj", "num"]
 | 
						|
        cat = Categorical(["a", "b", "c"])
 | 
						|
        obj = Series(["a", "b", "c"])
 | 
						|
        num = Series([1, 2, 3])
 | 
						|
        df = pd.concat([Series(cat), obj, num], axis=1, keys=index)
 | 
						|
 | 
						|
        result = df.dtypes == "object"
 | 
						|
        expected = Series([False, True, False], index=index)
 | 
						|
        tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
        result = df.dtypes == "int64"
 | 
						|
        expected = Series([False, False, True], index=index)
 | 
						|
        tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
        result = df.dtypes == "category"
 | 
						|
        expected = Series([True, False, False], index=index)
 | 
						|
        tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
    def test_concat_categoricalindex(self):
 | 
						|
        # GH 16111, categories that aren't lexsorted
 | 
						|
        categories = [9, 0, 1, 2, 3]
 | 
						|
 | 
						|
        a = Series(1, index=pd.CategoricalIndex([9, 0], categories=categories))
 | 
						|
        b = Series(2, index=pd.CategoricalIndex([0, 1], categories=categories))
 | 
						|
        c = Series(3, index=pd.CategoricalIndex([1, 2], categories=categories))
 | 
						|
 | 
						|
        result = pd.concat([a, b, c], axis=1)
 | 
						|
 | 
						|
        exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories)
 | 
						|
        exp = DataFrame(
 | 
						|
            {
 | 
						|
                0: [1, 1, np.nan, np.nan],
 | 
						|
                1: [np.nan, 2, 2, np.nan],
 | 
						|
                2: [np.nan, np.nan, 3, 3],
 | 
						|
            },
 | 
						|
            columns=[0, 1, 2],
 | 
						|
            index=exp_idx,
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, exp)
 | 
						|
 | 
						|
    def test_categorical_concat_preserve(self):
 | 
						|
 | 
						|
        # GH 8641  series concat not preserving category dtype
 | 
						|
        # GH 13524 can concat different categories
 | 
						|
        s = Series(list("abc"), dtype="category")
 | 
						|
        s2 = Series(list("abd"), dtype="category")
 | 
						|
 | 
						|
        exp = Series(list("abcabd"))
 | 
						|
        res = pd.concat([s, s2], ignore_index=True)
 | 
						|
        tm.assert_series_equal(res, exp)
 | 
						|
 | 
						|
        exp = Series(list("abcabc"), dtype="category")
 | 
						|
        res = pd.concat([s, s], ignore_index=True)
 | 
						|
        tm.assert_series_equal(res, exp)
 | 
						|
 | 
						|
        exp = Series(list("abcabc"), index=[0, 1, 2, 0, 1, 2], dtype="category")
 | 
						|
        res = pd.concat([s, s])
 | 
						|
        tm.assert_series_equal(res, exp)
 | 
						|
 | 
						|
        a = Series(np.arange(6, dtype="int64"))
 | 
						|
        b = Series(list("aabbca"))
 | 
						|
 | 
						|
        df2 = DataFrame({"A": a, "B": b.astype(CategoricalDtype(list("cab")))})
 | 
						|
        res = pd.concat([df2, df2])
 | 
						|
        exp = DataFrame(
 | 
						|
            {
 | 
						|
                "A": pd.concat([a, a]),
 | 
						|
                "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))),
 | 
						|
            }
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(res, exp)
 | 
						|
 | 
						|
    def test_categorical_index_preserver(self):
 | 
						|
 | 
						|
        a = Series(np.arange(6, dtype="int64"))
 | 
						|
        b = Series(list("aabbca"))
 | 
						|
 | 
						|
        df2 = DataFrame(
 | 
						|
            {"A": a, "B": b.astype(CategoricalDtype(list("cab")))}
 | 
						|
        ).set_index("B")
 | 
						|
        result = pd.concat([df2, df2])
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "A": pd.concat([a, a]),
 | 
						|
                "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))),
 | 
						|
            }
 | 
						|
        ).set_index("B")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        # wrong categories -> uses concat_compat, which casts to object
 | 
						|
        df3 = DataFrame(
 | 
						|
            {"A": a, "B": Categorical(b, categories=list("abe"))}
 | 
						|
        ).set_index("B")
 | 
						|
        result = pd.concat([df2, df3])
 | 
						|
        expected = pd.concat(
 | 
						|
            [
 | 
						|
                df2.set_axis(df2.index.astype(object), axis=0),
 | 
						|
                df3.set_axis(df3.index.astype(object), axis=0),
 | 
						|
            ]
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_concat_categorical_tz(self):
 | 
						|
        # GH-23816
 | 
						|
        a = Series(pd.date_range("2017-01-01", periods=2, tz="US/Pacific"))
 | 
						|
        b = Series(["a", "b"], dtype="category")
 | 
						|
        result = pd.concat([a, b], ignore_index=True)
 | 
						|
        expected = Series(
 | 
						|
            [
 | 
						|
                pd.Timestamp("2017-01-01", tz="US/Pacific"),
 | 
						|
                pd.Timestamp("2017-01-02", tz="US/Pacific"),
 | 
						|
                "a",
 | 
						|
                "b",
 | 
						|
            ]
 | 
						|
        )
 | 
						|
        tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
    def test_concat_categorical_unchanged(self):
 | 
						|
        # GH-12007
 | 
						|
        # test fix for when concat on categorical and float
 | 
						|
        # coerces dtype categorical -> float
 | 
						|
        df = DataFrame(Series(["a", "b", "c"], dtype="category", name="A"))
 | 
						|
        ser = Series([0, 1, 2], index=[0, 1, 3], name="B")
 | 
						|
        result = pd.concat([df, ser], axis=1)
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "A": Series(["a", "b", "c", np.nan], dtype="category"),
 | 
						|
                "B": Series([0, 1, np.nan, 2], dtype="float"),
 | 
						|
            }
 | 
						|
        )
 | 
						|
        tm.assert_equal(result, expected)
 | 
						|
 | 
						|
    def test_categorical_concat_gh7864(self):
 | 
						|
        # GH 7864
 | 
						|
        # make sure ordering is preserved
 | 
						|
        df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list("abbaae")})
 | 
						|
        df["grade"] = Categorical(df["raw_grade"])
 | 
						|
        df["grade"].cat.set_categories(["e", "a", "b"])
 | 
						|
 | 
						|
        df1 = df[0:3]
 | 
						|
        df2 = df[3:]
 | 
						|
 | 
						|
        tm.assert_index_equal(df["grade"].cat.categories, df1["grade"].cat.categories)
 | 
						|
        tm.assert_index_equal(df["grade"].cat.categories, df2["grade"].cat.categories)
 | 
						|
 | 
						|
        dfx = pd.concat([df1, df2])
 | 
						|
        tm.assert_index_equal(df["grade"].cat.categories, dfx["grade"].cat.categories)
 | 
						|
 | 
						|
        dfa = df1._append(df2)
 | 
						|
        tm.assert_index_equal(df["grade"].cat.categories, dfa["grade"].cat.categories)
 | 
						|
 | 
						|
    def test_categorical_index_upcast(self):
 | 
						|
        # GH 17629
 | 
						|
        # test upcasting to object when concatinating on categorical indexes
 | 
						|
        # with non-identical categories
 | 
						|
 | 
						|
        a = DataFrame({"foo": [1, 2]}, index=Categorical(["foo", "bar"]))
 | 
						|
        b = DataFrame({"foo": [4, 3]}, index=Categorical(["baz", "bar"]))
 | 
						|
 | 
						|
        res = pd.concat([a, b])
 | 
						|
        exp = DataFrame({"foo": [1, 2, 4, 3]}, index=["foo", "bar", "baz", "bar"])
 | 
						|
 | 
						|
        tm.assert_equal(res, exp)
 | 
						|
 | 
						|
        a = Series([1, 2], index=Categorical(["foo", "bar"]))
 | 
						|
        b = Series([4, 3], index=Categorical(["baz", "bar"]))
 | 
						|
 | 
						|
        res = pd.concat([a, b])
 | 
						|
        exp = Series([1, 2, 4, 3], index=["foo", "bar", "baz", "bar"])
 | 
						|
 | 
						|
        tm.assert_equal(res, exp)
 | 
						|
 | 
						|
    def test_categorical_missing_from_one_frame(self):
 | 
						|
        # GH 25412
 | 
						|
        df1 = DataFrame({"f1": [1, 2, 3]})
 | 
						|
        df2 = DataFrame({"f1": [2, 3, 1], "f2": Series([4, 4, 4]).astype("category")})
 | 
						|
        result = pd.concat([df1, df2], sort=True)
 | 
						|
        dtype = CategoricalDtype([4])
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "f1": [1, 2, 3, 2, 3, 1],
 | 
						|
                "f2": Categorical.from_codes([-1, -1, -1, 0, 0, 0], dtype=dtype),
 | 
						|
            },
 | 
						|
            index=[0, 1, 2, 0, 1, 2],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 |