400 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			400 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import numpy as np
 | 
						|
import pytest
 | 
						|
 | 
						|
import pandas as pd
 | 
						|
from pandas import (
 | 
						|
    Categorical,
 | 
						|
    DataFrame,
 | 
						|
    Series,
 | 
						|
    Timestamp,
 | 
						|
    date_range,
 | 
						|
)
 | 
						|
import pandas._testing as tm
 | 
						|
 | 
						|
 | 
						|
class TestDataFrameDescribe:
 | 
						|
    def test_describe_bool_in_mixed_frame(self):
 | 
						|
        df = DataFrame(
 | 
						|
            {
 | 
						|
                "string_data": ["a", "b", "c", "d", "e"],
 | 
						|
                "bool_data": [True, True, False, False, False],
 | 
						|
                "int_data": [10, 20, 30, 40, 50],
 | 
						|
            }
 | 
						|
        )
 | 
						|
 | 
						|
        # Integer data are included in .describe() output,
 | 
						|
        # Boolean and string data are not.
 | 
						|
        result = df.describe()
 | 
						|
        expected = DataFrame(
 | 
						|
            {"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]},
 | 
						|
            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        # Top value is a boolean value that is False
 | 
						|
        result = df.describe(include=["bool"])
 | 
						|
 | 
						|
        expected = DataFrame(
 | 
						|
            {"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"]
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_describe_empty_object(self):
 | 
						|
        # GH#27183
 | 
						|
        df = DataFrame({"A": [None, None]}, dtype=object)
 | 
						|
        result = df.describe()
 | 
						|
        expected = DataFrame(
 | 
						|
            {"A": [0, 0, np.nan, np.nan]},
 | 
						|
            dtype=object,
 | 
						|
            index=["count", "unique", "top", "freq"],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        result = df.iloc[:0].describe()
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_describe_bool_frame(self):
 | 
						|
        # GH#13891
 | 
						|
        df = DataFrame(
 | 
						|
            {
 | 
						|
                "bool_data_1": [False, False, True, True],
 | 
						|
                "bool_data_2": [False, True, True, True],
 | 
						|
            }
 | 
						|
        )
 | 
						|
        result = df.describe()
 | 
						|
        expected = DataFrame(
 | 
						|
            {"bool_data_1": [4, 2, False, 2], "bool_data_2": [4, 2, True, 3]},
 | 
						|
            index=["count", "unique", "top", "freq"],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        df = DataFrame(
 | 
						|
            {
 | 
						|
                "bool_data": [False, False, True, True, False],
 | 
						|
                "int_data": [0, 1, 2, 3, 4],
 | 
						|
            }
 | 
						|
        )
 | 
						|
        result = df.describe()
 | 
						|
        expected = DataFrame(
 | 
						|
            {"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]},
 | 
						|
            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        df = DataFrame(
 | 
						|
            {"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]}
 | 
						|
        )
 | 
						|
        result = df.describe()
 | 
						|
        expected = DataFrame(
 | 
						|
            {"bool_data": [4, 2, False, 2], "str_data": [4, 3, "a", 2]},
 | 
						|
            index=["count", "unique", "top", "freq"],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_describe_categorical(self):
 | 
						|
        df = DataFrame({"value": np.random.randint(0, 10000, 100)})
 | 
						|
        labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
 | 
						|
        cat_labels = Categorical(labels, labels)
 | 
						|
 | 
						|
        df = df.sort_values(by=["value"], ascending=True)
 | 
						|
        df["value_group"] = pd.cut(
 | 
						|
            df.value, range(0, 10500, 500), right=False, labels=cat_labels
 | 
						|
        )
 | 
						|
        cat = df
 | 
						|
 | 
						|
        # Categoricals should not show up together with numerical columns
 | 
						|
        result = cat.describe()
 | 
						|
        assert len(result.columns) == 1
 | 
						|
 | 
						|
        # In a frame, describe() for the cat should be the same as for string
 | 
						|
        # arrays (count, unique, top, freq)
 | 
						|
 | 
						|
        cat = Categorical(
 | 
						|
            ["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True
 | 
						|
        )
 | 
						|
        s = Series(cat)
 | 
						|
        result = s.describe()
 | 
						|
        expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"])
 | 
						|
        tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
        cat = Series(Categorical(["a", "b", "c", "c"]))
 | 
						|
        df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
 | 
						|
        result = df3.describe()
 | 
						|
        tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)
 | 
						|
 | 
						|
    def test_describe_empty_categorical_column(self):
 | 
						|
        # GH#26397
 | 
						|
        # Ensure the index of an empty categorical DataFrame column
 | 
						|
        # also contains (count, unique, top, freq)
 | 
						|
        df = DataFrame({"empty_col": Categorical([])})
 | 
						|
        result = df.describe()
 | 
						|
        expected = DataFrame(
 | 
						|
            {"empty_col": [0, 0, np.nan, np.nan]},
 | 
						|
            index=["count", "unique", "top", "freq"],
 | 
						|
            dtype="object",
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
        # ensure NaN, not None
 | 
						|
        assert np.isnan(result.iloc[2, 0])
 | 
						|
        assert np.isnan(result.iloc[3, 0])
 | 
						|
 | 
						|
    def test_describe_categorical_columns(self):
 | 
						|
        # GH#11558
 | 
						|
        columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX")
 | 
						|
        df = DataFrame(
 | 
						|
            {
 | 
						|
                "int1": [10, 20, 30, 40, 50],
 | 
						|
                "int2": [10, 20, 30, 40, 50],
 | 
						|
                "obj": ["A", 0, None, "X", 1],
 | 
						|
            },
 | 
						|
            columns=columns,
 | 
						|
        )
 | 
						|
        result = df.describe()
 | 
						|
 | 
						|
        exp_columns = pd.CategoricalIndex(
 | 
						|
            ["int1", "int2"],
 | 
						|
            categories=["int1", "int2", "obj"],
 | 
						|
            ordered=True,
 | 
						|
            name="XXX",
 | 
						|
        )
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50],
 | 
						|
                "int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50],
 | 
						|
            },
 | 
						|
            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
 | 
						|
            columns=exp_columns,
 | 
						|
        )
 | 
						|
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
        tm.assert_categorical_equal(result.columns.values, expected.columns.values)
 | 
						|
 | 
						|
    def test_describe_datetime_columns(self):
 | 
						|
        columns = pd.DatetimeIndex(
 | 
						|
            ["2011-01-01", "2011-02-01", "2011-03-01"],
 | 
						|
            freq="MS",
 | 
						|
            tz="US/Eastern",
 | 
						|
            name="XXX",
 | 
						|
        )
 | 
						|
        df = DataFrame(
 | 
						|
            {
 | 
						|
                0: [10, 20, 30, 40, 50],
 | 
						|
                1: [10, 20, 30, 40, 50],
 | 
						|
                2: ["A", 0, None, "X", 1],
 | 
						|
            }
 | 
						|
        )
 | 
						|
        df.columns = columns
 | 
						|
        result = df.describe()
 | 
						|
 | 
						|
        exp_columns = pd.DatetimeIndex(
 | 
						|
            ["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX"
 | 
						|
        )
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50],
 | 
						|
                1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50],
 | 
						|
            },
 | 
						|
            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
 | 
						|
        )
 | 
						|
        expected.columns = exp_columns
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
        assert result.columns.freq == "MS"
 | 
						|
        assert result.columns.tz == expected.columns.tz
 | 
						|
 | 
						|
    def test_describe_timedelta_values(self):
 | 
						|
        # GH#6145
 | 
						|
        t1 = pd.timedelta_range("1 days", freq="D", periods=5)
 | 
						|
        t2 = pd.timedelta_range("1 hours", freq="H", periods=5)
 | 
						|
        df = DataFrame({"t1": t1, "t2": t2})
 | 
						|
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "t1": [
 | 
						|
                    5,
 | 
						|
                    pd.Timedelta("3 days"),
 | 
						|
                    df.iloc[:, 0].std(),
 | 
						|
                    pd.Timedelta("1 days"),
 | 
						|
                    pd.Timedelta("2 days"),
 | 
						|
                    pd.Timedelta("3 days"),
 | 
						|
                    pd.Timedelta("4 days"),
 | 
						|
                    pd.Timedelta("5 days"),
 | 
						|
                ],
 | 
						|
                "t2": [
 | 
						|
                    5,
 | 
						|
                    pd.Timedelta("3 hours"),
 | 
						|
                    df.iloc[:, 1].std(),
 | 
						|
                    pd.Timedelta("1 hours"),
 | 
						|
                    pd.Timedelta("2 hours"),
 | 
						|
                    pd.Timedelta("3 hours"),
 | 
						|
                    pd.Timedelta("4 hours"),
 | 
						|
                    pd.Timedelta("5 hours"),
 | 
						|
                ],
 | 
						|
            },
 | 
						|
            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
 | 
						|
        )
 | 
						|
 | 
						|
        result = df.describe()
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        exp_repr = (
 | 
						|
            "                              t1                         t2\n"
 | 
						|
            "count                          5                          5\n"
 | 
						|
            "mean             3 days 00:00:00            0 days 03:00:00\n"
 | 
						|
            "std    1 days 13:56:50.394919273  0 days 01:34:52.099788303\n"
 | 
						|
            "min              1 days 00:00:00            0 days 01:00:00\n"
 | 
						|
            "25%              2 days 00:00:00            0 days 02:00:00\n"
 | 
						|
            "50%              3 days 00:00:00            0 days 03:00:00\n"
 | 
						|
            "75%              4 days 00:00:00            0 days 04:00:00\n"
 | 
						|
            "max              5 days 00:00:00            0 days 05:00:00"
 | 
						|
        )
 | 
						|
        assert repr(result) == exp_repr
 | 
						|
 | 
						|
    def test_describe_tz_values(self, tz_naive_fixture):
 | 
						|
        # GH#21332
 | 
						|
        tz = tz_naive_fixture
 | 
						|
        s1 = Series(range(5))
 | 
						|
        start = Timestamp(2018, 1, 1)
 | 
						|
        end = Timestamp(2018, 1, 5)
 | 
						|
        s2 = Series(date_range(start, end, tz=tz))
 | 
						|
        df = DataFrame({"s1": s1, "s2": s2})
 | 
						|
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "s1": [5, 2, 0, 1, 2, 3, 4, 1.581139],
 | 
						|
                "s2": [
 | 
						|
                    5,
 | 
						|
                    Timestamp(2018, 1, 3).tz_localize(tz),
 | 
						|
                    start.tz_localize(tz),
 | 
						|
                    s2[1],
 | 
						|
                    s2[2],
 | 
						|
                    s2[3],
 | 
						|
                    end.tz_localize(tz),
 | 
						|
                    np.nan,
 | 
						|
                ],
 | 
						|
            },
 | 
						|
            index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
 | 
						|
        )
 | 
						|
        result = df.describe(include="all", datetime_is_numeric=True)
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_datetime_is_numeric_includes_datetime(self):
 | 
						|
        df = DataFrame({"a": date_range("2012", periods=3), "b": [1, 2, 3]})
 | 
						|
        result = df.describe(datetime_is_numeric=True)
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "a": [
 | 
						|
                    3,
 | 
						|
                    Timestamp("2012-01-02"),
 | 
						|
                    Timestamp("2012-01-01"),
 | 
						|
                    Timestamp("2012-01-01T12:00:00"),
 | 
						|
                    Timestamp("2012-01-02"),
 | 
						|
                    Timestamp("2012-01-02T12:00:00"),
 | 
						|
                    Timestamp("2012-01-03"),
 | 
						|
                    np.nan,
 | 
						|
                ],
 | 
						|
                "b": [3, 2, 1, 1.5, 2, 2.5, 3, 1],
 | 
						|
            },
 | 
						|
            index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_describe_tz_values2(self):
 | 
						|
        tz = "CET"
 | 
						|
        s1 = Series(range(5))
 | 
						|
        start = Timestamp(2018, 1, 1)
 | 
						|
        end = Timestamp(2018, 1, 5)
 | 
						|
        s2 = Series(date_range(start, end, tz=tz))
 | 
						|
        df = DataFrame({"s1": s1, "s2": s2})
 | 
						|
 | 
						|
        s1_ = s1.describe()
 | 
						|
        s2_ = Series(
 | 
						|
            [
 | 
						|
                5,
 | 
						|
                5,
 | 
						|
                s2.value_counts().index[0],
 | 
						|
                1,
 | 
						|
                start.tz_localize(tz),
 | 
						|
                end.tz_localize(tz),
 | 
						|
            ],
 | 
						|
            index=["count", "unique", "top", "freq", "first", "last"],
 | 
						|
        )
 | 
						|
        idx = [
 | 
						|
            "count",
 | 
						|
            "unique",
 | 
						|
            "top",
 | 
						|
            "freq",
 | 
						|
            "first",
 | 
						|
            "last",
 | 
						|
            "mean",
 | 
						|
            "std",
 | 
						|
            "min",
 | 
						|
            "25%",
 | 
						|
            "50%",
 | 
						|
            "75%",
 | 
						|
            "max",
 | 
						|
        ]
 | 
						|
        expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).loc[idx]
 | 
						|
 | 
						|
        with tm.assert_produces_warning(FutureWarning):
 | 
						|
            result = df.describe(include="all")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_describe_percentiles_integer_idx(self):
 | 
						|
        # GH#26660
 | 
						|
        df = DataFrame({"x": [1]})
 | 
						|
        pct = np.linspace(0, 1, 10 + 1)
 | 
						|
        result = df.describe(percentiles=pct)
 | 
						|
 | 
						|
        expected = DataFrame(
 | 
						|
            {"x": [1.0, 1.0, np.NaN, 1.0, *(1.0 for _ in pct), 1.0]},
 | 
						|
            index=[
 | 
						|
                "count",
 | 
						|
                "mean",
 | 
						|
                "std",
 | 
						|
                "min",
 | 
						|
                "0%",
 | 
						|
                "10%",
 | 
						|
                "20%",
 | 
						|
                "30%",
 | 
						|
                "40%",
 | 
						|
                "50%",
 | 
						|
                "60%",
 | 
						|
                "70%",
 | 
						|
                "80%",
 | 
						|
                "90%",
 | 
						|
                "100%",
 | 
						|
                "max",
 | 
						|
            ],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_describe_does_not_raise_error_for_dictlike_elements(self):
 | 
						|
        # GH#32409
 | 
						|
        df = DataFrame([{"test": {"a": "1"}}, {"test": {"a": "2"}}])
 | 
						|
        expected = DataFrame(
 | 
						|
            {"test": [2, 2, {"a": "1"}, 1]}, index=["count", "unique", "top", "freq"]
 | 
						|
        )
 | 
						|
        result = df.describe()
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    @pytest.mark.parametrize("exclude", ["x", "y", ["x", "y"], ["x", "z"]])
 | 
						|
    def test_describe_when_include_all_exclude_not_allowed(self, exclude):
 | 
						|
        """
 | 
						|
        When include is 'all', then setting exclude != None is not allowed.
 | 
						|
        """
 | 
						|
        df = DataFrame({"x": [1], "y": [2], "z": [3]})
 | 
						|
        msg = "exclude must be None when include is 'all'"
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            df.describe(include="all", exclude=exclude)
 | 
						|
 | 
						|
    def test_describe_with_duplicate_columns(self):
 | 
						|
        df = DataFrame(
 | 
						|
            [[1, 1, 1], [2, 2, 2], [3, 3, 3]],
 | 
						|
            columns=["bar", "a", "a"],
 | 
						|
            dtype="float64",
 | 
						|
        )
 | 
						|
        result = df.describe()
 | 
						|
        ser = df.iloc[:, 0].describe()
 | 
						|
        expected = pd.concat([ser, ser, ser], keys=df.columns, axis=1)
 | 
						|
        tm.assert_frame_equal(result, expected)
 |