674 lines
		
	
	
		
			20 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			674 lines
		
	
	
		
			20 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
"""
 | 
						|
test all other .agg behavior
 | 
						|
"""
 | 
						|
 | 
						|
import datetime as dt
 | 
						|
from functools import partial
 | 
						|
 | 
						|
import numpy as np
 | 
						|
import pytest
 | 
						|
 | 
						|
import pandas as pd
 | 
						|
from pandas import (
 | 
						|
    DataFrame,
 | 
						|
    Index,
 | 
						|
    MultiIndex,
 | 
						|
    PeriodIndex,
 | 
						|
    Series,
 | 
						|
    date_range,
 | 
						|
    period_range,
 | 
						|
)
 | 
						|
import pandas._testing as tm
 | 
						|
from pandas.core.base import SpecificationError
 | 
						|
 | 
						|
from pandas.io.formats.printing import pprint_thing
 | 
						|
 | 
						|
 | 
						|
def test_agg_api():
 | 
						|
    # GH 6337
 | 
						|
    # https://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
 | 
						|
    # different api for agg when passed custom function with mixed frame
 | 
						|
 | 
						|
    df = DataFrame(
 | 
						|
        {
 | 
						|
            "data1": np.random.randn(5),
 | 
						|
            "data2": np.random.randn(5),
 | 
						|
            "key1": ["a", "a", "b", "b", "a"],
 | 
						|
            "key2": ["one", "two", "one", "two", "one"],
 | 
						|
        }
 | 
						|
    )
 | 
						|
    grouped = df.groupby("key1")
 | 
						|
 | 
						|
    def peak_to_peak(arr):
 | 
						|
        return arr.max() - arr.min()
 | 
						|
 | 
						|
    with tm.assert_produces_warning(
 | 
						|
        FutureWarning,
 | 
						|
        match=r"\['key2'\] did not aggregate successfully",
 | 
						|
    ):
 | 
						|
        expected = grouped.agg([peak_to_peak])
 | 
						|
    expected.columns = ["data1", "data2"]
 | 
						|
 | 
						|
    with tm.assert_produces_warning(
 | 
						|
        FutureWarning,
 | 
						|
        match=r"\['key2'\] did not aggregate successfully",
 | 
						|
    ):
 | 
						|
        result = grouped.agg(peak_to_peak)
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_agg_datetimes_mixed():
 | 
						|
    data = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]
 | 
						|
 | 
						|
    df1 = DataFrame(
 | 
						|
        {
 | 
						|
            "key": [x[0] for x in data],
 | 
						|
            "date": [x[1] for x in data],
 | 
						|
            "value": [x[2] for x in data],
 | 
						|
        }
 | 
						|
    )
 | 
						|
 | 
						|
    data = [
 | 
						|
        [
 | 
						|
            row[0],
 | 
						|
            (dt.datetime.strptime(row[1], "%Y-%m-%d").date() if row[1] else None),
 | 
						|
            row[2],
 | 
						|
        ]
 | 
						|
        for row in data
 | 
						|
    ]
 | 
						|
 | 
						|
    df2 = DataFrame(
 | 
						|
        {
 | 
						|
            "key": [x[0] for x in data],
 | 
						|
            "date": [x[1] for x in data],
 | 
						|
            "value": [x[2] for x in data],
 | 
						|
        }
 | 
						|
    )
 | 
						|
 | 
						|
    df1["weights"] = df1["value"] / df1["value"].sum()
 | 
						|
    gb1 = df1.groupby("date").aggregate(np.sum)
 | 
						|
 | 
						|
    df2["weights"] = df1["value"] / df1["value"].sum()
 | 
						|
    gb2 = df2.groupby("date").aggregate(np.sum)
 | 
						|
 | 
						|
    assert len(gb1) == len(gb2)
 | 
						|
 | 
						|
 | 
						|
def test_agg_period_index():
 | 
						|
    prng = period_range("2012-1-1", freq="M", periods=3)
 | 
						|
    df = DataFrame(np.random.randn(3, 2), index=prng)
 | 
						|
    rs = df.groupby(level=0).sum()
 | 
						|
    assert isinstance(rs.index, PeriodIndex)
 | 
						|
 | 
						|
    # GH 3579
 | 
						|
    index = period_range(start="1999-01", periods=5, freq="M")
 | 
						|
    s1 = Series(np.random.rand(len(index)), index=index)
 | 
						|
    s2 = Series(np.random.rand(len(index)), index=index)
 | 
						|
    df = DataFrame.from_dict({"s1": s1, "s2": s2})
 | 
						|
    grouped = df.groupby(df.index.month)
 | 
						|
    list(grouped)
 | 
						|
 | 
						|
 | 
						|
def test_agg_dict_parameter_cast_result_dtypes():
 | 
						|
    # GH 12821
 | 
						|
 | 
						|
    df = DataFrame(
 | 
						|
        {
 | 
						|
            "class": ["A", "A", "B", "B", "C", "C", "D", "D"],
 | 
						|
            "time": date_range("1/1/2011", periods=8, freq="H"),
 | 
						|
        }
 | 
						|
    )
 | 
						|
    df.loc[[0, 1, 2, 5], "time"] = None
 | 
						|
 | 
						|
    # test for `first` function
 | 
						|
    exp = df.loc[[0, 3, 4, 6]].set_index("class")
 | 
						|
    grouped = df.groupby("class")
 | 
						|
    tm.assert_frame_equal(grouped.first(), exp)
 | 
						|
    tm.assert_frame_equal(grouped.agg("first"), exp)
 | 
						|
    tm.assert_frame_equal(grouped.agg({"time": "first"}), exp)
 | 
						|
    tm.assert_series_equal(grouped.time.first(), exp["time"])
 | 
						|
    tm.assert_series_equal(grouped.time.agg("first"), exp["time"])
 | 
						|
 | 
						|
    # test for `last` function
 | 
						|
    exp = df.loc[[0, 3, 4, 7]].set_index("class")
 | 
						|
    grouped = df.groupby("class")
 | 
						|
    tm.assert_frame_equal(grouped.last(), exp)
 | 
						|
    tm.assert_frame_equal(grouped.agg("last"), exp)
 | 
						|
    tm.assert_frame_equal(grouped.agg({"time": "last"}), exp)
 | 
						|
    tm.assert_series_equal(grouped.time.last(), exp["time"])
 | 
						|
    tm.assert_series_equal(grouped.time.agg("last"), exp["time"])
 | 
						|
 | 
						|
    # count
 | 
						|
    exp = Series([2, 2, 2, 2], index=Index(list("ABCD"), name="class"), name="time")
 | 
						|
    tm.assert_series_equal(grouped.time.agg(len), exp)
 | 
						|
    tm.assert_series_equal(grouped.time.size(), exp)
 | 
						|
 | 
						|
    exp = Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time")
 | 
						|
    tm.assert_series_equal(grouped.time.count(), exp)
 | 
						|
 | 
						|
 | 
						|
def test_agg_cast_results_dtypes():
 | 
						|
    # similar to GH12821
 | 
						|
    # xref #11444
 | 
						|
    u = [dt.datetime(2015, x + 1, 1) for x in range(12)]
 | 
						|
    v = list("aaabbbbbbccd")
 | 
						|
    df = DataFrame({"X": v, "Y": u})
 | 
						|
 | 
						|
    result = df.groupby("X")["Y"].agg(len)
 | 
						|
    expected = df.groupby("X")["Y"].count()
 | 
						|
    tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_aggregate_float64_no_int64():
 | 
						|
    # see gh-11199
 | 
						|
    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]})
 | 
						|
 | 
						|
    expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
 | 
						|
    expected.index.name = "b"
 | 
						|
 | 
						|
    result = df.groupby("b")[["a"]].mean()
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
 | 
						|
    expected.index.name = "b"
 | 
						|
 | 
						|
    result = df.groupby("b")[["a", "c"]].mean()
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_aggregate_api_consistency():
 | 
						|
    # GH 9052
 | 
						|
    # make sure that the aggregates via dict
 | 
						|
    # are consistent
 | 
						|
    df = DataFrame(
 | 
						|
        {
 | 
						|
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
 | 
						|
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
 | 
						|
            "C": np.random.randn(8) + 1.0,
 | 
						|
            "D": np.arange(8),
 | 
						|
        }
 | 
						|
    )
 | 
						|
 | 
						|
    grouped = df.groupby(["A", "B"])
 | 
						|
    c_mean = grouped["C"].mean()
 | 
						|
    c_sum = grouped["C"].sum()
 | 
						|
    d_mean = grouped["D"].mean()
 | 
						|
    d_sum = grouped["D"].sum()
 | 
						|
 | 
						|
    result = grouped["D"].agg(["sum", "mean"])
 | 
						|
    expected = pd.concat([d_sum, d_mean], axis=1)
 | 
						|
    expected.columns = ["sum", "mean"]
 | 
						|
    tm.assert_frame_equal(result, expected, check_like=True)
 | 
						|
 | 
						|
    result = grouped.agg([np.sum, np.mean])
 | 
						|
    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
 | 
						|
    expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
 | 
						|
    tm.assert_frame_equal(result, expected, check_like=True)
 | 
						|
 | 
						|
    result = grouped[["D", "C"]].agg([np.sum, np.mean])
 | 
						|
    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
 | 
						|
    expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
 | 
						|
    tm.assert_frame_equal(result, expected, check_like=True)
 | 
						|
 | 
						|
    result = grouped.agg({"C": "mean", "D": "sum"})
 | 
						|
    expected = pd.concat([d_sum, c_mean], axis=1)
 | 
						|
    tm.assert_frame_equal(result, expected, check_like=True)
 | 
						|
 | 
						|
    result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
 | 
						|
    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
 | 
						|
    expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
 | 
						|
 | 
						|
    msg = r"Column\(s\) \['r', 'r2'\] do not exist"
 | 
						|
    with pytest.raises(KeyError, match=msg):
 | 
						|
        grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean})
 | 
						|
 | 
						|
 | 
						|
def test_agg_dict_renaming_deprecation():
 | 
						|
    # 15931
 | 
						|
    df = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})
 | 
						|
 | 
						|
    msg = r"nested renamer is not supported"
 | 
						|
    with pytest.raises(SpecificationError, match=msg):
 | 
						|
        df.groupby("A").agg(
 | 
						|
            {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
 | 
						|
        )
 | 
						|
 | 
						|
    msg = r"Column\(s\) \['ma'\] do not exist"
 | 
						|
    with pytest.raises(KeyError, match=msg):
 | 
						|
        df.groupby("A")[["B", "C"]].agg({"ma": "max"})
 | 
						|
 | 
						|
    msg = r"nested renamer is not supported"
 | 
						|
    with pytest.raises(SpecificationError, match=msg):
 | 
						|
        df.groupby("A").B.agg({"foo": "count"})
 | 
						|
 | 
						|
 | 
						|
def test_agg_compat():
 | 
						|
    # GH 12334
 | 
						|
    df = DataFrame(
 | 
						|
        {
 | 
						|
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
 | 
						|
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
 | 
						|
            "C": np.random.randn(8) + 1.0,
 | 
						|
            "D": np.arange(8),
 | 
						|
        }
 | 
						|
    )
 | 
						|
 | 
						|
    g = df.groupby(["A", "B"])
 | 
						|
 | 
						|
    msg = r"nested renamer is not supported"
 | 
						|
    with pytest.raises(SpecificationError, match=msg):
 | 
						|
        g["D"].agg({"C": ["sum", "std"]})
 | 
						|
 | 
						|
    with pytest.raises(SpecificationError, match=msg):
 | 
						|
        g["D"].agg({"C": "sum", "D": "std"})
 | 
						|
 | 
						|
 | 
						|
def test_agg_nested_dicts():
 | 
						|
    # API change for disallowing these types of nested dicts
 | 
						|
    df = DataFrame(
 | 
						|
        {
 | 
						|
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
 | 
						|
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
 | 
						|
            "C": np.random.randn(8) + 1.0,
 | 
						|
            "D": np.arange(8),
 | 
						|
        }
 | 
						|
    )
 | 
						|
 | 
						|
    g = df.groupby(["A", "B"])
 | 
						|
 | 
						|
    msg = r"nested renamer is not supported"
 | 
						|
    with pytest.raises(SpecificationError, match=msg):
 | 
						|
        g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}})
 | 
						|
 | 
						|
    with pytest.raises(SpecificationError, match=msg):
 | 
						|
        g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}})
 | 
						|
 | 
						|
    # same name as the original column
 | 
						|
    # GH9052
 | 
						|
    with pytest.raises(SpecificationError, match=msg):
 | 
						|
        g["D"].agg({"result1": np.sum, "result2": np.mean})
 | 
						|
 | 
						|
    with pytest.raises(SpecificationError, match=msg):
 | 
						|
        g["D"].agg({"D": np.sum, "result2": np.mean})
 | 
						|
 | 
						|
 | 
						|
def test_agg_item_by_item_raise_typeerror():
 | 
						|
    df = DataFrame(np.random.randint(10, size=(20, 10)))
 | 
						|
 | 
						|
    def raiseException(df):
 | 
						|
        pprint_thing("----------------------------------------")
 | 
						|
        pprint_thing(df.to_string())
 | 
						|
        raise TypeError("test")
 | 
						|
 | 
						|
    with pytest.raises(TypeError, match="test"):
 | 
						|
        with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
 | 
						|
            df.groupby(0).agg(raiseException)
 | 
						|
 | 
						|
 | 
						|
def test_series_agg_multikey():
 | 
						|
    ts = tm.makeTimeSeries()
 | 
						|
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
 | 
						|
 | 
						|
    result = grouped.agg(np.sum)
 | 
						|
    expected = grouped.sum()
 | 
						|
    tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_series_agg_multi_pure_python():
 | 
						|
    data = DataFrame(
 | 
						|
        {
 | 
						|
            "A": [
 | 
						|
                "foo",
 | 
						|
                "foo",
 | 
						|
                "foo",
 | 
						|
                "foo",
 | 
						|
                "bar",
 | 
						|
                "bar",
 | 
						|
                "bar",
 | 
						|
                "bar",
 | 
						|
                "foo",
 | 
						|
                "foo",
 | 
						|
                "foo",
 | 
						|
            ],
 | 
						|
            "B": [
 | 
						|
                "one",
 | 
						|
                "one",
 | 
						|
                "one",
 | 
						|
                "two",
 | 
						|
                "one",
 | 
						|
                "one",
 | 
						|
                "one",
 | 
						|
                "two",
 | 
						|
                "two",
 | 
						|
                "two",
 | 
						|
                "one",
 | 
						|
            ],
 | 
						|
            "C": [
 | 
						|
                "dull",
 | 
						|
                "dull",
 | 
						|
                "shiny",
 | 
						|
                "dull",
 | 
						|
                "dull",
 | 
						|
                "shiny",
 | 
						|
                "shiny",
 | 
						|
                "dull",
 | 
						|
                "shiny",
 | 
						|
                "shiny",
 | 
						|
                "shiny",
 | 
						|
            ],
 | 
						|
            "D": np.random.randn(11),
 | 
						|
            "E": np.random.randn(11),
 | 
						|
            "F": np.random.randn(11),
 | 
						|
        }
 | 
						|
    )
 | 
						|
 | 
						|
    def bad(x):
 | 
						|
        assert len(x.values.base) > 0
 | 
						|
        return "foo"
 | 
						|
 | 
						|
    result = data.groupby(["A", "B"]).agg(bad)
 | 
						|
    expected = data.groupby(["A", "B"]).agg(lambda x: "foo")
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_agg_consistency():
 | 
						|
    # agg with ([]) and () not consistent
 | 
						|
    # GH 6715
 | 
						|
    def P1(a):
 | 
						|
        return np.percentile(a.dropna(), q=1)
 | 
						|
 | 
						|
    df = DataFrame(
 | 
						|
        {
 | 
						|
            "col1": [1, 2, 3, 4],
 | 
						|
            "col2": [10, 25, 26, 31],
 | 
						|
            "date": [
 | 
						|
                dt.date(2013, 2, 10),
 | 
						|
                dt.date(2013, 2, 10),
 | 
						|
                dt.date(2013, 2, 11),
 | 
						|
                dt.date(2013, 2, 11),
 | 
						|
            ],
 | 
						|
        }
 | 
						|
    )
 | 
						|
 | 
						|
    g = df.groupby("date")
 | 
						|
 | 
						|
    expected = g.agg([P1])
 | 
						|
    expected.columns = expected.columns.levels[0]
 | 
						|
 | 
						|
    result = g.agg(P1)
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_agg_callables():
 | 
						|
    # GH 7929
 | 
						|
    df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)
 | 
						|
 | 
						|
    class fn_class:
 | 
						|
        def __call__(self, x):
 | 
						|
            return sum(x)
 | 
						|
 | 
						|
    equiv_callables = [
 | 
						|
        sum,
 | 
						|
        np.sum,
 | 
						|
        lambda x: sum(x),
 | 
						|
        lambda x: x.sum(),
 | 
						|
        partial(sum),
 | 
						|
        fn_class(),
 | 
						|
    ]
 | 
						|
 | 
						|
    expected = df.groupby("foo").agg(sum)
 | 
						|
    for ecall in equiv_callables:
 | 
						|
        result = df.groupby("foo").agg(ecall)
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_agg_over_numpy_arrays():
 | 
						|
    # GH 3788
 | 
						|
    df = DataFrame(
 | 
						|
        [
 | 
						|
            [1, np.array([10, 20, 30])],
 | 
						|
            [1, np.array([40, 50, 60])],
 | 
						|
            [2, np.array([20, 30, 40])],
 | 
						|
        ],
 | 
						|
        columns=["category", "arraydata"],
 | 
						|
    )
 | 
						|
    gb = df.groupby("category")
 | 
						|
 | 
						|
    expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
 | 
						|
    expected_index = Index([1, 2], name="category")
 | 
						|
    expected_column = ["arraydata"]
 | 
						|
    expected = DataFrame(expected_data, index=expected_index, columns=expected_column)
 | 
						|
 | 
						|
    alt = gb.sum(numeric_only=False)
 | 
						|
    tm.assert_frame_equal(alt, expected)
 | 
						|
 | 
						|
    result = gb.agg("sum", numeric_only=False)
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    # FIXME: the original version of this test called `gb.agg(sum)`
 | 
						|
    #  and that raises TypeError if `numeric_only=False` is passed
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize("as_period", [True, False])
 | 
						|
def test_agg_tzaware_non_datetime_result(as_period):
 | 
						|
    # discussed in GH#29589, fixed in GH#29641, operating on tzaware values
 | 
						|
    #  with function that is not dtype-preserving
 | 
						|
    dti = date_range("2012-01-01", periods=4, tz="UTC")
 | 
						|
    if as_period:
 | 
						|
        dti = dti.tz_localize(None).to_period("D")
 | 
						|
 | 
						|
    df = DataFrame({"a": [0, 0, 1, 1], "b": dti})
 | 
						|
    gb = df.groupby("a")
 | 
						|
 | 
						|
    # Case that _does_ preserve the dtype
 | 
						|
    result = gb["b"].agg(lambda x: x.iloc[0])
 | 
						|
    expected = Series(dti[::2], name="b")
 | 
						|
    expected.index.name = "a"
 | 
						|
    tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
    # Cases that do _not_ preserve the dtype
 | 
						|
    result = gb["b"].agg(lambda x: x.iloc[0].year)
 | 
						|
    expected = Series([2012, 2012], name="b")
 | 
						|
    expected.index.name = "a"
 | 
						|
    tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
    result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
 | 
						|
    expected = Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b")
 | 
						|
    expected.index.name = "a"
 | 
						|
    if as_period:
 | 
						|
        expected = Series([pd.offsets.Day(1), pd.offsets.Day(1)], name="b")
 | 
						|
        expected.index.name = "a"
 | 
						|
    tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_agg_timezone_round_trip():
 | 
						|
    # GH 15426
 | 
						|
    ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
 | 
						|
    df = DataFrame({"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]})
 | 
						|
 | 
						|
    result1 = df.groupby("a")["b"].agg(np.min).iloc[0]
 | 
						|
    result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0]
 | 
						|
    result3 = df.groupby("a")["b"].min().iloc[0]
 | 
						|
 | 
						|
    assert result1 == ts
 | 
						|
    assert result2 == ts
 | 
						|
    assert result3 == ts
 | 
						|
 | 
						|
    dates = [
 | 
						|
        pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5)
 | 
						|
    ]
 | 
						|
    df = DataFrame({"A": ["a", "b"] * 2, "B": dates})
 | 
						|
    grouped = df.groupby("A")
 | 
						|
 | 
						|
    ts = df["B"].iloc[0]
 | 
						|
    assert ts == grouped.nth(0)["B"].iloc[0]
 | 
						|
    assert ts == grouped.head(1)["B"].iloc[0]
 | 
						|
    assert ts == grouped.first()["B"].iloc[0]
 | 
						|
 | 
						|
    # GH#27110 applying iloc should return a DataFrame
 | 
						|
    assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]
 | 
						|
 | 
						|
    ts = df["B"].iloc[2]
 | 
						|
    assert ts == grouped.last()["B"].iloc[0]
 | 
						|
 | 
						|
    # GH#27110 applying iloc should return a DataFrame
 | 
						|
    assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]
 | 
						|
 | 
						|
 | 
						|
def test_sum_uint64_overflow():
 | 
						|
    # see gh-14758
 | 
						|
    # Convert to uint64 and don't overflow
 | 
						|
    df = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
 | 
						|
    df = df + 9223372036854775807
 | 
						|
 | 
						|
    index = Index(
 | 
						|
        [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64
 | 
						|
    )
 | 
						|
    expected = DataFrame(
 | 
						|
        {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]},
 | 
						|
        index=index,
 | 
						|
    )
 | 
						|
 | 
						|
    expected.index.name = 0
 | 
						|
    result = df.groupby(0).sum(numeric_only=False)
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    # out column is non-numeric, so with numeric_only=True it is dropped
 | 
						|
    result2 = df.groupby(0).sum(numeric_only=True)
 | 
						|
    expected2 = expected[[]]
 | 
						|
    tm.assert_frame_equal(result2, expected2)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
 | 
						|
    "structure, expected",
 | 
						|
    [
 | 
						|
        (tuple, DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
 | 
						|
        (list, DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
 | 
						|
        (
 | 
						|
            lambda x: tuple(x),
 | 
						|
            DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}),
 | 
						|
        ),
 | 
						|
        (
 | 
						|
            lambda x: list(x),
 | 
						|
            DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}),
 | 
						|
        ),
 | 
						|
    ],
 | 
						|
)
 | 
						|
def test_agg_structs_dataframe(structure, expected):
 | 
						|
    df = DataFrame(
 | 
						|
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
 | 
						|
    )
 | 
						|
 | 
						|
    result = df.groupby(["A", "B"]).aggregate(structure)
 | 
						|
    expected.index.names = ["A", "B"]
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
 | 
						|
    "structure, expected",
 | 
						|
    [
 | 
						|
        (tuple, Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
 | 
						|
        (list, Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
 | 
						|
        (lambda x: tuple(x), Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
 | 
						|
        (lambda x: list(x), Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
 | 
						|
    ],
 | 
						|
)
 | 
						|
def test_agg_structs_series(structure, expected):
 | 
						|
    # Issue #18079
 | 
						|
    df = DataFrame(
 | 
						|
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
 | 
						|
    )
 | 
						|
 | 
						|
    result = df.groupby("A")["C"].aggregate(structure)
 | 
						|
    expected.index.name = "A"
 | 
						|
    tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_agg_category_nansum(observed):
 | 
						|
    categories = ["a", "b", "c"]
 | 
						|
    df = DataFrame(
 | 
						|
        {"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]}
 | 
						|
    )
 | 
						|
    result = df.groupby("A", observed=observed).B.agg(np.nansum)
 | 
						|
    expected = Series(
 | 
						|
        [3, 3, 0],
 | 
						|
        index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"),
 | 
						|
        name="B",
 | 
						|
    )
 | 
						|
    if observed:
 | 
						|
        expected = expected[expected != 0]
 | 
						|
    tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_agg_list_like_func():
 | 
						|
    # GH 18473
 | 
						|
    df = DataFrame({"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]})
 | 
						|
    grouped = df.groupby("A", as_index=False, sort=False)
 | 
						|
    result = grouped.agg({"B": lambda x: list(x)})
 | 
						|
    expected = DataFrame(
 | 
						|
        {"A": [str(x) for x in range(3)], "B": [[str(x)] for x in range(3)]}
 | 
						|
    )
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_agg_lambda_with_timezone():
 | 
						|
    # GH 23683
 | 
						|
    df = DataFrame(
 | 
						|
        {
 | 
						|
            "tag": [1, 1],
 | 
						|
            "date": [
 | 
						|
                pd.Timestamp("2018-01-01", tz="UTC"),
 | 
						|
                pd.Timestamp("2018-01-02", tz="UTC"),
 | 
						|
            ],
 | 
						|
        }
 | 
						|
    )
 | 
						|
    result = df.groupby("tag").agg({"date": lambda e: e.head(1)})
 | 
						|
    expected = DataFrame(
 | 
						|
        [pd.Timestamp("2018-01-01", tz="UTC")],
 | 
						|
        index=Index([1], name="tag"),
 | 
						|
        columns=["date"],
 | 
						|
    )
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
 | 
						|
    "err_cls",
 | 
						|
    [
 | 
						|
        NotImplementedError,
 | 
						|
        RuntimeError,
 | 
						|
        KeyError,
 | 
						|
        IndexError,
 | 
						|
        OSError,
 | 
						|
        ValueError,
 | 
						|
        ArithmeticError,
 | 
						|
        AttributeError,
 | 
						|
    ],
 | 
						|
)
 | 
						|
def test_groupby_agg_err_catching(err_cls):
 | 
						|
    # make sure we suppress anything other than TypeError or AssertionError
 | 
						|
    #  in _python_agg_general
 | 
						|
 | 
						|
    # Use a non-standard EA to make sure we don't go down ndarray paths
 | 
						|
    from pandas.tests.extension.decimal.array import (
 | 
						|
        DecimalArray,
 | 
						|
        make_data,
 | 
						|
        to_decimal,
 | 
						|
    )
 | 
						|
 | 
						|
    data = make_data()[:5]
 | 
						|
    df = DataFrame(
 | 
						|
        {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
 | 
						|
    )
 | 
						|
 | 
						|
    expected = Series(to_decimal([data[0], data[3]]))
 | 
						|
 | 
						|
    def weird_func(x):
 | 
						|
        # weird function that raise something other than TypeError or IndexError
 | 
						|
        #  in _python_agg_general
 | 
						|
        if len(x) == 0:
 | 
						|
            raise err_cls
 | 
						|
        return x.iloc[0]
 | 
						|
 | 
						|
    result = df["decimals"].groupby(df["id1"]).agg(weird_func)
 | 
						|
    tm.assert_series_equal(result, expected, check_names=False)
 |