358 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			358 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from datetime import datetime
 | 
						|
from operator import methodcaller
 | 
						|
 | 
						|
import numpy as np
 | 
						|
import pytest
 | 
						|
 | 
						|
import pandas as pd
 | 
						|
from pandas import (
 | 
						|
    DataFrame,
 | 
						|
    Series,
 | 
						|
    Timestamp,
 | 
						|
)
 | 
						|
import pandas._testing as tm
 | 
						|
from pandas.core.groupby.grouper import Grouper
 | 
						|
from pandas.core.indexes.datetimes import date_range
 | 
						|
 | 
						|
# Module-level sample data shared by the tests below: 1000 random normal
# values on consecutive dates starting at 2000-01-01.
test_series = Series(
    data=np.random.randn(1000),
    index=date_range(start="1/1/2000", periods=1000),
)
 | 
						|
 | 
						|
 | 
						|
def test_apply():
    """``apply`` through an annual time Grouper matches ``apply`` grouped by year."""

    def last_three_sorted(values):
        # Tail of the group after an ascending sort.
        return values.sort_values()[-3:]

    by_time = test_series.groupby(
        Grouper(freq="A", label="right", closed="right")
    ).apply(last_three_sorted)
    by_year = test_series.groupby(lambda stamp: stamp.year).apply(last_three_sorted)

    # Compare on the inner (original timestamp) level only: the outer group-key
    # level is dropped from both results before the comparison.
    by_time.index = by_time.index.droplevel(0)
    by_year.index = by_year.index.droplevel(0)
    tm.assert_series_equal(by_time, by_year)
 | 
						|
 | 
						|
 | 
						|
def test_count():
    """``count`` via an annual Grouper and via ``resample`` matches a per-year groupby.

    The NaNs are injected into a private copy of the module-level
    ``test_series``; assigning into the shared object would leak the missing
    values into every test that runs afterwards and make results depend on
    test execution order.
    """
    series = test_series.copy()
    # Blank out every third observation so that count differs from group size.
    series[::3] = np.nan

    expected = series.groupby(lambda x: x.year).count()

    grouper = Grouper(freq="A", label="right", closed="right")
    result = series.groupby(grouper).count()
    # Only the counts are compared; the labels are taken from the result.
    expected.index = result.index
    tm.assert_series_equal(result, expected)

    result = series.resample("A").count()
    expected.index = result.index
    tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_numpy_reduction():
    """Annual ``resample(...).prod()`` agrees with a per-year ``agg(np.prod)``."""
    yearly_product = test_series.resample("A", closed="right").prod()

    by_year = test_series.groupby(lambda stamp: stamp.year)
    expected = by_year.agg(np.prod)
    # Only the values are compared; reuse the resampled labels.
    expected.index = yearly_product.index

    tm.assert_series_equal(yearly_product, expected)
 | 
						|
 | 
						|
 | 
						|
def test_apply_iteration():
 | 
						|
    # #2300
 | 
						|
    N = 1000
 | 
						|
    ind = date_range(start="2000-01-01", freq="D", periods=N)
 | 
						|
    df = DataFrame({"open": 1, "close": 2}, index=ind)
 | 
						|
    tg = Grouper(freq="M")
 | 
						|
 | 
						|
    _, grouper, _ = tg._get_grouper(df)
 | 
						|
 | 
						|
    # Errors
 | 
						|
    grouped = df.groupby(grouper, group_keys=False)
 | 
						|
 | 
						|
    def f(df):
 | 
						|
        return df["close"] / df["open"]
 | 
						|
 | 
						|
    # it works!
 | 
						|
    result = grouped.apply(f)
 | 
						|
    tm.assert_index_equal(result.index, df.index)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
 | 
						|
    "name, func",
 | 
						|
    [
 | 
						|
        ("Int64Index", tm.makeIntIndex),
 | 
						|
        ("Index", tm.makeUnicodeIndex),
 | 
						|
        ("Float64Index", tm.makeFloatIndex),
 | 
						|
        ("MultiIndex", lambda m: tm.makeCustomIndex(m, 2)),
 | 
						|
    ],
 | 
						|
)
 | 
						|
def test_fails_on_no_datetime_index(name, func):
 | 
						|
    n = 2
 | 
						|
    index = func(n)
 | 
						|
    df = DataFrame({"a": np.random.randn(n)}, index=index)
 | 
						|
 | 
						|
    msg = (
 | 
						|
        "Only valid with DatetimeIndex, TimedeltaIndex "
 | 
						|
        f"or PeriodIndex, but got an instance of '{name}'"
 | 
						|
    )
 | 
						|
    with pytest.raises(TypeError, match=msg):
 | 
						|
        df.groupby(Grouper(freq="D"))
 | 
						|
 | 
						|
 | 
						|
def test_aaa_group_order():
    """Each daily group returns its rows in their original relative order."""
    # GH 12840
    # check TimeGrouper perform stable sorts
    days = [
        datetime(2013, 1, 1),
        datetime(2013, 1, 2),
        datetime(2013, 1, 3),
        datetime(2013, 1, 4),
        datetime(2013, 1, 5),
    ]
    frame = DataFrame(np.random.randn(20, 4), columns=["A", "B", "C", "D"])
    # The five days repeat four times, so day number k owns rows k, k+5, k+10, k+15.
    frame["key"] = days * 4
    by_day = frame.groupby(Grouper(key="key", freq="D"))

    for offset, day in enumerate(days):
        tm.assert_frame_equal(by_day.get_group(day), frame[offset::5])
 | 
						|
 | 
						|
 | 
						|
def test_aggregate_normal(resample_method):
 | 
						|
    """Check TimeGrouper's aggregation is identical as normal groupby."""
 | 
						|
 | 
						|
    data = np.random.randn(20, 4)
 | 
						|
    normal_df = DataFrame(data, columns=["A", "B", "C", "D"])
 | 
						|
    normal_df["key"] = [1, 2, 3, 4, 5] * 4
 | 
						|
 | 
						|
    dt_df = DataFrame(data, columns=["A", "B", "C", "D"])
 | 
						|
    dt_df["key"] = [
 | 
						|
        datetime(2013, 1, 1),
 | 
						|
        datetime(2013, 1, 2),
 | 
						|
        datetime(2013, 1, 3),
 | 
						|
        datetime(2013, 1, 4),
 | 
						|
        datetime(2013, 1, 5),
 | 
						|
    ] * 4
 | 
						|
 | 
						|
    normal_grouped = normal_df.groupby("key")
 | 
						|
    dt_grouped = dt_df.groupby(Grouper(key="key", freq="D"))
 | 
						|
 | 
						|
    expected = getattr(normal_grouped, resample_method)()
 | 
						|
    dt_result = getattr(dt_grouped, resample_method)()
 | 
						|
    expected.index = date_range(start="2013-01-01", freq="D", periods=5, name="key")
 | 
						|
    tm.assert_equal(expected, dt_result)
 | 
						|
 | 
						|
    # if TimeGrouper is used included, 'nth' doesn't work yet
 | 
						|
 | 
						|
    """
 | 
						|
    for func in ['nth']:
 | 
						|
        expected = getattr(normal_grouped, func)(3)
 | 
						|
        expected.index = date_range(start='2013-01-01',
 | 
						|
                                    freq='D', periods=5, name='key')
 | 
						|
        dt_result = getattr(dt_grouped, func)(3)
 | 
						|
        tm.assert_frame_equal(expected, dt_result)
 | 
						|
    """
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
    "method, method_args, unit",
    [
        ("sum", {}, 0),
        ("sum", {"min_count": 0}, 0),
        ("sum", {"min_count": 1}, np.nan),
        ("prod", {}, 1),
        ("prod", {"min_count": 0}, 1),
        ("prod", {"min_count": 1}, np.nan),
    ],
)
def test_resample_entirely_nat_window(method, method_args, unit):
    """A two-day bin holding only NaN reduces to ``unit`` under sum/prod.

    Parameters
    ----------
    method : str
        Resampler reduction to call.
    method_args : dict
        Keyword arguments forwarded to that reduction.
    unit : scalar
        Expected value for the second bin, whose inputs are all NaN.
    """
    values = [0] * 2 + [np.nan] * 2
    series = Series(values, index=date_range("2017", periods=4))

    resampler = series.resample("2d")
    result = getattr(resampler, method)(**method_args)

    expected_index = pd.DatetimeIndex(["2017-01-01", "2017-01-03"], freq="2D")
    expected = Series([0.0, unit], index=expected_index)
    tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
    "func, fill_value",
    [("min", np.nan), ("max", np.nan), ("sum", 0), ("prod", 1), ("count", 0)],
)
def test_aggregate_with_nat(func, fill_value):
    """TimeGrouper aggregation with NaT keys matches a plain groupby with NaN keys.

    Parameters
    ----------
    func : str
        Groupby reduction to call on both groupings.
    fill_value : scalar
        Value expected in every column of the row padded in at position 3.
    """
    # check TimeGrouper's aggregation is identical as normal groupby
    # if NaT is included, 'var', 'std', 'mean', 'first','last'
    # and 'nth' doesn't work yet
    columns = ["A", "B", "C", "D"]
    values = np.random.randn(20, 4).astype("int64")

    normal_df = DataFrame(values, columns=columns)
    normal_df["key"] = [1, 2, np.nan, 4, 5] * 4

    dt_df = DataFrame(values, columns=columns)
    dt_df["key"] = [
        datetime(2013, 1, 1),
        datetime(2013, 1, 2),
        pd.NaT,
        datetime(2013, 1, 4),
        datetime(2013, 1, 5),
    ] * 4

    aggregate = methodcaller(func)
    normal_result = aggregate(normal_df.groupby("key"))
    dt_result = aggregate(dt_df.groupby(Grouper(key="key", freq="D")))

    # Insert a filler row labelled 3 (the slot whose key is NaN / NaT in the
    # inputs) so the reference has five rows to line up with the daily index.
    pad = DataFrame([[fill_value] * 4], index=[3], columns=columns)
    expected = pd.concat([normal_result, pad]).sort_index()
    dti = date_range(start="2013-01-01", freq="D", periods=5, name="key")
    expected.index = dti._with_freq(None)  # TODO: is this desired?
    tm.assert_frame_equal(expected, dt_result)
    assert dt_result.index.name == "key"
 | 
						|
 | 
						|
 | 
						|
def test_aggregate_with_nat_size():
    """``size`` through a TimeGrouper with NaT keys matches a plain groupby with NaN keys."""
    # GH 9925
    columns = ["A", "B", "C", "D"]
    values = np.random.randn(20, 4).astype("int64")

    normal_df = DataFrame(values, columns=columns)
    normal_df["key"] = [1, 2, np.nan, 4, 5] * 4

    dt_df = DataFrame(values, columns=columns)
    dt_df["key"] = [
        datetime(2013, 1, 1),
        datetime(2013, 1, 2),
        pd.NaT,
        datetime(2013, 1, 4),
        datetime(2013, 1, 5),
    ] * 4

    normal_result = normal_df.groupby("key").size()
    dt_result = dt_df.groupby(Grouper(key="key", freq="D")).size()

    # Insert a zero-sized entry labelled 3 (the slot whose key is NaN / NaT in
    # the inputs) so the reference has five entries for the daily index.
    pad = Series([0], index=[3])
    expected = pd.concat([normal_result, pad]).sort_index()
    dti = date_range(start="2013-01-01", freq="D", periods=5, name="key")
    expected.index = dti._with_freq(None)
    tm.assert_series_equal(expected, dt_result)
    assert dt_result.index.name == "key"
 | 
						|
 | 
						|
 | 
						|
def test_repr():
 | 
						|
    # GH18203
 | 
						|
    result = repr(Grouper(key="A", freq="H"))
 | 
						|
    expected = (
 | 
						|
        "TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, "
 | 
						|
        "closed='left', label='left', how='mean', "
 | 
						|
        "convention='e', origin='start_day')"
 | 
						|
    )
 | 
						|
    assert result == expected
 | 
						|
 | 
						|
    result = repr(Grouper(key="A", freq="H", origin="2000-01-01"))
 | 
						|
    expected = (
 | 
						|
        "TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, "
 | 
						|
        "closed='left', label='left', how='mean', "
 | 
						|
        "convention='e', origin=Timestamp('2000-01-01 00:00:00'))"
 | 
						|
    )
 | 
						|
    assert result == expected
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
    "method, method_args, expected_values",
    [
        ("sum", {}, [1, 0, 1]),
        ("sum", {"min_count": 0}, [1, 0, 1]),
        ("sum", {"min_count": 1}, [1, np.nan, 1]),
        ("sum", {"min_count": 2}, [np.nan, np.nan, np.nan]),
        ("prod", {}, [1, 1, 1]),
        ("prod", {"min_count": 0}, [1, 1, 1]),
        ("prod", {"min_count": 1}, [1, np.nan, 1]),
        ("prod", {"min_count": 2}, [np.nan, np.nan, np.nan]),
    ],
)
def test_upsample_sum(method, method_args, expected_values):
    """sum/prod when upsampling two hourly points to 30-minute bins.

    Parameters
    ----------
    method : str
        Resampler reduction to call.
    method_args : dict
        Keyword arguments forwarded to that reduction.
    expected_values : list
        Expected values for the three resulting 30-minute bins.
    """
    hourly = Series(1, index=date_range("2017", periods=2, freq="H"))
    half_hourly = hourly.resample("30T")
    result = getattr(half_hourly, method)(**method_args)

    expected_index = pd.DatetimeIndex(
        ["2017-01-01T00:00:00", "2017-01-01T00:30:00", "2017-01-01T01:00:00"],
        freq="30T",
    )
    expected = Series(expected_values, index=expected_index)
    tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_groupby_resample_interpolate():
 | 
						|
    # GH 35325
 | 
						|
    d = {"price": [10, 11, 9], "volume": [50, 60, 50]}
 | 
						|
 | 
						|
    df = DataFrame(d)
 | 
						|
 | 
						|
    df["week_starting"] = date_range("01/01/2018", periods=3, freq="W")
 | 
						|
 | 
						|
    result = (
 | 
						|
        df.set_index("week_starting")
 | 
						|
        .groupby("volume")
 | 
						|
        .resample("1D")
 | 
						|
        .interpolate(method="linear")
 | 
						|
    )
 | 
						|
 | 
						|
    msg = "containing strings is deprecated"
 | 
						|
    with tm.assert_produces_warning(FutureWarning, match=msg):
 | 
						|
        expected_ind = pd.MultiIndex.from_tuples(
 | 
						|
            [
 | 
						|
                (50, "2018-01-07"),
 | 
						|
                (50, Timestamp("2018-01-08")),
 | 
						|
                (50, Timestamp("2018-01-09")),
 | 
						|
                (50, Timestamp("2018-01-10")),
 | 
						|
                (50, Timestamp("2018-01-11")),
 | 
						|
                (50, Timestamp("2018-01-12")),
 | 
						|
                (50, Timestamp("2018-01-13")),
 | 
						|
                (50, Timestamp("2018-01-14")),
 | 
						|
                (50, Timestamp("2018-01-15")),
 | 
						|
                (50, Timestamp("2018-01-16")),
 | 
						|
                (50, Timestamp("2018-01-17")),
 | 
						|
                (50, Timestamp("2018-01-18")),
 | 
						|
                (50, Timestamp("2018-01-19")),
 | 
						|
                (50, Timestamp("2018-01-20")),
 | 
						|
                (50, Timestamp("2018-01-21")),
 | 
						|
                (60, Timestamp("2018-01-14")),
 | 
						|
            ],
 | 
						|
            names=["volume", "week_starting"],
 | 
						|
        )
 | 
						|
 | 
						|
    expected = DataFrame(
 | 
						|
        data={
 | 
						|
            "price": [
 | 
						|
                10.0,
 | 
						|
                9.928571428571429,
 | 
						|
                9.857142857142858,
 | 
						|
                9.785714285714286,
 | 
						|
                9.714285714285714,
 | 
						|
                9.642857142857142,
 | 
						|
                9.571428571428571,
 | 
						|
                9.5,
 | 
						|
                9.428571428571429,
 | 
						|
                9.357142857142858,
 | 
						|
                9.285714285714286,
 | 
						|
                9.214285714285714,
 | 
						|
                9.142857142857142,
 | 
						|
                9.071428571428571,
 | 
						|
                9.0,
 | 
						|
                11.0,
 | 
						|
            ],
 | 
						|
            "volume": [50.0] * 15 + [60],
 | 
						|
        },
 | 
						|
        index=expected_ind,
 | 
						|
    )
 | 
						|
    tm.assert_frame_equal(result, expected)
 |