Files
Feature-Extraction/dist/client/pandas/core/groupby/generic.py

1796 lines
60 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Define the SeriesGroupBy and DataFrameGroupBy
classes that hold the groupby interfaces (and some implementations).
These are user facing as the result of the ``df.groupby(...)`` operations,
which here returns a DataFrameGroupBy object.
"""
from __future__ import annotations
from collections import abc
from functools import partial
from textwrap import dedent
from typing import (
Any,
Callable,
Hashable,
Iterable,
Mapping,
NamedTuple,
Sequence,
TypeVar,
Union,
cast,
)
import warnings
import numpy as np
from pandas._libs import reduction as libreduction
from pandas._typing import (
ArrayLike,
Manager,
Manager2D,
SingleManager,
)
from pandas.util._decorators import (
Appender,
Substitution,
doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.common import (
ensure_int64,
is_bool,
is_categorical_dtype,
is_dict_like,
is_integer_dtype,
is_interval_dtype,
is_scalar,
)
from pandas.core.dtypes.missing import (
isna,
notna,
)
from pandas.core import (
algorithms,
nanops,
)
from pandas.core.apply import (
GroupByApply,
maybe_mangle_lambdas,
reconstruct_func,
validate_func_kwargs,
)
from pandas.core.base import SpecificationError
import pandas.core.common as com
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import base
from pandas.core.groupby.groupby import (
GroupBy,
_agg_template,
_apply_docs,
_transform_template,
warn_dropping_nuisance_columns_deprecated,
)
from pandas.core.groupby.grouper import get_grouper
from pandas.core.indexes.api import (
Index,
MultiIndex,
all_indexes_same,
)
from pandas.core.series import Series
from pandas.core.util.numba_ import maybe_use_numba
from pandas.plotting import boxplot_frame_groupby
# An aggregation is specified either by name (str) or as a callable.
# TODO(typing) the return value on this callable should be any *scalar*.
AggScalar = Union[str, Callable[..., Any]]
# Generic placeholder for the scalar type an aggregation returns.
# TODO: validate types on ScalarResult and move to _typing
# Blocked from using by https://github.com/python/mypy/issues/1484
# See note at _mangle_lambda_list
ScalarResult = TypeVar("ScalarResult")
class NamedAgg(NamedTuple):
    """
    Named tuple pairing a column with the aggregation to apply to it.

    Used as a keyword value in "named aggregation", where the keyword is the
    output column name.
    """

    # label of the column to select
    column: Hashable
    # aggregation to apply: a string alias or a callable
    aggfunc: AggScalar
def generate_property(name: str, klass: type[DataFrame | Series]):
    """
    Build a property that dispatches ``name`` through ``_make_wrapper``.

    Parameters
    ----------
    name : str
        Attribute name to expose; also becomes the getter's ``__name__``.
    klass : {DataFrame, Series}
        Class whose attribute ``name`` supplies the docstring.

    Returns
    -------
    property
    """

    def prop(self):
        return self._make_wrapper(name)

    # borrow the documentation of the attribute we dispatch to
    source_doc = getattr(klass, name).__doc__
    prop.__doc__ = source_doc if source_doc else ""
    prop.__name__ = name
    return property(prop)
def pin_allowlisted_properties(
    klass: type[DataFrame | Series], allowlist: frozenset[str]
):
    """
    Create GroupBy member defs for DataFrame/Series names in an allowlist.

    Parameters
    ----------
    klass : DataFrame or Series class
        Class where the members are defined.
    allowlist : frozenset[str]
        Names of klass attributes to expose on the decorated class.

    Returns
    -------
    class decorator

    Notes
    -----
    A name that the decorated class already has (explicitly defined or
    inherited) is never overridden; it is skipped.
    """

    def pinner(cls):
        for attr in allowlist:
            # only fill in what the class does not already provide
            if not hasattr(cls, attr):
                setattr(cls, attr, generate_property(attr, klass))
        return cls

    return pinner
@pin_allowlisted_properties(Series, base.series_apply_allowlist)
class SeriesGroupBy(GroupBy[Series]):
_apply_allowlist = base.series_apply_allowlist
def _wrap_agged_manager(self, mgr: Manager) -> Series:
    """
    Wrap an aggregated manager in a Series named after ``self.obj``.

    A manager with ``ndim != 1`` is narrowed to its item 0 via ``iget``.
    """
    if mgr.ndim != 1:
        mgr = cast(Manager2D, mgr)
        single = mgr.iget(0)
    else:
        mgr = cast(SingleManager, mgr)
        single = mgr
    # NB: caller is responsible for setting the index of the returned Series
    return self.obj._constructor(single, name=self.obj.name)
def _get_data_to_aggregate(self) -> SingleManager:
    """Return the manager (``_mgr``) of the object with exclusions applied."""
    return self._obj_with_exclusions._mgr
def _iterate_slices(self) -> Iterable[Series]:
    """Yield the selected object as the one and only slice."""
    yield from (self._selected_obj,)
# Examples section passed as ``examples=`` to the ``@doc(_agg_template, ...)``
# decorator on ``aggregate`` (klass="Series"). The text is runtime data.
_agg_examples_doc = dedent(
"""
Examples
--------
>>> s = pd.Series([1, 2, 3, 4])
>>> s
0 1
1 2
2 3
3 4
dtype: int64
>>> s.groupby([1, 1, 2, 2]).min()
1 1
2 3
dtype: int64
>>> s.groupby([1, 1, 2, 2]).agg('min')
1 1
2 3
dtype: int64
>>> s.groupby([1, 1, 2, 2]).agg(['min', 'max'])
min max
1 1 2
2 3 4
The output column names can be controlled by passing
the desired column names and aggregations as keyword arguments.
>>> s.groupby([1, 1, 2, 2]).agg(
... minimum='min',
... maximum='max',
... )
minimum maximum
1 1 2
2 3 4
.. versionchanged:: 1.3.0
The resulting dtype will reflect the return value of the aggregating function.
>>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min())
1 1.0
2 3.0
dtype: float64
"""
)
@Appender(
    _apply_docs["template"].format(
        input="series", examples=_apply_docs["series_examples"]
    )
)
def apply(self, func, *args, **kwargs):
    # Delegates unchanged to the parent implementation; the decorator above
    # supplies the series-flavoured docstring.
    applied = super().apply(func, *args, **kwargs)
    return applied
@doc(_agg_template, examples=_agg_examples_doc, klass="Series")
def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
    # numba engine: aggregate a one-column frame, then flatten back to 1D
    if maybe_use_numba(engine):
        with self._group_selection_context():
            data = self._selected_obj
        numba_out = self._aggregate_with_numba(
            data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs
        )
        index = self.grouper.result_index
        return self.obj._constructor(numba_out.ravel(), index=index, name=data.name)

    columns = None
    relabeling = func is None
    if relabeling:
        # named aggregation: the keywords carry both labels and functions
        columns, func = validate_func_kwargs(kwargs)
        kwargs = {}

    # a string names a method on this groupby object
    if isinstance(func, str):
        return getattr(self, func)(*args, **kwargs)

    if isinstance(func, abc.Iterable):
        # Catch instances of lists / tuples
        # but not the class list / tuple itself.
        func = maybe_mangle_lambdas(func)
        ret = self._aggregate_multiple_funcs(func)
        if relabeling:
            # error: Incompatible types in assignment (expression has type
            # "Optional[List[str]]", variable has type "Index")
            ret.columns = columns  # type: ignore[assignment]
        return ret

    # single callable: prefer the cython counterpart when no extra
    # arguments have to be forwarded
    cyfunc = com.get_cython_func(func)
    if cyfunc and not args and not kwargs:
        return getattr(self, cyfunc)()

    if self.grouper.nkeys > 1:
        return self._python_agg_general(func, *args, **kwargs)

    try:
        return self._python_agg_general(func, *args, **kwargs)
    except KeyError:
        # TODO: KeyError is raised in _python_agg_general,
        # see test_groupby.test_basic
        named = self._aggregate_named(func, *args, **kwargs)
        # named is a dict whose keys are the elements of result_index
        index = self.grouper.result_index
        return create_series_with_explicit_dtype(
            named, index=index, dtype_if_empty=object
        )


agg = aggregate
def _aggregate_multiple_funcs(self, arg) -> DataFrame:
    """
    Apply several aggregations and gather the results column-wise.

    ``arg`` is a list-like of functions / function names, optionally given
    as ``(name, func)`` pairs. A dict is rejected with SpecificationError.
    """
    if isinstance(arg, dict):
        # show the deprecation, but only if we
        # have not shown a higher level one
        # GH 15931
        raise SpecificationError("nested renamer is not supported")

    if any(isinstance(item, (tuple, list)) for item in arg):
        # normalise bare entries to (name, func) pairs
        arg = [
            item if isinstance(item, (tuple, list)) else (item, item)
            for item in arg
        ]
        # indicated column order
        columns = next(zip(*arg))
    else:
        # list of functions / function names
        columns = [com.get_callable_name(f) or f for f in arg]
        arg = zip(columns, arg)

    results: dict[base.OutputKey, DataFrame | Series] = {}
    for position, (label, func) in enumerate(arg):
        out_key = base.OutputKey(label=label, position=position)
        results[out_key] = self.aggregate(func)

    if any(isinstance(res, DataFrame) for res in results.values()):
        # at least one aggregation produced a frame: concatenate side by side
        from pandas import concat

        return concat(
            results.values(), axis=1, keys=[key.label for key in results.keys()]
        )

    # key by position first, then restore the requested labels
    indexed_output = {key.position: val for key, val in results.items()}
    output = self.obj._constructor_expanddim(indexed_output, index=None)
    output.columns = Index(key.label for key in results)
    return self._reindex_output(output)
def _indexed_output_to_ndframe(
    self, output: Mapping[base.OutputKey, ArrayLike]
) -> Series:
    """
    Turn the single-entry dict produced by an aggregation into a Series.

    The Series takes its name from ``self.obj``.
    """
    assert len(output) == 1
    only_values = next(iter(output.values()))
    ser = self.obj._constructor(only_values)
    ser.name = self.obj.name
    return ser
def _wrap_applied_output(
    self,
    data: Series,
    values: list[Any],
    not_indexed_same: bool = False,
) -> DataFrame | Series:
    """
    Assemble the per-group outputs of SeriesGroupBy.apply into one object.

    Parameters
    ----------
    data : Series
        Input data for groupby operation.
    values : List[Any]
        Applied output for each group.
    not_indexed_same : bool, default False
        Whether the applied outputs are not indexed the same as the group axes.

    Returns
    -------
    DataFrame or Series
    """
    if len(values) == 0:
        # GH #6265
        return self.obj._constructor(
            [],
            name=self.obj.name,
            index=self.grouper.result_index,
            dtype=data.dtype,
        )
    assert values is not None

    first = values[0]

    if isinstance(first, dict):
        # GH #823 #24880
        index = self.grouper.result_index
        res_df = self.obj._constructor_expanddim(values, index=index)
        res_df = self._reindex_output(res_df)
        # if self.observed is False,
        # keep all-NaN rows created while re-indexing
        res_ser = res_df.stack(dropna=self.observed)
        res_ser.name = self.obj.name
        return res_ser

    if isinstance(first, (Series, DataFrame)):
        return self._concat_objects(values, not_indexed_same=not_indexed_same)

    # anything else is one value per group
    # GH #6265 #24880
    result = self.obj._constructor(
        data=values, index=self.grouper.result_index, name=self.obj.name
    )
    return self._reindex_output(result)
def _aggregate_named(self, func, *args, **kwargs):
    """Aggregate group by group, returning a ``{group name: output}`` dict."""
    # Note: this is very similar to _aggregate_series_pure_python,
    # but that does not pin group.name
    result = {}
    for position, (name, group) in enumerate(self):
        object.__setattr__(group, "name", name)
        output = libreduction.extract_result(func(group, *args, **kwargs))
        if position == 0:
            # We only do this validation on the first iteration
            libreduction.check_result_array(output, group.dtype)
        result[name] = output
    return result
@Substitution(klass="Series")
@Appender(_transform_template)
def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
    # Public entry point; all the work happens in the shared _transform.
    transformed = self._transform(
        func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
    )
    return transformed
def _cython_transform(
    self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
):
    """
    Run the cython "transform" operation ``how`` on the selected values.

    Raises
    ------
    TypeError
        If the grouper reports the operation as not implemented.
    """
    assert axis == 0  # handled by caller

    selected = self._selected_obj
    try:
        transformed = self.grouper._cython_operation(
            "transform", selected._values, how, axis, **kwargs
        )
    except NotImplementedError as err:
        raise TypeError(
            f"{how} is not supported for {selected.dtype} dtype"
        ) from err

    return selected._constructor(
        transformed, index=self.obj.index, name=selected.name
    )
def _transform_general(self, func: Callable, *args, **kwargs) -> Series:
    """
    Transform with a callable ``func``.

    ``func`` is called once per group; each output is wrapped in the type of
    ``self.obj`` on the group's index before the pieces are concatenated.
    """
    assert callable(func)
    klass = type(self.obj)

    pieces = []
    for name, group in self:
        # this setattr is needed for test_transform_lambda_with_datetimetz
        object.__setattr__(group, "name", name)
        pieces.append(klass(func(group, *args, **kwargs), index=group.index))

    if not pieces:
        # nothing to concat (concat would raise ValueError on an empty list)
        result = self.obj._constructor(dtype=np.float64)
    else:
        from pandas.core.reshape.concat import concat

        result = self._set_result_index_ordered(concat(pieces))

    result.name = self.obj.name
    return result
def _can_use_transform_fast(self, result) -> bool:
    """Always True here: ``result`` is not inspected."""
    return True
def filter(self, func, dropna: bool = True, *args, **kwargs):
    """
    Return a copy of a Series excluding elements from groups that
    do not satisfy the boolean criterion specified by func.

    Parameters
    ----------
    func : function
        To apply to each group. Should return True or False.
    dropna : bool, default True
        Drop groups that do not pass the filter. If False, groups that
        evaluate False are filled with NaNs.

    Returns
    -------
    filtered : Series

    Notes
    -----
    Functions that mutate the passed object can produce unexpected
    behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
    for more details.

    Examples
    --------
    >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
    ...                           'foo', 'bar'],
    ...                    'B' : [1, 2, 3, 4, 5, 6],
    ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
    >>> grouped = df.groupby('A')
    >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
    1    2
    3    4
    5    6
    Name: B, dtype: int64
    """
    if isinstance(func, str):
        # a string names a method to call on each group

        def wrapper(x):
            return getattr(x, func)(*args, **kwargs)

    else:

        def wrapper(x):
            return func(x, *args, **kwargs)

    # Interpret np.nan as False.
    def true_and_notna(x) -> bool:
        b = wrapper(x)
        return b and notna(b)

    try:
        indices = []
        for name, group in self:
            if true_and_notna(group):
                indices.append(self._get_index(name))
    except (ValueError, TypeError) as err:
        raise TypeError("the filter must return a boolean result") from err

    return self._apply_filter(indices, dropna)
def nunique(self, dropna: bool = True) -> Series:
    """
    Return number of unique elements in the group.

    Parameters
    ----------
    dropna : bool, default True
        If True, entries whose factorized code is -1 (missing values) are
        not counted. If False, they count as one value per group.

    Returns
    -------
    Series
        Number of unique values within each group.
    """
    ids, _, _ = self.grouper.group_info
    val = self.obj._values
    codes, _ = algorithms.factorize(val, sort=False)
    # sort by group id first, then by value code within each group
    sorter = np.lexsort((codes, ids))
    codes = codes[sorter]
    ids = ids[sorter]
    # group boundaries are where group ids change
    # unique observations are where sorted values change
    idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
    inc = np.r_[1, codes[1:] != codes[:-1]]
    # 1st item of each group is a new unique observation
    mask = codes == -1
    if dropna:
        # order matters: mark group starts, then zero out every -1 code
        inc[idx] = 1
        inc[mask] = 0
    else:
        # collapse runs of consecutive -1 codes, then re-mark the group starts
        inc[mask & np.r_[False, mask[:-1]]] = 0
        inc[idx] = 1
    # sum the increments between consecutive group boundaries
    out = np.add.reduceat(inc, idx).astype("int64", copy=False)
    if len(ids):
        # NaN/NaT group exists if the head of ids is -1,
        # so remove it from res and exclude its index from idx
        if ids[0] == -1:
            res = out[1:]
            idx = idx[np.flatnonzero(idx)]
        else:
            res = out
    else:
        res = out[1:]
    ri = self.grouper.result_index
    # we might have duplications among the bins
    if len(res) != len(ri):
        # scatter the counts into a zero-filled array sized like result_index
        res, out = np.zeros(len(ri), dtype=out.dtype), res
        res[ids[idx]] = out
    result = self.obj._constructor(res, index=ri, name=self.obj.name)
    return self._reindex_output(result, fill_value=0)
@doc(Series.describe)
def describe(self, **kwargs):
    # Pure pass-through; defined here so Series.describe's docs get attached.
    described = super().describe(**kwargs)
    return described
def value_counts(
    self,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
    bins=None,
    dropna: bool = True,
):
    """
    Count the occurrences of each distinct value within every group.

    Parameters
    ----------
    normalize : bool, default False
        If True, divide each count by the number of counted rows in its group.
    sort : bool, default True
        Order the counts within each group.
    ascending : bool, default False
        Sort direction used when ``sort`` is True.
    bins : optional
        When given, values are binned with ``cut`` before counting.
        Non-iterable bins are handled per group via ``Series.value_counts``.
    dropna : bool, default True
        Exclude entries whose value has no label (code ``-1``).

    Returns
    -------
    Series
        Counts indexed by the grouping levels plus one level for the values.
    """
    from pandas.core.reshape.merge import get_join_indexers
    from pandas.core.reshape.tile import cut

    ids, _, _ = self.grouper.group_info
    val = self.obj._values

    def apply_series_value_counts():
        # Per-group fallback. ``dropna`` is forwarded so that this path
        # honours the caller's choice like the vectorised path below does.
        return self.apply(
            Series.value_counts,
            normalize=normalize,
            sort=sort,
            ascending=ascending,
            bins=bins,
            dropna=dropna,
        )

    if bins is not None:
        if not np.iterable(bins):
            # scalar bins cannot be done at top level
            # in a backward compatible way
            return apply_series_value_counts()
    elif is_categorical_dtype(val.dtype):
        # GH38672
        return apply_series_value_counts()

    # groupby removes null keys from groupings
    mask = ids != -1
    ids, val = ids[mask], val[mask]

    if bins is None:
        lab, lev = algorithms.factorize(val, sort=True)
        llab = lambda lab, inc: lab[inc]
    else:
        # lab is a Categorical with categories an IntervalIndex
        lab = cut(Series(val), bins, include_lowest=True)
        # error: "ndarray" has no attribute "cat"
        lev = lab.cat.categories  # type: ignore[attr-defined]
        # error: No overload variant of "take" of "_ArrayOrScalarCommon" matches
        # argument types "Any", "bool", "Union[Any, float]"
        lab = lev.take(  # type: ignore[call-overload]
            # error: "ndarray" has no attribute "cat"
            lab.cat.codes,  # type: ignore[attr-defined]
            allow_fill=True,
            # error: Item "ndarray" of "Union[ndarray, Index]" has no attribute
            # "_na_value"
            fill_value=lev._na_value,  # type: ignore[union-attr]
        )
        llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]

    if is_interval_dtype(lab.dtype):
        # TODO: should we do this inside II?
        # error: "ndarray" has no attribute "left"
        # error: "ndarray" has no attribute "right"
        sorter = np.lexsort(
            (lab.left, lab.right, ids)  # type: ignore[attr-defined]
        )
    else:
        sorter = np.lexsort((lab, ids))

    ids, lab = ids[sorter], lab[sorter]

    # group boundaries are where group ids change
    idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0]
    idx = np.r_[0, idchanges]
    if not len(ids):
        idx = idchanges

    # new values are where sorted labels change
    lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
    inc = np.r_[True, lchanges]
    if not len(val):
        inc = lchanges
    inc[idx] = True  # group boundaries are also new values
    out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts

    # num. of times each group should be repeated
    rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

    # multi-index components
    codes = self.grouper.reconstructed_codes
    codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
    # error: List item 0 has incompatible type "Union[ndarray[Any, Any], Index]";
    # expected "Index"
    levels = [ping.group_index for ping in self.grouper.groupings] + [
        lev  # type: ignore[list-item]
    ]
    names = self.grouper.names + [self.obj.name]

    if dropna:
        mask = codes[-1] != -1
        if mask.all():
            # nothing to drop: skip the masking steps further down
            dropna = False
        else:
            out, codes = out[mask], [level_codes[mask] for level_codes in codes]

    if normalize:
        out = out.astype("float")
        # d: number of rows in each group
        d = np.diff(np.r_[idx, len(ids)])
        if dropna:
            # dropped (-1) labels must not count towards the denominator
            m = ids[lab == -1]
            np.add.at(d, m, -1)
            acc = rep(d)[mask]
        else:
            acc = rep(d)
        out /= acc

    if sort and bins is None:
        cat = ids[inc][mask] if dropna else ids[inc]
        sorter = np.lexsort((out if ascending else -out, cat))
        out, codes[-1] = out[sorter], codes[-1][sorter]

    if bins is not None:
        # for compat. with libgroupby.value_counts need to ensure every
        # bin is present at every index level, null filled with zeros
        diff = np.zeros(len(out), dtype="bool")
        for level_codes in codes[:-1]:
            diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]

        ncat, nbin = diff.sum(), len(levels[-1])

        left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]

        right = [diff.cumsum() - 1, codes[-1]]

        _, idx = get_join_indexers(left, right, sort=False, how="left")
        out = np.where(idx != -1, out[idx], 0)

        if sort:
            sorter = np.lexsort((out if ascending else -out, left[0]))
            out, left[-1] = out[sorter], left[-1][sorter]

        # build the multi-index w/ full levels
        def build_codes(lev_codes: np.ndarray) -> np.ndarray:
            return np.repeat(lev_codes[diff], nbin)

        codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
        codes.append(left[-1])

    mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False)

    if is_integer_dtype(out.dtype):
        out = ensure_int64(out)
    return self.obj._constructor(out, index=mi, name=self.obj.name)
@doc(Series.nlargest)
def nlargest(self, n: int = 5, keep: str = "first"):
    per_group = partial(Series.nlargest, n=n, keep=keep)
    # Don't change behavior if result index happens to be the same, i.e.
    # already ordered and n >= all group sizes.
    return self._python_apply_general(
        per_group, self._obj_with_exclusions, not_indexed_same=True
    )
@doc(Series.nsmallest)
def nsmallest(self, n: int = 5, keep: str = "first"):
    per_group = partial(Series.nsmallest, n=n, keep=keep)
    # Don't change behavior if result index happens to be the same, i.e.
    # already ordered and n >= all group sizes.
    return self._python_apply_general(
        per_group, self._obj_with_exclusions, not_indexed_same=True
    )
@pin_allowlisted_properties(DataFrame, base.dataframe_apply_allowlist)
class DataFrameGroupBy(GroupBy[DataFrame]):
_apply_allowlist = base.dataframe_apply_allowlist
# Examples section passed as ``examples=`` to the ``@doc(_agg_template, ...)``
# decorator on ``aggregate`` (klass="DataFrame"). The text is runtime data.
_agg_examples_doc = dedent(
"""
Examples
--------
>>> df = pd.DataFrame(
... {
... "A": [1, 1, 2, 2],
... "B": [1, 2, 3, 4],
... "C": [0.362838, 0.227877, 1.267767, -0.562860],
... }
... )
>>> df
A B C
0 1 1 0.362838
1 1 2 0.227877
2 2 3 1.267767
3 2 4 -0.562860
The aggregation is for each column.
>>> df.groupby('A').agg('min')
B C
A
1 1 0.227877
2 3 -0.562860
Multiple aggregations
>>> df.groupby('A').agg(['min', 'max'])
B C
min max min max
A
1 1 2 0.227877 0.362838
2 3 4 -0.562860 1.267767
Select a column for aggregation
>>> df.groupby('A').B.agg(['min', 'max'])
min max
A
1 1 2
2 3 4
Different aggregations per column
>>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
B C
min max sum
A
1 1 2 0.590715
2 3 4 0.704907
To control the output names with different aggregations per column,
pandas supports "named aggregation"
>>> df.groupby("A").agg(
... b_min=pd.NamedAgg(column="B", aggfunc="min"),
... c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
b_min c_sum
A
1 1 0.590715
2 3 0.704907
- The keywords are the *output* column names
- The values are tuples whose first element is the column to select
and the second element is the aggregation to apply to that column.
Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
``['column', 'aggfunc']`` to make it clearer what the arguments are.
As usual, the aggregation can be a callable or a string alias.
See :ref:`groupby.aggregate.named` for more.
.. versionchanged:: 1.3.0
The resulting dtype will reflect the return value of the aggregating function.
>>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min())
B
A
1 1.0
2 3.0
"""
)
@doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame")
def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
    # Overall flow:
    #   1. numba engine, if requested, short-circuits everything else
    #   2. the generic GroupByApply machinery gets the first attempt
    #   3. when that yields None, fall back to grouper-specific strategies
    #   4. finally honour as_index=False by inserting the group keys
    if maybe_use_numba(engine):
        with self._group_selection_context():
            data = self._selected_obj
        result = self._aggregate_with_numba(
            data, func, *args, engine_kwargs=engine_kwargs, **kwargs
        )
        index = self.grouper.result_index
        return self.obj._constructor(result, index=index, columns=data.columns)

    # unpack named-aggregation keywords (if any) and de-duplicate lambda names
    relabeling, func, columns, order = reconstruct_func(func, **kwargs)
    func = maybe_mangle_lambdas(func)

    op = GroupByApply(self, func, args, kwargs)
    result = op.agg()
    if not is_dict_like(func) and result is not None:
        return result
    elif relabeling and result is not None:
        # this should be the only (non-raising) case with relabeling
        # used reordered index of columns
        result = result.iloc[:, order]
        result.columns = columns

    if result is None:
        # grouper specific aggregations
        if self.grouper.nkeys > 1:
            # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
            return self._python_agg_general(func, *args, **kwargs)
        elif args or kwargs:
            # test_pass_args_kwargs gets here (with and without as_index)
            # can't return early
            result = self._aggregate_frame(func, *args, **kwargs)
        elif self.axis == 1:
            # _aggregate_multiple_funcs does not allow self.axis == 1
            # Note: axis == 1 precludes 'not self.as_index', see __init__
            result = self._aggregate_frame(func)
            return result
        else:
            # try to treat as if we are passing a list
            gba = GroupByApply(self, [func], args=(), kwargs={})
            try:
                result = gba.agg()
            except ValueError as err:
                # only the "no results" error triggers the frame fallback;
                # any other ValueError propagates
                if "no results" not in str(err):
                    # raised directly by _aggregate_multiple_funcs
                    raise
                result = self._aggregate_frame(func)
            else:
                # the list-based call added a function-name level to the
                # columns; restore the original column labelling
                sobj = self._selected_obj
                if isinstance(sobj, Series):
                    # GH#35246 test_groupby_as_index_select_column_sum_empty_df
                    result.columns = self._obj_with_exclusions.columns.copy()
                else:
                    # Retain our column names
                    result.columns._set_names(
                        sobj.columns.names, level=list(range(sobj.columns.nlevels))
                    )
                    # select everything except for the last level, which is the one
                    # containing the name of the function(s), see GH#32040
                    result.columns = result.columns.droplevel(-1)

    if not self.as_index:
        # group keys become regular columns and the index is reset to 0..n-1
        self._insert_inaxis_grouper_inplace(result)
        result.index = Index(range(len(result)))

    return result


# short alias for aggregate
agg = aggregate
def _iterate_slices(self) -> Iterable[Series]:
    """
    Yield the 1D slices to operate on, leaving out excluded labels.

    With ``axis == 1`` the selected object is transposed first.
    """
    obj = self._selected_obj
    if self.axis == 1:
        obj = obj.T

    if isinstance(obj, Series) and obj.name not in self.exclusions:
        # Occurs when doing DataFrameGroupBy(...)["X"]
        yield obj
        return

    for label, values in obj.items():
        if label not in self.exclusions:
            yield values
def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
    """
    Call ``func`` on each group's sub-frame and collect the outputs.

    Raises
    ------
    AssertionError
        If the grouper does not have exactly one key.
    """
    if self.grouper.nkeys != 1:
        raise AssertionError("Number of keys must be 1")

    obj = self._obj_with_exclusions

    result: dict[Hashable, NDFrame | np.ndarray] = {}
    if self.axis == 0:
        # test_pass_args_kwargs_duplicate_columns gets here with non-unique columns
        for name, data in self:
            result[name] = func(data, *args, **kwargs)
    else:
        # we get here in a number of test_multilevel tests
        for name in self.indices:
            result[name] = func(self.get_group(name, obj=obj), *args, **kwargs)

    result_index = self.grouper.result_index
    other_ax = obj.axes[1 - self.axis]
    # groups are laid out as columns first; flip them to rows for axis 0
    out = self.obj._constructor(result, index=other_ax, columns=result_index)
    return out.T if self.axis == 0 else out
def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:
    """Aggregate each column through its own column groupby, then recombine."""
    # only for axis==0
    # tests that get here with non-unique cols:
    # test_resample_with_timedelta_yields_no_empty_groups,
    # test_resample_apply_product
    obj = self._obj_with_exclusions
    # key by position (not label) so non-unique column labels survive
    result: dict[int, NDFrame] = {
        position: sgb.aggregate(func, *args, **kwargs)
        for position, (_, sgb) in enumerate(self._iterate_column_groupbys(obj))
    }

    res_df = self.obj._constructor(result)
    res_df.columns = obj.columns
    return res_df
def _wrap_applied_output(
    self, data: DataFrame, values: list, not_indexed_same: bool = False
):
    """
    Assemble the per-group outputs of DataFrameGroupBy.apply into one object.

    Dispatch is on the type of the first non-None entry of ``values``.
    """
    if len(values) == 0:
        empty = self.obj._constructor(
            index=self.grouper.result_index, columns=data.columns
        )
        return empty.astype(data.dtypes, copy=False)

    # GH12824
    first_not_none = next(com.not_none(*values), None)

    if first_not_none is None:
        # GH9684 - All values are None, return an empty frame.
        return self.obj._constructor()

    if isinstance(first_not_none, DataFrame):
        return self._concat_objects(values, not_indexed_same=not_indexed_same)

    key_index = self.grouper.result_index if self.as_index else None

    if isinstance(first_not_none, (np.ndarray, Index)):
        # GH#1738: values is list of arrays of unequal lengths
        # fall through to the outer else clause
        # TODO: sure this is right? we used to do this
        # after raising AttributeError above
        return self.obj._constructor_sliced(
            values, index=key_index, name=self._selection
        )

    if isinstance(first_not_none, Series):
        return self._wrap_applied_output_series(
            values, not_indexed_same, first_not_none, key_index
        )

    # values are not series or array-like but scalars
    # self._selection not passed through to Series as the
    # result should not take the name of original selection
    # of columns
    if self.as_index:
        return self.obj._constructor_sliced(values, index=key_index)

    result = self.obj._constructor(values, columns=[self._selection])
    self._insert_inaxis_grouper_inplace(result)
    return result
def _wrap_applied_output_series(
    self,
    values: list[Series],
    not_indexed_same: bool,
    first_not_none,
    key_index,
) -> DataFrame | Series:
    """
    Combine per-group Series outputs of apply into a DataFrame or Series.

    Parameters
    ----------
    values : list[Series]
        Applied output for each group; None entries are replaced by an
        empty Series built on the axes of ``first_not_none``.
    not_indexed_same : bool
        Forwarded to ``_concat_objects`` on the squeeze path.
    first_not_none : Series
        First entry of ``values`` that is not None.
    key_index : Index or None
        Labels used for the group axis of the stacked result.

    Returns
    -------
    DataFrame or Series
    """
    # this is to silence a DeprecationWarning
    # TODO(2.0): Remove when default dtype of empty Series is object
    kwargs = first_not_none._construct_axes_dict()
    backup = create_series_with_explicit_dtype(dtype_if_empty=object, **kwargs)
    values = [x if (x is not None) else backup for x in values]

    all_indexed_same = all_indexes_same(x.index for x in values)

    # GH3596
    # provide a reduction (Frame -> Series) if groups are
    # unique
    if self.squeeze:
        applied_index = self._selected_obj._get_axis(self.axis)
        singular_series = len(values) == 1 and applied_index.nlevels == 1

        if singular_series:
            # GH2893
            # we have series in the values array, we want to
            # produce a series:
            # if any of the sub-series are not indexed the same
            # OR we don't have a multi-index and we have only a
            # single values
            return self._concat_objects(values, not_indexed_same=not_indexed_same)

        # still a series
        # path added as of GH 5545
        elif all_indexed_same:
            from pandas.core.reshape.concat import concat

            return concat(values)

    if not all_indexed_same:
        # GH 8467
        return self._concat_objects(values, not_indexed_same=True)

    # From here on every Series shares one index, so they can be stacked
    # into a 2D array with one row per group.
    # Combine values
    # vstack+constructor is faster than concat and handles MI-columns
    stacked_values = np.vstack([np.asarray(v) for v in values])

    if self.axis == 0:
        index = key_index
        columns = first_not_none.index.copy()
        if columns.name is None:
            # GH6124 - propagate name of Series when it's consistent
            names = {v.name for v in values}
            if len(names) == 1:
                columns.name = list(names)[0]
    else:
        # axis 1: groups run along the columns, so transpose the stack
        index = first_not_none.index
        columns = key_index
        stacked_values = stacked_values.T

    if stacked_values.dtype == object:
        # We'll have the DataFrame constructor do inference
        stacked_values = stacked_values.tolist()
    result = self.obj._constructor(stacked_values, index=index, columns=columns)

    if not self.as_index:
        self._insert_inaxis_grouper_inplace(result)

    return self._reindex_output(result)
def _cython_transform(
    self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
) -> DataFrame:
    """
    Run the cython "transform" operation ``how`` on the data to aggregate.

    Arrays for which the operation fails are skipped; when that leaves
    fewer items than the input had, the nuisance-column deprecation
    warning is issued.
    """
    assert axis == 0  # handled by caller
    # TODO: no tests with self.ndim == 1 for DataFrameGroupBy

    # With self.axis == 0, we have multi-block tests
    # e.g. test_rank_min_int, test_cython_transform_frame
    # test_transform_numeric_ret
    # With self.axis == 1, _get_data_to_aggregate does a transpose
    # so we always have a single block.
    mgr: Manager2D = self._get_data_to_aggregate()
    if numeric_only:
        mgr = mgr.get_numeric_data(copy=False)

    arr_func = lambda bvalues: self.grouper._cython_operation(
        "transform", bvalues, how, 1, **kwargs
    )

    # We could use `mgr.apply` here and not have to set_axis, but
    # we would have to do shape gymnastics for ArrayManager compat
    res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
    res_mgr.set_axis(1, mgr.axes[1])

    dropped_something = len(res_mgr) < len(mgr)
    if dropped_something:
        warn_dropping_nuisance_columns_deprecated(type(self), how)

    res_df = self.obj._constructor(res_mgr)
    return res_df.T if self.axis == 1 else res_df
def _transform_general(self, func, *args, **kwargs):
    """
    Transform group by group with ``func`` and stitch the pieces together.

    The first group decides, via ``_choose_path``, whether the fast or the
    slow path is used for all remaining groups.
    """
    from pandas.core.reshape.concat import concat

    applied = []
    obj = self._obj_with_exclusions
    gen = self.grouper.get_iterator(obj, axis=self.axis)
    fast_path, slow_path = self._define_paths(func, *args, **kwargs)

    # Determine whether to use slow or fast path by evaluating on the first group.
    # Need to handle the case of an empty generator and process the result so that
    # it does not need to be computed again.
    try:
        name, group = next(gen)
    except StopIteration:
        # NOTE(review): ``path`` stays unbound here, but the loop below has
        # nothing left to iterate either
        pass
    else:
        object.__setattr__(group, "name", name)
        try:
            path, res = self._choose_path(fast_path, slow_path, group)
        except TypeError:
            # fall back to transforming column by column
            return self._transform_item_by_item(obj, fast_path)
        except ValueError as err:
            msg = "transform must return a scalar value for each group"
            raise ValueError(msg) from err
        if group.size > 0:
            res = _wrap_transform_general_frame(self.obj, group, res)
            applied.append(res)

    # Compute and process with the remaining groups
    for name, group in gen:
        if group.size == 0:
            continue
        object.__setattr__(group, "name", name)
        res = path(group)
        res = _wrap_transform_general_frame(self.obj, group, res)
        applied.append(res)

    # concatenate along the grouped axis, then align the other axis with
    # the labels of the input object
    concat_index = obj.columns if self.axis == 0 else obj.index
    other_axis = 1 if self.axis == 0 else 0  # switches between 0 & 1
    concatenated = concat(applied, axis=self.axis, verify_integrity=False)
    concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False)
    return self._set_result_index_ordered(concatenated)
@Substitution(klass="DataFrame")
@Appender(_transform_template)
def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
    # Public entry point; all the work happens in the shared _transform.
    transformed = self._transform(
        func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
    )
    return transformed
def _can_use_transform_fast(self, result) -> bool:
    """
    True only if ``result`` is a DataFrame whose columns equal those of the
    object with exclusions applied.
    """
    if not isinstance(result, DataFrame):
        return False
    return result.columns.equals(self._obj_with_exclusions.columns)
def _define_paths(self, func, *args, **kwargs):
    """
    Build the two candidate callables for transforming one group.

    Returns
    -------
    tuple of callables
        ``(fast_path, slow_path)``: the fast path hands the whole group to
        ``func``; the slow path applies ``func`` through ``group.apply``
        along ``self.axis``. A string ``func`` is looked up as a method.
    """
    if isinstance(func, str):

        def invoke(target):
            return getattr(target, func)(*args, **kwargs)

    else:

        def invoke(target):
            return func(target, *args, **kwargs)

    def fast_path(group):
        return invoke(group)

    def slow_path(group):
        return group.apply(invoke, axis=self.axis)

    return fast_path, slow_path
def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame):
    """
    Decide which path to use by evaluating both on ``group``.

    Parameters
    ----------
    fast_path, slow_path : callable
        Candidate callables, each taking the group.
    group : DataFrame
        The group both candidates are evaluated on.

    Returns
    -------
    tuple
        ``(path, res)`` where ``res`` is always the slow-path result for
        ``group``. ``path`` is ``fast_path`` only when it ran without
        raising and returned a DataFrame with the same columns as
        ``group`` that equals the slow-path result; otherwise it is
        ``slow_path``.
    """
    slow_result = slow_path(group)

    # if we make it here, test if we can use the fast path
    try:
        fast_result = fast_path(group)
    except AssertionError:
        raise  # pragma: no cover
    except Exception:
        # GH#29631 For user-defined function, we can't predict what may be
        #  raised; see test_transform.test_transform_fastpath_raises
        return slow_path, slow_result

    # The fast path must not change columns (and names), otherwise its
    # results cannot be joined with those of the slow path.
    fast_is_equivalent = (
        isinstance(fast_result, DataFrame)
        and fast_result.columns.equals(group.columns)
        and fast_result.equals(slow_result)
    )
    chosen = fast_path if fast_is_equivalent else slow_path
    return chosen, slow_result
def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
    """
    Transform ``obj`` one column at a time.

    Parameters
    ----------
    obj : DataFrame
        Frame whose columns are transformed individually.
    wrapper : callable
        Passed to ``transform`` of each per-column groupby.

    Returns
    -------
    DataFrame
        Indexed like ``obj``, holding only the columns whose transform did
        not raise ``TypeError``.

    Raises
    ------
    TypeError
        If no column could be transformed.
    """
    # iterate through columns, see test_transform_exclude_nuisance
    #  gets here with non-unique columns
    # Results are keyed by column position, not by label.
    transformed = {}
    column_groupbys = self._iterate_column_groupbys(obj)
    for position, (_, column_groupby) in enumerate(column_groupbys):
        try:
            transformed[position] = column_groupby.transform(wrapper)
        except TypeError:
            # e.g. trying to call nanmean with string values
            warn_dropping_nuisance_columns_deprecated(type(self), "transform")

    if not transformed:
        raise TypeError("Transform function invalid for data types")

    kept_positions = list(transformed)
    result = self.obj._constructor(transformed, index=obj.index)
    result.columns = obj.columns.take(kept_positions)
    return result
def filter(self, func, dropna=True, *args, **kwargs):
    """
    Return a copy of a DataFrame excluding filtered elements.

    Elements from groups are filtered if they do not satisfy the
    boolean criterion specified by func.

    Parameters
    ----------
    func : function
        Function to apply to each subframe. Should return True or False.
    dropna : bool, default True
        Drop groups that do not pass the filter.
        If False, groups that evaluate False are filled with NaNs.
    *args, **kwargs
        Additional arguments passed through to ``func``.

    Returns
    -------
    filtered : DataFrame

    Raises
    ------
    TypeError
        If ``func`` returns something that is neither a boolean nor a
        missing scalar.

    Notes
    -----
    Each subframe is endowed the attribute 'name' in case you need to know
    which group you are working on.

    A missing scalar result (e.g. ``NaN`` or ``pd.NA``) is treated like
    ``False``: the group does not pass the filter.

    Functions that mutate the passed object can produce unexpected
    behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
    for more details.

    Examples
    --------
    >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
    ...                           'foo', 'bar'],
    ...                    'B' : [1, 2, 3, 4, 5, 6],
    ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
    >>> grouped = df.groupby('A')
    >>> grouped.filter(lambda x: x['B'].mean() > 3.)
         A  B    C
    1  bar  2  5.0
    3  bar  4  1.0
    5  bar  6  9.0
    """
    indices = []

    obj = self._selected_obj
    gen = self.grouper.get_iterator(obj, axis=self.axis)

    for name, group in gen:
        # Bypass the frame's own attribute handling so ``group.name`` works.
        object.__setattr__(group, "name", name)

        res = func(group, *args, **kwargs)

        try:
            res = res.squeeze()
        except AttributeError:  # allow e.g., scalars and frames to pass
            pass

        # interpret the result of the filter
        if is_bool(res) or (is_scalar(res) and isna(res)):
            # Check for missing values *before* truth-testing: evaluating
            # ``bool(pd.NA)`` raises, so ``res and notna(res)`` would blow
            # up on an NA result instead of rejecting the group.
            if notna(res) and res:
                indices.append(self._get_index(name))
        else:
            # non scalars aren't allowed
            raise TypeError(
                f"filter function returned a {type(res).__name__}, "
                "but expected a scalar bool"
            )

    return self._apply_filter(indices, dropna)
def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy:
    """
    Select by ``key``, delegating to the parent class's ``__getitem__``.

    Raises
    ------
    ValueError
        If ``self.axis`` is 1.

    Warns
    -----
    FutureWarning
        If ``key`` is a tuple holding more than one element.
    """
    if self.axis == 1:
        # GH 37725
        raise ValueError("Cannot subset columns when using axis=1")

    # per GH 23566
    # if len == 1, then it becomes a SeriesGroupBy and this is actually
    # valid syntax, so don't raise warning
    is_multi_key_tuple = isinstance(key, tuple) and len(key) > 1
    if is_multi_key_tuple:
        message = (
            "Indexing with multiple keys (implicitly converted to a tuple "
            "of keys) will be deprecated, use a list instead."
        )
        warnings.warn(message, FutureWarning, stacklevel=find_stack_level())

    return super().__getitem__(key)
def _gotitem(self, key, ndim: int, subset=None):
    """
    Build the groupby object for a selection of ``key``.

    Parameters
    ----------
    key : string / list of selections
    ndim : {1, 2}
        requested ndim of result
    subset : object, default None
        subset to act on; when omitted, ``self.obj`` is used for
        ``ndim == 2`` and ``self.obj[key]`` for ``ndim == 1``

    Returns
    -------
    DataFrameGroupBy or SeriesGroupBy
        ``DataFrameGroupBy`` for ``ndim == 2``, ``SeriesGroupBy`` for
        ``ndim == 1``, both sharing this object's grouper.

    Raises
    ------
    AssertionError
        If ``ndim`` is neither 1 nor 2.
    """
    if ndim == 2:
        frame = self.obj if subset is None else subset
        return DataFrameGroupBy(
            frame,
            self.grouper,
            axis=self.axis,
            level=self.level,
            grouper=self.grouper,
            exclusions=self.exclusions,
            selection=key,
            as_index=self.as_index,
            sort=self.sort,
            group_keys=self.group_keys,
            squeeze=self.squeeze,
            observed=self.observed,
            mutated=self.mutated,
            dropna=self.dropna,
        )

    if ndim == 1:
        column = self.obj[key] if subset is None else subset
        return SeriesGroupBy(
            column,
            level=self.level,
            grouper=self.grouper,
            selection=key,
            sort=self.sort,
            group_keys=self.group_keys,
            squeeze=self.squeeze,
            observed=self.observed,
            dropna=self.dropna,
        )

    raise AssertionError("invalid ndim for _gotitem")
def _get_data_to_aggregate(self) -> Manager2D:
    """
    Return the ``_mgr`` of ``self._obj_with_exclusions``.

    When ``self.axis`` is 1 the object is transposed first and the
    ``_mgr`` of the transposed object is returned instead.
    """
    frame = self._obj_with_exclusions
    oriented = frame.T if self.axis == 1 else frame
    return oriented._mgr
def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None:
    """
    Insert the in-axis grouping keys as leading columns of ``result``.

    ``result`` is modified in place. A key is inserted only when its
    grouping has ``in_axis`` set and its name was not already among the
    columns ``result`` had on entry.
    """
    grouper = self.grouper
    # Snapshot taken before any insertion; later checks use this snapshot.
    existing_columns = result.columns

    # zip in reverse so we can always insert at loc 0
    keys_in_reverse = zip(
        reversed(grouper.names),
        reversed(grouper.get_group_levels()),
        reversed([grouping.in_axis for grouping in grouper.groupings]),
    )
    for label, level_values, is_in_axis in keys_in_reverse:
        # GH #28549
        # When using .apply(-), name will be in columns already
        if not is_in_axis or label in existing_columns:
            continue
        result.insert(0, label, level_values)
def _indexed_output_to_ndframe(
    self, output: Mapping[base.OutputKey, ArrayLike]
) -> DataFrame:
    """
    Wrap the dict result of a GroupBy aggregation into a DataFrame.

    Values are placed by each key's ``position``; the resulting columns
    are then relabelled with each key's ``label`` and given the names of
    the ``1 - self.axis`` axis of ``self._obj_with_exclusions``.
    """
    labels = []
    values_by_position = {}
    for key, values in output.items():
        labels.append(key.label)
        values_by_position[key.position] = values

    columns = Index(labels)
    axis_names = self._obj_with_exclusions._get_axis(1 - self.axis).names
    columns._set_names(axis_names)

    result = self.obj._constructor(values_by_position)
    result.columns = columns
    return result
def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
    """
    Turn an aggregated manager into the resulting DataFrame.

    With ``as_index`` set, axis 1 of ``mgr`` is set to the grouper's
    ``result_index``. Otherwise a ``range``-based Index is used and the
    in-axis grouping keys are inserted as columns. ``mgr`` itself is
    modified through ``set_axis``.
    """
    keys_as_index = self.as_index

    if keys_as_index:
        index = self.grouper.result_index
    else:
        # GH 41998 - empty mgr always gets index of length 0
        n_rows = mgr.shape[1] if mgr.shape[0] > 0 else 0
        index = Index(range(n_rows))

    mgr.set_axis(1, index)
    result = self.obj._constructor(mgr)

    if not keys_as_index:
        self._insert_inaxis_grouper_inplace(result)
        result = result._consolidate()

    if self.axis == 1:
        result = result.T

    # Note: we only need to pass datetime=True in order to get numeric
    #  values converted
    reindexed = self._reindex_output(result)
    return reindexed._convert(datetime=True)
def _iterate_column_groupbys(self, obj: DataFrame | Series):
    """
    Yield ``(column label, SeriesGroupBy)`` for every column of ``obj``.

    Each ``SeriesGroupBy`` is built on the column selected by position
    and reuses this object's ``grouper``, ``exclusions`` and ``observed``.
    """
    for position, label in enumerate(obj.columns):
        # Select by position rather than by label.
        column = obj.iloc[:, position]
        column_groupby = SeriesGroupBy(
            column,
            selection=label,
            grouper=self.grouper,
            exclusions=self.exclusions,
            observed=self.observed,
        )
        yield label, column_groupby
def _apply_to_column_groupbys(self, func, obj: DataFrame | Series) -> DataFrame:
    """
    Apply ``func`` to each per-column groupby of ``obj`` and combine.

    Parameters
    ----------
    func : callable
        Called once with each per-column groupby object.
    obj : DataFrame or Series
        Object whose columns are iterated.

    Returns
    -------
    DataFrame
        The per-column results concatenated along axis 1 and keyed by
        ``obj.columns``. If there are no results, an empty frame with
        those columns, indexed by the grouper's ``result_index``.
    """
    from pandas.core.reshape.concat import concat

    columns = obj.columns
    column_groupbys = self._iterate_column_groupbys(obj)
    results = [func(col_groupby) for _, col_groupby in column_groupbys]

    if results:
        return concat(results, keys=columns, axis=1)

    # concat would raise
    return DataFrame([], columns=columns, index=self.grouper.result_index)
def nunique(self, dropna: bool = True) -> DataFrame:
    """
    Return DataFrame with counts of unique elements in each position.

    Parameters
    ----------
    dropna : bool, default True
        Don't include NaN in the counts.

    Returns
    -------
    nunique: DataFrame

    Examples
    --------
    >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
    ...                           'ham', 'ham'],
    ...                    'value1': [1, 5, 5, 2, 5, 5],
    ...                    'value2': list('abbaxy')})
    >>> df
         id  value1 value2
    0  spam       1      a
    1   egg       5      b
    2   egg       5      b
    3  spam       2      a
    4   ham       5      x
    5   ham       5      y

    >>> df.groupby('id').nunique()
          value1  value2
    id
    egg        1       1
    ham        1       2
    spam       2       1

    Check for rows with the same id but conflicting values:

    >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
         id  value1 value2
    0  spam       1      a
    3  spam       2      a
    4   ham       5      x
    5   ham       5      y
    """

    def count_distinct(sgb):
        # Per-column counter shared by both code paths below.
        return sgb.nunique(dropna)

    if self.axis != 0:
        # see test_groupby_crash_on_nunique
        return self._python_agg_general(count_distinct)

    results = self._apply_to_column_groupbys(
        count_distinct, obj=self._obj_with_exclusions
    )

    if not self.as_index:
        # Replace the index with a positional one, then put the in-axis
        # grouping keys back as columns.
        results.index = Index(range(len(results)))
        self._insert_inaxis_grouper_inplace(results)

    return results
@Appender(DataFrame.idxmax.__doc__)
def idxmax(self, axis=0, skipna: bool = True):
    # No docstring here: ``Appender`` above is handed
    # ``DataFrame.idxmax.__doc__``.
    axis = DataFrame._get_axis_number(axis)
    numeric_only = False if axis != 0 else None

    def func(df):
        # NB: here we use numeric_only=None, in DataFrame it is False GH#38217
        positions = df._reduce(
            nanops.nanargmax,
            "argmax",
            axis=axis,
            skipna=skipna,
            numeric_only=numeric_only,
        )
        labels = df._get_axis(axis)
        # A negative position is mapped to NaN instead of a label.
        found = [
            labels[pos] if pos >= 0 else np.nan for pos in positions._values
        ]
        return df._constructor_sliced(found, index=positions.index)

    func.__name__ = "idxmax"
    return self._python_apply_general(func, self._obj_with_exclusions)
@Appender(DataFrame.idxmin.__doc__)
def idxmin(self, axis=0, skipna: bool = True):
    # No docstring here: ``Appender`` above is handed
    # ``DataFrame.idxmin.__doc__``.
    axis = DataFrame._get_axis_number(axis)
    numeric_only = False if axis != 0 else None

    def func(df):
        # NB: here we use numeric_only=None, in DataFrame it is False GH#38217
        positions = df._reduce(
            nanops.nanargmin,
            "argmin",
            axis=axis,
            skipna=skipna,
            numeric_only=numeric_only,
        )
        labels = df._get_axis(axis)
        # A negative position is mapped to NaN instead of a label.
        found = [
            labels[pos] if pos >= 0 else np.nan for pos in positions._values
        ]
        return df._constructor_sliced(found, index=positions.index)

    func.__name__ = "idxmin"
    return self._python_apply_general(func, self._obj_with_exclusions)
# Bind ``boxplot_frame_groupby`` (defined outside this chunk) under the
# name ``boxplot``.
boxplot = boxplot_frame_groupby
def value_counts(
    self,
    subset: Sequence[Hashable] | None = None,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
    dropna: bool = True,
) -> DataFrame | Series:
    """
    Return a Series or DataFrame containing counts of unique rows.

    .. versionadded:: 1.4.0

    Parameters
    ----------
    subset : list-like, optional
        Columns to use when counting unique combinations.
    normalize : bool, default False
        Return proportions rather than frequencies.
    sort : bool, default True
        Sort by frequencies.
    ascending : bool, default False
        Sort in ascending order.
    dropna : bool, default True
        Don't include counts of rows that contain NA values.

    Returns
    -------
    Series or DataFrame
        Series if the groupby as_index is True, otherwise DataFrame.

    Raises
    ------
    NotImplementedError
        If the groupby was created with ``axis=1``.
    ValueError
        If ``subset`` contains a groupby key column, or a column that is
        not among the selected columns.

    See Also
    --------
    Series.value_counts: Equivalent method on Series.
    DataFrame.value_counts: Equivalent method on DataFrame.
    SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy.

    Notes
    -----
    - If the groupby as_index is True then the returned Series will have a
      MultiIndex with one level per input column.
    - If the groupby as_index is False then the returned DataFrame will have an
      additional column with the value_counts. The column is labelled 'count' or
      'proportion', depending on the ``normalize`` parameter.

    By default, rows that contain any NA values are omitted from
    the result.

    By default, the result will be in descending order so that the
    first element of each group is the most frequently-occurring row.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...    'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
    ...    'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
    ...    'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
    ... })

    >>> df
       gender education country
    0    male       low      US
    1    male    medium      FR
    2  female      high      US
    3    male       low      FR
    4  female      high      FR
    5    male       low      FR

    >>> df.groupby('gender').value_counts()
    gender  education  country
    female  high       FR         1
                       US         1
    male    low        FR         2
                       US         1
            medium     FR         1
    dtype: int64

    >>> df.groupby('gender').value_counts(ascending=True)
    gender  education  country
    female  high       FR         1
                       US         1
    male    low        US         1
            medium     FR         1
            low        FR         2
    dtype: int64

    >>> df.groupby('gender').value_counts(normalize=True)
    gender  education  country
    female  high       FR         0.50
                       US         0.50
    male    low        FR         0.50
                       US         0.25
            medium     FR         0.25
    dtype: float64

    >>> df.groupby('gender', as_index=False).value_counts()
       gender education country  count
    0  female      high      FR      1
    1  female      high      US      1
    2    male       low      FR      2
    3    male       low      US      1
    4    male    medium      FR      1

    >>> df.groupby('gender', as_index=False).value_counts(normalize=True)
       gender education country  proportion
    0  female      high      FR        0.50
    1  female      high      US        0.50
    2    male       low      FR        0.50
    3    male       low      US        0.25
    4    male    medium      FR        0.25
    """
    if self.axis == 1:
        raise NotImplementedError(
            "DataFrameGroupBy.value_counts only handles axis=0"
        )

    # NOTE(review): the context manager presumably controls what
    # ``self._selected_obj`` contains while the block runs - confirm.
    with self._group_selection_context():
        df = self.obj

        # Names of the groupings that are flagged ``in_axis``.
        in_axis_names = {
            grouping.name for grouping in self.grouper.groupings if grouping.in_axis
        }
        if isinstance(self._selected_obj, Series):
            # Single selected column: it becomes the only extra key, unless
            # it is itself one of the in-axis grouping columns.
            name = self._selected_obj.name
            keys = [] if name in in_axis_names else [self._selected_obj]
        else:
            unique_cols = set(self._selected_obj.columns)
            if subset is not None:
                subsetted = set(subset)
                # ``subset`` may neither name a grouping column ...
                clashing = subsetted & set(in_axis_names)
                if clashing:
                    raise ValueError(
                        f"Keys {clashing} in subset cannot be in "
                        "the groupby column keys."
                    )
                # ... nor a column missing from the selected object.
                doesnt_exist = subsetted - unique_cols
                if doesnt_exist:
                    raise ValueError(
                        f"Keys {doesnt_exist} in subset do not "
                        f"exist in the DataFrame."
                    )
            else:
                subsetted = unique_cols

            # One key per selected column that is in ``subsetted`` and is
            # not an in-axis grouping column, taken by position.
            keys = [
                # Can't use .values because the column label needs to be preserved
                self._selected_obj.iloc[:, idx]
                for idx, name in enumerate(self._selected_obj.columns)
                if name not in in_axis_names and name in subsetted
            ]

        # Start from the existing groupings and append the grouping(s) built
        # for every extra key. Note these use this method's ``dropna``
        # argument, whereas the groupby calls below use ``self.dropna``.
        groupings = list(self.grouper.groupings)
        for key in keys:
            grouper, _, _ = get_grouper(
                df,
                key=key,
                axis=self.axis,
                sort=self.sort,
                dropna=dropna,
            )
            groupings += list(grouper.groupings)

        # Take the size of the overall columns
        gb = df.groupby(
            groupings,
            sort=self.sort,
            observed=self.observed,
            dropna=self.dropna,
        )
        result = cast(Series, gb.size())

        if normalize:
            # Normalize the results by dividing by the original group sizes.
            # We are guaranteed to have the first N levels be the
            # user-requested grouping.
            levels = list(range(len(self.grouper.groupings), result.index.nlevels))
            # Dropping the extra levels leaves only the original grouping
            # keys; the transform broadcasts each group's total back onto
            # the index of ``result``.
            indexed_group_size = result.groupby(
                result.index.droplevel(levels),
                sort=self.sort,
                observed=self.observed,
                dropna=self.dropna,
            ).transform("sum")

            result /= indexed_group_size

        if sort:
            # Sort the values and then resort by the main grouping
            index_level = range(len(self.grouper.groupings))
            result = result.sort_values(ascending=ascending).sort_index(
                level=index_level, sort_remaining=False
            )

        if not self.as_index:
            # Convert to frame
            result = result.reset_index(name="proportion" if normalize else "count")
        return result.__finalize__(self.obj, method="value_counts")
def _wrap_transform_general_frame(
    obj: DataFrame, group: DataFrame, res: DataFrame | Series
) -> DataFrame:
    """
    Expand a per-group transform result to a frame shaped like ``group``.

    Parameters
    ----------
    obj : DataFrame
        Supplies the index used for the identity check and the
        constructor used when tiling.
    group : DataFrame
        The group that was transformed; supplies index, columns and shape.
    res : DataFrame or Series
        Result of the transform for ``group``.

    Returns
    -------
    DataFrame
        ``res`` itself when it is not a Series. Otherwise a frame with the
        index and columns of ``group``. If ``res.index`` is ``obj.index``
        the Series is repeated once per column; if not, its values are
        repeated once per row.
    """
    from pandas import concat

    if not isinstance(res, Series):
        return res

    # we need to broadcast across the
    # other dimension; this will preserve dtypes
    # GH14457
    if res.index.is_(obj.index):
        n_columns = len(group.columns)
        res_frame = concat([res] * n_columns, axis=1)
        res_frame.columns = group.columns
        res_frame.index = group.index
    else:
        n_rows = len(group.index)
        tiled = np.concatenate([res.values] * n_rows).reshape(group.shape)
        res_frame = obj._constructor(
            tiled,
            columns=group.columns,
            index=group.index,
        )
    assert isinstance(res_frame, DataFrame)
    return res_frame