"""
 | 
						||
Define the SeriesGroupBy and DataFrameGroupBy
 | 
						||
classes that hold the groupby interfaces (and some implementations).
 | 
						||
 | 
						||
These are user facing as the result of the ``df.groupby(...)`` operations,
 | 
						||
which here returns a DataFrameGroupBy object.
 | 
						||
"""
 | 
						||
from __future__ import annotations
 | 
						||
 | 
						||
from collections import abc
 | 
						||
from functools import partial
 | 
						||
from textwrap import dedent
 | 
						||
from typing import (
 | 
						||
    Any,
 | 
						||
    Callable,
 | 
						||
    Hashable,
 | 
						||
    Iterable,
 | 
						||
    Mapping,
 | 
						||
    NamedTuple,
 | 
						||
    Sequence,
 | 
						||
    TypeVar,
 | 
						||
    Union,
 | 
						||
    cast,
 | 
						||
)
 | 
						||
import warnings
 | 
						||
 | 
						||
import numpy as np
 | 
						||
 | 
						||
from pandas._libs import reduction as libreduction
 | 
						||
from pandas._typing import (
 | 
						||
    ArrayLike,
 | 
						||
    Manager,
 | 
						||
    Manager2D,
 | 
						||
    SingleManager,
 | 
						||
)
 | 
						||
from pandas.util._decorators import (
 | 
						||
    Appender,
 | 
						||
    Substitution,
 | 
						||
    doc,
 | 
						||
)
 | 
						||
from pandas.util._exceptions import find_stack_level
 | 
						||
 | 
						||
from pandas.core.dtypes.common import (
 | 
						||
    ensure_int64,
 | 
						||
    is_bool,
 | 
						||
    is_categorical_dtype,
 | 
						||
    is_dict_like,
 | 
						||
    is_integer_dtype,
 | 
						||
    is_interval_dtype,
 | 
						||
    is_scalar,
 | 
						||
)
 | 
						||
from pandas.core.dtypes.missing import (
 | 
						||
    isna,
 | 
						||
    notna,
 | 
						||
)
 | 
						||
 | 
						||
from pandas.core import (
 | 
						||
    algorithms,
 | 
						||
    nanops,
 | 
						||
)
 | 
						||
from pandas.core.apply import (
 | 
						||
    GroupByApply,
 | 
						||
    maybe_mangle_lambdas,
 | 
						||
    reconstruct_func,
 | 
						||
    validate_func_kwargs,
 | 
						||
)
 | 
						||
from pandas.core.base import SpecificationError
 | 
						||
import pandas.core.common as com
 | 
						||
from pandas.core.construction import create_series_with_explicit_dtype
 | 
						||
from pandas.core.frame import DataFrame
 | 
						||
from pandas.core.generic import NDFrame
 | 
						||
from pandas.core.groupby import base
 | 
						||
from pandas.core.groupby.groupby import (
 | 
						||
    GroupBy,
 | 
						||
    _agg_template,
 | 
						||
    _apply_docs,
 | 
						||
    _transform_template,
 | 
						||
    warn_dropping_nuisance_columns_deprecated,
 | 
						||
)
 | 
						||
from pandas.core.groupby.grouper import get_grouper
 | 
						||
from pandas.core.indexes.api import (
 | 
						||
    Index,
 | 
						||
    MultiIndex,
 | 
						||
    all_indexes_same,
 | 
						||
)
 | 
						||
from pandas.core.series import Series
 | 
						||
from pandas.core.util.numba_ import maybe_use_numba
 | 
						||
 | 
						||
from pandas.plotting import boxplot_frame_groupby
 | 
						||
 | 
						||
# TODO(typing) the return value on this callable should be any *scalar*.
 | 
						||
AggScalar = Union[str, Callable[..., Any]]
 | 
						||
# TODO: validate types on ScalarResult and move to _typing
 | 
						||
# Blocked from using by https://github.com/python/mypy/issues/1484
 | 
						||
# See note at _mangle_lambda_list
 | 
						||
ScalarResult = TypeVar("ScalarResult")
 | 
						||
 | 
						||
 | 
						||
class NamedAgg(NamedTuple):
 | 
						||
    column: Hashable
 | 
						||
    aggfunc: AggScalar
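
# Illustrative usage of NamedAgg (mirrors the "named aggregation" examples in
# the DataFrameGroupBy.agg docstring below); the frame and column names here
# are hypothetical:
#
#     df.groupby("A").agg(
#         b_min=NamedAgg(column="B", aggfunc="min"),
#         c_sum=NamedAgg(column="C", aggfunc="sum"),
#     )
#
# The keyword becomes the output column name; ``column`` selects the input
# column and ``aggfunc`` is the aggregation (a callable or a string alias).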


def generate_property(name: str, klass: type[DataFrame | Series]):
    """
    Create a property for a GroupBy subclass to dispatch to DataFrame/Series.

    Parameters
    ----------
    name : str
    klass : {DataFrame, Series}

    Returns
    -------
    property
    """

    def prop(self):
        return self._make_wrapper(name)

    parent_method = getattr(klass, name)
    prop.__doc__ = parent_method.__doc__ or ""
    prop.__name__ = name
    return property(prop)


def pin_allowlisted_properties(
    klass: type[DataFrame | Series], allowlist: frozenset[str]
):
    """
    Create GroupBy member defs for DataFrame/Series names in an allowlist.

    Parameters
    ----------
    klass : DataFrame or Series class
        class where members are defined.
    allowlist : frozenset[str]
        Set of names of klass methods to be constructed.

    Returns
    -------
    class decorator

    Notes
    -----
    Since we don't want to override methods explicitly defined in the
    base class, any such name is skipped.
    """

    def pinner(cls):
        for name in allowlist:
            if hasattr(cls, name):
                # don't override anything that was explicitly defined
                #  in the base class
                continue

            prop = generate_property(name, klass)
            setattr(cls, name, prop)

        return cls

    return pinner


@pin_allowlisted_properties(Series, base.series_apply_allowlist)
class SeriesGroupBy(GroupBy[Series]):
    _apply_allowlist = base.series_apply_allowlist

    def _wrap_agged_manager(self, mgr: Manager) -> Series:
        if mgr.ndim == 1:
            mgr = cast(SingleManager, mgr)
            single = mgr
        else:
            mgr = cast(Manager2D, mgr)
            single = mgr.iget(0)
        ser = self.obj._constructor(single, name=self.obj.name)
        # NB: caller is responsible for setting ser.index
        return ser

    def _get_data_to_aggregate(self) -> SingleManager:
        ser = self._obj_with_exclusions
        single = ser._mgr
        return single

    def _iterate_slices(self) -> Iterable[Series]:
        yield self._selected_obj

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> s = pd.Series([1, 2, 3, 4])

    >>> s
    0    1
    1    2
    2    3
    3    4
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).min()
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg('min')
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max'])
       min  max
    1    1    2
    2    3    4

    The output column names can be controlled by passing
    the desired column names and aggregations as keyword arguments.

    >>> s.groupby([1, 1, 2, 2]).agg(
    ...     minimum='min',
    ...     maximum='max',
    ... )
       minimum  maximum
    1        1        2
    2        3        4

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating function.

    >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min())
    1    1.0
    2    3.0
    dtype: float64
    """
    )

    @Appender(
        _apply_docs["template"].format(
            input="series", examples=_apply_docs["series_examples"]
        )
    )
    def apply(self, func, *args, **kwargs):
        return super().apply(func, *args, **kwargs)

    @doc(_agg_template, examples=_agg_examples_doc, klass="Series")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):

        if maybe_use_numba(engine):
            with self._group_selection_context():
                data = self._selected_obj
            result = self._aggregate_with_numba(
                data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs
            )
            index = self.grouper.result_index
            return self.obj._constructor(result.ravel(), index=index, name=data.name)

        relabeling = func is None
        columns = None
        if relabeling:
            columns, func = validate_func_kwargs(kwargs)
            kwargs = {}

        if isinstance(func, str):
            return getattr(self, func)(*args, **kwargs)

        elif isinstance(func, abc.Iterable):
            # Catch instances of lists / tuples
            # but not the class list / tuple itself.
            func = maybe_mangle_lambdas(func)
            ret = self._aggregate_multiple_funcs(func)
            if relabeling:
                # error: Incompatible types in assignment (expression has type
                # "Optional[List[str]]", variable has type "Index")
                ret.columns = columns  # type: ignore[assignment]
            return ret

        else:
            cyfunc = com.get_cython_func(func)
            if cyfunc and not args and not kwargs:
                return getattr(self, cyfunc)()

            if self.grouper.nkeys > 1:
                return self._python_agg_general(func, *args, **kwargs)

            try:
                return self._python_agg_general(func, *args, **kwargs)
            except KeyError:
                # TODO: KeyError is raised in _python_agg_general,
                #  see test_groupby.test_basic
                result = self._aggregate_named(func, *args, **kwargs)

                # result is a dict whose keys are the elements of result_index
                index = self.grouper.result_index
                return create_series_with_explicit_dtype(
                    result, index=index, dtype_if_empty=object
                )

    agg = aggregate

    def _aggregate_multiple_funcs(self, arg) -> DataFrame:
        if isinstance(arg, dict):

            # show the deprecation, but only if we
            # have not shown a higher level one
            # GH 15931
            raise SpecificationError("nested renamer is not supported")

        elif any(isinstance(x, (tuple, list)) for x in arg):
            arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]

            # indicated column order
            columns = next(zip(*arg))
        else:
            # list of functions / function names
            columns = []
            for f in arg:
                columns.append(com.get_callable_name(f) or f)

            arg = zip(columns, arg)
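            # Illustrative: an input like ['min', lambda x: x.max()] is
            # normalized to (label, func) pairs such as
            # [('min', 'min'), ('<lambda>', <function>)], so each output
            # column is labeled by the function's (possibly mangled) name.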

        results: dict[base.OutputKey, DataFrame | Series] = {}
        for idx, (name, func) in enumerate(arg):

            key = base.OutputKey(label=name, position=idx)
            results[key] = self.aggregate(func)

        if any(isinstance(x, DataFrame) for x in results.values()):
            from pandas import concat

            res_df = concat(
                results.values(), axis=1, keys=[key.label for key in results.keys()]
            )
            return res_df

        indexed_output = {key.position: val for key, val in results.items()}
        output = self.obj._constructor_expanddim(indexed_output, index=None)
        output.columns = Index(key.label for key in results)

        output = self._reindex_output(output)
        return output

    def _indexed_output_to_ndframe(
        self, output: Mapping[base.OutputKey, ArrayLike]
    ) -> Series:
        """
        Wrap the dict result of a GroupBy aggregation into a Series.
        """
        assert len(output) == 1
        values = next(iter(output.values()))
        result = self.obj._constructor(values)
        result.name = self.obj.name
        return result

    def _wrap_applied_output(
        self,
        data: Series,
        values: list[Any],
        not_indexed_same: bool = False,
    ) -> DataFrame | Series:
        """
        Wrap the output of SeriesGroupBy.apply into the expected result.

        Parameters
        ----------
        data : Series
            Input data for groupby operation.
        values : List[Any]
            Applied output for each group.
        not_indexed_same : bool, default False
            Whether the applied outputs are not indexed the same as the group axes.

        Returns
        -------
        DataFrame or Series
        """
        if len(values) == 0:
            # GH #6265
            return self.obj._constructor(
                [],
                name=self.obj.name,
                index=self.grouper.result_index,
                dtype=data.dtype,
            )
        assert values is not None

        if isinstance(values[0], dict):
            # GH #823 #24880
            index = self.grouper.result_index
            res_df = self.obj._constructor_expanddim(values, index=index)
            res_df = self._reindex_output(res_df)
            # if self.observed is False,
            # keep all-NaN rows created while re-indexing
            res_ser = res_df.stack(dropna=self.observed)
            res_ser.name = self.obj.name
            return res_ser
        elif isinstance(values[0], (Series, DataFrame)):
            return self._concat_objects(values, not_indexed_same=not_indexed_same)
        else:
            # GH #6265 #24880
            result = self.obj._constructor(
                data=values, index=self.grouper.result_index, name=self.obj.name
            )
            return self._reindex_output(result)

    def _aggregate_named(self, func, *args, **kwargs):
        # Note: this is very similar to _aggregate_series_pure_python,
        #  but that does not pin group.name
        result = {}
        initialized = False

        for name, group in self:
            object.__setattr__(group, "name", name)

            output = func(group, *args, **kwargs)
            output = libreduction.extract_result(output)
            if not initialized:
                # We only do this validation on the first iteration
                libreduction.check_result_array(output, group.dtype)
                initialized = True
            result[name] = output

        return result

    @Substitution(klass="Series")
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    def _cython_transform(
        self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
    ):
        assert axis == 0  # handled by caller

        obj = self._selected_obj

        try:
            result = self.grouper._cython_operation(
                "transform", obj._values, how, axis, **kwargs
            )
        except NotImplementedError as err:
            raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err

        return obj._constructor(result, index=self.obj.index, name=obj.name)

    def _transform_general(self, func: Callable, *args, **kwargs) -> Series:
        """
        Transform with a callable ``func``.
        """
        assert callable(func)
        klass = type(self.obj)

        results = []
        for name, group in self:
            # this setattr is needed for test_transform_lambda_with_datetimetz
            object.__setattr__(group, "name", name)
            res = func(group, *args, **kwargs)

            results.append(klass(res, index=group.index))

        # check for empty "results" to avoid concat ValueError
        if results:
            from pandas.core.reshape.concat import concat

            concatenated = concat(results)
            result = self._set_result_index_ordered(concatenated)
        else:
            result = self.obj._constructor(dtype=np.float64)

        result.name = self.obj.name
        return result

    def _can_use_transform_fast(self, result) -> bool:
        return True

    def filter(self, func, dropna: bool = True, *args, **kwargs):
        """
        Return a copy of a Series excluding elements from groups that
        do not satisfy the boolean criterion specified by func.

        Parameters
        ----------
        func : function
            To apply to each group. Should return True or False.
        dropna : bool, default True
            Drop groups that do not pass the filter. If False, groups that
            evaluate False are filled with NaNs.

        Returns
        -------
        filtered : Series

        Notes
        -----
        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
        1    2
        3    4
        5    6
        Name: B, dtype: int64
        """
        if isinstance(func, str):
            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
        else:
            wrapper = lambda x: func(x, *args, **kwargs)

        # Interpret np.nan as False.
        def true_and_notna(x) -> bool:
            b = wrapper(x)
            return b and notna(b)

        try:
            indices = [
                self._get_index(name) for name, group in self if true_and_notna(group)
            ]
        except (ValueError, TypeError) as err:
            raise TypeError("the filter must return a boolean result") from err

        filtered = self._apply_filter(indices, dropna)
        return filtered

    def nunique(self, dropna: bool = True) -> Series:
        """
        Return the number of unique elements in each group.

        Returns
        -------
        Series
            Number of unique values within each group.
        """
        ids, _, _ = self.grouper.group_info

        val = self.obj._values

        codes, _ = algorithms.factorize(val, sort=False)
        sorter = np.lexsort((codes, ids))
        codes = codes[sorter]
        ids = ids[sorter]

        # group boundaries are where group ids change
        # unique observations are where sorted values change
        idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
        inc = np.r_[1, codes[1:] != codes[:-1]]

        # 1st item of each group is a new unique observation
        mask = codes == -1
        if dropna:
            inc[idx] = 1
            inc[mask] = 0
        else:
            inc[mask & np.r_[False, mask[:-1]]] = 0
            inc[idx] = 1

        out = np.add.reduceat(inc, idx).astype("int64", copy=False)
        if len(ids):
            # NaN/NaT group exists if the head of ids is -1,
            # so remove it from res and exclude its index from idx
            if ids[0] == -1:
                res = out[1:]
                idx = idx[np.flatnonzero(idx)]
            else:
                res = out
        else:
            res = out[1:]
        ri = self.grouper.result_index

        # we might have duplications among the bins
        if len(res) != len(ri):
            res, out = np.zeros(len(ri), dtype=out.dtype), res
            res[ids[idx]] = out

        result = self.obj._constructor(res, index=ri, name=self.obj.name)
        return self._reindex_output(result, fill_value=0)

    @doc(Series.describe)
    def describe(self, **kwargs):
        return super().describe(**kwargs)

    def value_counts(
        self,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        bins=None,
        dropna: bool = True,
    ):
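        # The result is a Series of counts indexed by a MultiIndex of the
        # group keys plus the observed values (or interval bins when ``bins``
        # is given). Illustrative, on assumed data: grouping [1, 1, 2] over
        # values ['a', 'b', 'a'] yields index entries (1, 'a'), (1, 'b'),
        # (2, 'a') with counts [1, 1, 1].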

        from pandas.core.reshape.merge import get_join_indexers
        from pandas.core.reshape.tile import cut

        ids, _, _ = self.grouper.group_info
        val = self.obj._values

        def apply_series_value_counts():
            return self.apply(
                Series.value_counts,
                normalize=normalize,
                sort=sort,
                ascending=ascending,
                bins=bins,
            )

        if bins is not None:
            if not np.iterable(bins):
                # scalar bins cannot be done at top level
                # in a backward compatible way
                return apply_series_value_counts()
        elif is_categorical_dtype(val.dtype):
            # GH38672
            return apply_series_value_counts()

        # groupby removes null keys from groupings
        mask = ids != -1
        ids, val = ids[mask], val[mask]

        if bins is None:
            lab, lev = algorithms.factorize(val, sort=True)
            llab = lambda lab, inc: lab[inc]
        else:

            # lab is a Categorical with categories an IntervalIndex
            lab = cut(Series(val), bins, include_lowest=True)
            # error: "ndarray" has no attribute "cat"
            lev = lab.cat.categories  # type: ignore[attr-defined]
            # error: No overload variant of "take" of "_ArrayOrScalarCommon" matches
            # argument types "Any", "bool", "Union[Any, float]"
            lab = lev.take(  # type: ignore[call-overload]
                # error: "ndarray" has no attribute "cat"
                lab.cat.codes,  # type: ignore[attr-defined]
                allow_fill=True,
                # error: Item "ndarray" of "Union[ndarray, Index]" has no attribute
                # "_na_value"
                fill_value=lev._na_value,  # type: ignore[union-attr]
            )
            llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]

        if is_interval_dtype(lab.dtype):
            # TODO: should we do this inside II?

            # error: "ndarray" has no attribute "left"
            # error: "ndarray" has no attribute "right"
            sorter = np.lexsort(
                (lab.left, lab.right, ids)  # type: ignore[attr-defined]
            )
        else:
            sorter = np.lexsort((lab, ids))

        ids, lab = ids[sorter], lab[sorter]

        # group boundaries are where group ids change
        idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0]
        idx = np.r_[0, idchanges]
        if not len(ids):
            idx = idchanges

        # new values are where sorted labels change
        lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
        inc = np.r_[True, lchanges]
        if not len(val):
            inc = lchanges
        inc[idx] = True  # group boundaries are also new values
        out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts

        # num. of times each group should be repeated
        rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

        # multi-index components
        codes = self.grouper.reconstructed_codes
        codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
        # error: List item 0 has incompatible type "Union[ndarray[Any, Any], Index]";
        # expected "Index"
        levels = [ping.group_index for ping in self.grouper.groupings] + [
            lev  # type: ignore[list-item]
        ]
        names = self.grouper.names + [self.obj.name]

        if dropna:
            mask = codes[-1] != -1
            if mask.all():
                dropna = False
            else:
                out, codes = out[mask], [level_codes[mask] for level_codes in codes]

        if normalize:
            out = out.astype("float")
            d = np.diff(np.r_[idx, len(ids)])
            if dropna:
                m = ids[lab == -1]
                np.add.at(d, m, -1)
                acc = rep(d)[mask]
            else:
                acc = rep(d)
            out /= acc

        if sort and bins is None:
            cat = ids[inc][mask] if dropna else ids[inc]
            sorter = np.lexsort((out if ascending else -out, cat))
            out, codes[-1] = out[sorter], codes[-1][sorter]

        if bins is not None:
            # for compat. with libgroupby.value_counts need to ensure every
            # bin is present at every index level, null filled with zeros
            diff = np.zeros(len(out), dtype="bool")
            for level_codes in codes[:-1]:
                diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]

            ncat, nbin = diff.sum(), len(levels[-1])

            left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]

            right = [diff.cumsum() - 1, codes[-1]]

            _, idx = get_join_indexers(left, right, sort=False, how="left")
            out = np.where(idx != -1, out[idx], 0)

            if sort:
                sorter = np.lexsort((out if ascending else -out, left[0]))
                out, left[-1] = out[sorter], left[-1][sorter]

            # build the multi-index w/ full levels
            def build_codes(lev_codes: np.ndarray) -> np.ndarray:
                return np.repeat(lev_codes[diff], nbin)

            codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
            codes.append(left[-1])

        mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False)

        if is_integer_dtype(out.dtype):
            out = ensure_int64(out)
        return self.obj._constructor(out, index=mi, name=self.obj.name)

    @doc(Series.nlargest)
    def nlargest(self, n: int = 5, keep: str = "first"):
        f = partial(Series.nlargest, n=n, keep=keep)
        data = self._obj_with_exclusions
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.nsmallest)
    def nsmallest(self, n: int = 5, keep: str = "first"):
        f = partial(Series.nsmallest, n=n, keep=keep)
        data = self._obj_with_exclusions
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result


@pin_allowlisted_properties(DataFrame, base.dataframe_apply_allowlist)
class DataFrameGroupBy(GroupBy[DataFrame]):

    _apply_allowlist = base.dataframe_apply_allowlist

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {
    ...         "A": [1, 1, 2, 2],
    ...         "B": [1, 2, 3, 4],
    ...         "C": [0.362838, 0.227877, 1.267767, -0.562860],
    ...     }
    ... )

    >>> df
       A  B         C
    0  1  1  0.362838
    1  1  2  0.227877
    2  2  3  1.267767
    3  2  4 -0.562860

    The aggregation is for each column.

    >>> df.groupby('A').agg('min')
       B         C
    A
    1  1  0.227877
    2  3 -0.562860

    Multiple aggregations

    >>> df.groupby('A').agg(['min', 'max'])
        B             C
      min max       min       max
    A
    1   1   2  0.227877  0.362838
    2   3   4 -0.562860  1.267767

    Select a column for aggregation

    >>> df.groupby('A').B.agg(['min', 'max'])
       min  max
    A
    1    1    2
    2    3    4

    Different aggregations per column

    >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
        B             C
      min max       sum
    A
    1   1   2  0.590715
    2   3   4  0.704907

    To control the output names with different aggregations per column,
    pandas supports "named aggregation"

    >>> df.groupby("A").agg(
    ...     b_min=pd.NamedAgg(column="B", aggfunc="min"),
    ...     c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
       b_min     c_sum
    A
    1      1  0.590715
    2      3  0.704907

    - The keywords are the *output* column names
    - The values are tuples whose first element is the column to select
      and the second element is the aggregation to apply to that column.
      Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
      ``['column', 'aggfunc']`` to make it clearer what the arguments are.
      As usual, the aggregation can be a callable or a string alias.

    See :ref:`groupby.aggregate.named` for more.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating function.

    >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min())
          B
    A
    1   1.0
    2   3.0
    """
    )

    @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):

        if maybe_use_numba(engine):
            with self._group_selection_context():
                data = self._selected_obj
            result = self._aggregate_with_numba(
                data, func, *args, engine_kwargs=engine_kwargs, **kwargs
            )
            index = self.grouper.result_index
            return self.obj._constructor(result, index=index, columns=data.columns)

        relabeling, func, columns, order = reconstruct_func(func, **kwargs)
        func = maybe_mangle_lambdas(func)

        op = GroupByApply(self, func, args, kwargs)
        result = op.agg()
        if not is_dict_like(func) and result is not None:
            return result
        elif relabeling and result is not None:
            # this should be the only (non-raising) case with relabeling
            # use the reordered index of columns
            result = result.iloc[:, order]
            result.columns = columns

        if result is None:

            # grouper specific aggregations
            if self.grouper.nkeys > 1:
                # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
                return self._python_agg_general(func, *args, **kwargs)
            elif args or kwargs:
                # test_pass_args_kwargs gets here (with and without as_index)
                # can't return early
                result = self._aggregate_frame(func, *args, **kwargs)

            elif self.axis == 1:
                # _aggregate_multiple_funcs does not allow self.axis == 1
                # Note: axis == 1 precludes 'not self.as_index', see __init__
                result = self._aggregate_frame(func)
                return result

            else:

                # try to treat as if we are passing a list
                gba = GroupByApply(self, [func], args=(), kwargs={})
                try:
                    result = gba.agg()

                except ValueError as err:
                    if "no results" not in str(err):
                        # raised directly by _aggregate_multiple_funcs
                        raise
                    result = self._aggregate_frame(func)

                else:
                    sobj = self._selected_obj

                    if isinstance(sobj, Series):
                        # GH#35246 test_groupby_as_index_select_column_sum_empty_df
                        result.columns = self._obj_with_exclusions.columns.copy()
                    else:
                        # Retain our column names
                        result.columns._set_names(
                            sobj.columns.names, level=list(range(sobj.columns.nlevels))
                        )
                        # select everything except for the last level, which is the one
                        # containing the name of the function(s), see GH#32040
                        result.columns = result.columns.droplevel(-1)

        if not self.as_index:
            self._insert_inaxis_grouper_inplace(result)
            result.index = Index(range(len(result)))

        return result

    agg = aggregate

    def _iterate_slices(self) -> Iterable[Series]:
        obj = self._selected_obj
        if self.axis == 1:
            obj = obj.T

        if isinstance(obj, Series) and obj.name not in self.exclusions:
            # Occurs when doing DataFrameGroupBy(...)["X"]
            yield obj
        else:
            for label, values in obj.items():
                if label in self.exclusions:
                    continue

                yield values

    def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
        if self.grouper.nkeys != 1:
            raise AssertionError("Number of keys must be 1")

        obj = self._obj_with_exclusions

        result: dict[Hashable, NDFrame | np.ndarray] = {}
        if self.axis == 0:
            # test_pass_args_kwargs_duplicate_columns gets here with non-unique columns
            for name, data in self:
                fres = func(data, *args, **kwargs)
                result[name] = fres
        else:
            # we get here in a number of test_multilevel tests
            for name in self.indices:
                grp_df = self.get_group(name, obj=obj)
                fres = func(grp_df, *args, **kwargs)
                result[name] = fres

        result_index = self.grouper.result_index
        other_ax = obj.axes[1 - self.axis]
        out = self.obj._constructor(result, index=other_ax, columns=result_index)
        if self.axis == 0:
            out = out.T

        return out

    def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:
        # only for axis==0
        # tests that get here with non-unique cols:
        #  test_resample_with_timedelta_yields_no_empty_groups,
        #  test_resample_apply_product

        obj = self._obj_with_exclusions
        result: dict[int, NDFrame] = {}

        for i, (item, sgb) in enumerate(self._iterate_column_groupbys(obj)):
            result[i] = sgb.aggregate(func, *args, **kwargs)

        res_df = self.obj._constructor(result)
        res_df.columns = obj.columns
        return res_df

    def _wrap_applied_output(
        self, data: DataFrame, values: list, not_indexed_same: bool = False
    ):

        if len(values) == 0:
            result = self.obj._constructor(
                index=self.grouper.result_index, columns=data.columns
            )
            result = result.astype(data.dtypes, copy=False)
            return result

        # GH12824
        first_not_none = next(com.not_none(*values), None)

        if first_not_none is None:
            # GH9684 - All values are None, return an empty frame.
            return self.obj._constructor()
        elif isinstance(first_not_none, DataFrame):
            return self._concat_objects(values, not_indexed_same=not_indexed_same)

        key_index = self.grouper.result_index if self.as_index else None

        if isinstance(first_not_none, (np.ndarray, Index)):
            # GH#1738: values is list of arrays of unequal lengths
            #  fall through to the outer else clause
            # TODO: sure this is right?  we used to do this
            #  after raising AttributeError above
            return self.obj._constructor_sliced(
                values, index=key_index, name=self._selection
            )
        elif not isinstance(first_not_none, Series):
            # values are not series or array-like but scalars
            # self._selection not passed through to Series as the
            # result should not take the name of original selection
            # of columns
            if self.as_index:
                return self.obj._constructor_sliced(values, index=key_index)
            else:
                result = self.obj._constructor(values, columns=[self._selection])
                self._insert_inaxis_grouper_inplace(result)
                return result
        else:
            # values are Series
            return self._wrap_applied_output_series(
                values, not_indexed_same, first_not_none, key_index
            )

    def _wrap_applied_output_series(
        self,
        values: list[Series],
        not_indexed_same: bool,
        first_not_none,
        key_index,
    ) -> DataFrame | Series:
        # this is to silence a DeprecationWarning
        # TODO(2.0): Remove when default dtype of empty Series is object
        kwargs = first_not_none._construct_axes_dict()
        backup = create_series_with_explicit_dtype(dtype_if_empty=object, **kwargs)
        values = [x if (x is not None) else backup for x in values]

        all_indexed_same = all_indexes_same(x.index for x in values)

        # GH3596
        # provide a reduction (Frame -> Series) if groups are
        # unique
        if self.squeeze:
            applied_index = self._selected_obj._get_axis(self.axis)
            singular_series = len(values) == 1 and applied_index.nlevels == 1

            if singular_series:
                # GH2893
                # we have series in the values array, we want to
                # produce a series:
                # if any of the sub-series are not indexed the same
                # OR we don't have a multi-index and we have only a
                # single values
                return self._concat_objects(values, not_indexed_same=not_indexed_same)

            # still a series
            # path added as of GH 5545
            elif all_indexed_same:
                from pandas.core.reshape.concat import concat

                return concat(values)

        if not all_indexed_same:
            # GH 8467
            return self._concat_objects(values, not_indexed_same=True)

        # Combine values
        # vstack+constructor is faster than concat and handles MI-columns
        stacked_values = np.vstack([np.asarray(v) for v in values])

        if self.axis == 0:
            index = key_index
            columns = first_not_none.index.copy()
            if columns.name is None:
                # GH6124 - propagate name of Series when it's consistent
                names = {v.name for v in values}
                if len(names) == 1:
                    columns.name = list(names)[0]
        else:
            index = first_not_none.index
            columns = key_index
            stacked_values = stacked_values.T

        if stacked_values.dtype == object:
            # We'll have the DataFrame constructor do inference
            stacked_values = stacked_values.tolist()
        result = self.obj._constructor(stacked_values, index=index, columns=columns)

        if not self.as_index:
            self._insert_inaxis_grouper_inplace(result)

        return self._reindex_output(result)

    def _cython_transform(
        self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
    ) -> DataFrame:
        assert axis == 0  # handled by caller
        # TODO: no tests with self.ndim == 1 for DataFrameGroupBy

        # With self.axis == 0, we have multi-block tests
        #  e.g. test_rank_min_int, test_cython_transform_frame
        #  test_transform_numeric_ret
        # With self.axis == 1, _get_data_to_aggregate does a transpose
        #  so we always have a single block.
        mgr: Manager2D = self._get_data_to_aggregate()
        if numeric_only:
            mgr = mgr.get_numeric_data(copy=False)

        def arr_func(bvalues: ArrayLike) -> ArrayLike:
            return self.grouper._cython_operation(
                "transform", bvalues, how, 1, **kwargs
            )

        # We could use `mgr.apply` here and not have to set_axis, but
        #  we would have to do shape gymnastics for ArrayManager compat
        res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
        res_mgr.set_axis(1, mgr.axes[1])

        if len(res_mgr) < len(mgr):
            warn_dropping_nuisance_columns_deprecated(type(self), how)

        res_df = self.obj._constructor(res_mgr)
        if self.axis == 1:
            res_df = res_df.T
        return res_df

    def _transform_general(self, func, *args, **kwargs):
        from pandas.core.reshape.concat import concat

        applied = []
        obj = self._obj_with_exclusions
        gen = self.grouper.get_iterator(obj, axis=self.axis)
        fast_path, slow_path = self._define_paths(func, *args, **kwargs)

        # Determine whether to use slow or fast path by evaluating on the first group.
        # Need to handle the case of an empty generator and process the result so that
        # it does not need to be computed again.
        try:
            name, group = next(gen)
        except StopIteration:
            pass
        else:
            object.__setattr__(group, "name", name)
            try:
                path, res = self._choose_path(fast_path, slow_path, group)
            except TypeError:
                return self._transform_item_by_item(obj, fast_path)
            except ValueError as err:
                msg = "transform must return a scalar value for each group"
                raise ValueError(msg) from err
            if group.size > 0:
                res = _wrap_transform_general_frame(self.obj, group, res)
                applied.append(res)

        # Compute and process with the remaining groups
        for name, group in gen:
            if group.size == 0:
                continue
            object.__setattr__(group, "name", name)
            res = path(group)
            res = _wrap_transform_general_frame(self.obj, group, res)
            applied.append(res)

        concat_index = obj.columns if self.axis == 0 else obj.index
        other_axis = 1 if self.axis == 0 else 0  # switches between 0 & 1
        concatenated = concat(applied, axis=self.axis, verify_integrity=False)
        concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False)
        return self._set_result_index_ordered(concatenated)

    @Substitution(klass="DataFrame")
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    def _can_use_transform_fast(self, result) -> bool:
        return isinstance(result, DataFrame) and result.columns.equals(
            self._obj_with_exclusions.columns
        )

    def _define_paths(self, func, *args, **kwargs):
        if isinstance(func, str):
            fast_path = lambda group: getattr(group, func)(*args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis
            )
        else:
            fast_path = lambda group: func(group, *args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: func(x, *args, **kwargs), axis=self.axis
            )
        return fast_path, slow_path
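
    # Note on the two paths (illustrative): for func='rank' the fast path
    # calls group.rank(...) on the whole sub-frame at once, while the slow
    # path applies it column-by-column (row-by-row for axis=1) via
    # group.apply. _choose_path below runs both on the first group and keeps
    # the fast path only when its result matches the slow path's result.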
 | 
						||
 | 
						||
    def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame):
        path = slow_path
        res = slow_path(group)

        # if we make it here, test if we can use the fast path
        try:
            res_fast = fast_path(group)
        except AssertionError:
            raise  # pragma: no cover
        except Exception:
            # GH#29631 For user-defined function, we can't predict what may be
            #  raised; see test_transform.test_transform_fastpath_raises
            return path, res

        # verify fast path does not change columns (and names), otherwise
        # its results cannot be joined with those of the slow path
        if not isinstance(res_fast, DataFrame):
            return path, res

        if not res_fast.columns.equals(group.columns):
            return path, res

        if res_fast.equals(res):
            path = fast_path

        return path, res

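    # Hedged sketch of the dispatch above (hypothetical usage): the fast path
    # is adopted only when it reproduces the slow path's result exactly.
    #
    #     fast, slow = self._define_paths(lambda x: x * 2)
    #     path, res = self._choose_path(fast, slow, group)
    #     # ``path`` is ``fast`` only if ``fast(group)`` is a DataFrame with
    #     # the same columns and values as ``slow(group)``; any exception or
    #     # mismatch keeps the slow path.
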
    def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
        # iterate through columns, see test_transform_exclude_nuisance
        #  gets here with non-unique columns
        output = {}
        inds = []
        for i, (colname, sgb) in enumerate(self._iterate_column_groupbys(obj)):
            try:
                output[i] = sgb.transform(wrapper)
            except TypeError:
                # e.g. trying to call nanmean with string values
                warn_dropping_nuisance_columns_deprecated(type(self), "transform")
            else:
                inds.append(i)

        if not output:
            raise TypeError("Transform function invalid for data types")

        columns = obj.columns.take(inds)

        result = self.obj._constructor(output, index=obj.index)
        result.columns = columns
        return result

    def filter(self, func, dropna=True, *args, **kwargs):
        """
        Return a copy of a DataFrame excluding filtered elements.

        Elements from groups are filtered if they do not satisfy the
        boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Function to apply to each subframe. Should return True or False.
        dropna : bool, default True
            Drop groups that do not pass the filter. If False, groups that
            evaluate False are filled with NaNs.

        Returns
        -------
        filtered : DataFrame

        Notes
        -----
        Each subframe is endowed with the attribute 'name' in case you need
        to know which group you are working on.

        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> grouped.filter(lambda x: x['B'].mean() > 3.)
             A  B    C
        1  bar  2  5.0
        3  bar  4  1.0
        5  bar  6  9.0
        """
        indices = []

        obj = self._selected_obj
        gen = self.grouper.get_iterator(obj, axis=self.axis)

        for name, group in gen:
            object.__setattr__(group, "name", name)

            res = func(group, *args, **kwargs)

            try:
                res = res.squeeze()
            except AttributeError:  # allow e.g., scalars and frames to pass
                pass

            # interpret the result of the filter
            if is_bool(res) or (is_scalar(res) and isna(res)):
                if res and notna(res):
                    indices.append(self._get_index(name))
            else:
                # non scalars aren't allowed
                raise TypeError(
                    f"filter function returned a {type(res).__name__}, "
                    "but expected a scalar bool"
                )

        return self._apply_filter(indices, dropna)

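    # Hedged doctest-style sketch (expected output under the documented
    # ``dropna=False`` semantics, reusing the docstring's ``df``): groups that
    # fail the criterion are kept but filled with NaN rather than dropped:
    #
    #     >>> grouped.filter(lambda x: x['B'].mean() > 3., dropna=False)
    #          A    B    C
    #     0  NaN  NaN  NaN
    #     1  bar  2.0  5.0
    #     2  NaN  NaN  NaN
    #     3  bar  4.0  1.0
    #     4  NaN  NaN  NaN
    #     5  bar  6.0  9.0
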
    def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy:
        if self.axis == 1:
            # GH 37725
            raise ValueError("Cannot subset columns when using axis=1")
        # per GH 23566
        if isinstance(key, tuple) and len(key) > 1:
            # if len == 1, then it becomes a SeriesGroupBy and this is actually
            # valid syntax, so don't raise warning
            warnings.warn(
                "Indexing with multiple keys (implicitly converted to a tuple "
                "of keys) will be deprecated, use a list instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        return super().__getitem__(key)

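    # Illustrative (assumed example): the deprecation above distinguishes
    #
    #     df.groupby("A")["B", "C"]    # tuple of keys -> FutureWarning
    #     df.groupby("A")[["B", "C"]]  # list of keys -> no warning
    #     df.groupby("A")["B"]         # single key -> SeriesGroupBy, no warning
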
    def _gotitem(self, key, ndim: int, subset=None):
        """
        sub-classes to define
        return a sliced object

        Parameters
        ----------
        key : string / list of selections
        ndim : {1, 2}
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        if ndim == 2:
            if subset is None:
                subset = self.obj
            return DataFrameGroupBy(
                subset,
                self.grouper,
                axis=self.axis,
                level=self.level,
                grouper=self.grouper,
                exclusions=self.exclusions,
                selection=key,
                as_index=self.as_index,
                sort=self.sort,
                group_keys=self.group_keys,
                squeeze=self.squeeze,
                observed=self.observed,
                mutated=self.mutated,
                dropna=self.dropna,
            )
        elif ndim == 1:
            if subset is None:
                subset = self.obj[key]
            return SeriesGroupBy(
                subset,
                level=self.level,
                grouper=self.grouper,
                selection=key,
                sort=self.sort,
                group_keys=self.group_keys,
                squeeze=self.squeeze,
                observed=self.observed,
                dropna=self.dropna,
            )

        raise AssertionError("invalid ndim for _gotitem")

    def _get_data_to_aggregate(self) -> Manager2D:
        obj = self._obj_with_exclusions
        if self.axis == 1:
            return obj.T._mgr
        else:
            return obj._mgr

    def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None:
        # zip in reverse so we can always insert at loc 0
        columns = result.columns
        for name, lev, in_axis in zip(
            reversed(self.grouper.names),
            reversed(self.grouper.get_group_levels()),
            reversed([grp.in_axis for grp in self.grouper.groupings]),
        ):
            # GH #28549
            # When using .apply(-), name will be in columns already
            if in_axis and name not in columns:
                result.insert(0, name, lev)

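    # Why the reversed zip above: each grouper column is inserted at loc 0,
    # so inserting in reverse preserves the original ordering. Illustrative:
    # for grouper names ["a", "b"], "b" is inserted first and "a" then lands
    # in front of it, leaving the result's leading columns as ["a", "b"].
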
    def _indexed_output_to_ndframe(
        self, output: Mapping[base.OutputKey, ArrayLike]
    ) -> DataFrame:
        """
        Wrap the dict result of a GroupBy aggregation into a DataFrame.
        """
        indexed_output = {key.position: val for key, val in output.items()}
        columns = Index([key.label for key in output])
        columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names)

        result = self.obj._constructor(indexed_output)
        result.columns = columns
        return result

    def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
        if not self.as_index:
            # GH 41998 - empty mgr always gets index of length 0
            rows = mgr.shape[1] if mgr.shape[0] > 0 else 0
            index = Index(range(rows))
            mgr.set_axis(1, index)
            result = self.obj._constructor(mgr)

            self._insert_inaxis_grouper_inplace(result)
            result = result._consolidate()
        else:
            index = self.grouper.result_index
            mgr.set_axis(1, index)
            result = self.obj._constructor(mgr)

        if self.axis == 1:
            result = result.T

        # Note: we only need to pass datetime=True in order to get numeric
        #  values converted
        return self._reindex_output(result)._convert(datetime=True)

    def _iterate_column_groupbys(self, obj: DataFrame | Series):
        for i, colname in enumerate(obj.columns):
            yield colname, SeriesGroupBy(
                obj.iloc[:, i],
                selection=colname,
                grouper=self.grouper,
                exclusions=self.exclusions,
                observed=self.observed,
            )

    def _apply_to_column_groupbys(self, func, obj: DataFrame | Series) -> DataFrame:
        from pandas.core.reshape.concat import concat

        columns = obj.columns
        results = [
            func(col_groupby) for _, col_groupby in self._iterate_column_groupbys(obj)
        ]

        if not len(results):
            # concat would raise
            return DataFrame([], columns=columns, index=self.grouper.result_index)
        else:
            return concat(results, keys=columns, axis=1)

    def nunique(self, dropna: bool = True) -> DataFrame:
        """
        Return DataFrame with counts of unique elements in each position.

        Parameters
        ----------
        dropna : bool, default True
            Don't include NaN in the counts.

        Returns
        -------
        nunique : DataFrame

        Examples
        --------
        >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
        ...                           'ham', 'ham'],
        ...                    'value1': [1, 5, 5, 2, 5, 5],
        ...                    'value2': list('abbaxy')})
        >>> df
             id  value1 value2
        0  spam       1      a
        1   egg       5      b
        2   egg       5      b
        3  spam       2      a
        4   ham       5      x
        5   ham       5      y

        >>> df.groupby('id').nunique()
              value1  value2
        id
        egg        1       1
        ham        1       2
        spam       2       1

        Check for rows with the same id but conflicting values:

        >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
             id  value1 value2
        0  spam       1      a
        3  spam       2      a
        4   ham       5      x
        5   ham       5      y
        """

        if self.axis != 0:
            # see test_groupby_crash_on_nunique
            return self._python_agg_general(lambda sgb: sgb.nunique(dropna))

        obj = self._obj_with_exclusions
        results = self._apply_to_column_groupbys(
            lambda sgb: sgb.nunique(dropna), obj=obj
        )

        if not self.as_index:
            results.index = Index(range(len(results)))
            self._insert_inaxis_grouper_inplace(results)

        return results

    @Appender(DataFrame.idxmax.__doc__)
    def idxmax(self, axis=0, skipna: bool = True):
        axis = DataFrame._get_axis_number(axis)
        numeric_only = None if axis == 0 else False

        def func(df):
            # NB: here we use numeric_only=None, in DataFrame it is False GH#38217
            res = df._reduce(
                nanops.nanargmax,
                "argmax",
                axis=axis,
                skipna=skipna,
                numeric_only=numeric_only,
            )
            indices = res._values
            index = df._get_axis(axis)
            result = [index[i] if i >= 0 else np.nan for i in indices]
            return df._constructor_sliced(result, index=res.index)

        func.__name__ = "idxmax"
        return self._python_apply_general(func, self._obj_with_exclusions)

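    # Hedged usage sketch (assumed example): per group, idxmax returns the
    # index label of the first maximum of each column, e.g.
    #
    #     >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, 3, 2]})
    #     >>> df.groupby("A").idxmax()
    #        B
    #     A
    #     a  1
    #     b  2
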
    @Appender(DataFrame.idxmin.__doc__)
    def idxmin(self, axis=0, skipna: bool = True):
        axis = DataFrame._get_axis_number(axis)
        numeric_only = None if axis == 0 else False

        def func(df):
            # NB: here we use numeric_only=None, in DataFrame it is False GH#38217
            res = df._reduce(
                nanops.nanargmin,
                "argmin",
                axis=axis,
                skipna=skipna,
                numeric_only=numeric_only,
            )
            indices = res._values
            index = df._get_axis(axis)
            result = [index[i] if i >= 0 else np.nan for i in indices]
            return df._constructor_sliced(result, index=res.index)

        func.__name__ = "idxmin"
        return self._python_apply_general(func, self._obj_with_exclusions)

    boxplot = boxplot_frame_groupby

    def value_counts(
        self,
        subset: Sequence[Hashable] | None = None,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        dropna: bool = True,
    ) -> DataFrame | Series:
        """
        Return a Series or DataFrame containing counts of unique rows.

        .. versionadded:: 1.4.0

        Parameters
        ----------
        subset : list-like, optional
            Columns to use when counting unique combinations.
        normalize : bool, default False
            Return proportions rather than frequencies.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        dropna : bool, default True
            Don't include counts of rows that contain NA values.

        Returns
        -------
        Series or DataFrame
            Series if the groupby as_index is True, otherwise DataFrame.

        See Also
        --------
        Series.value_counts: Equivalent method on Series.
        DataFrame.value_counts: Equivalent method on DataFrame.
        SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy.

        Notes
        -----
        - If the groupby as_index is True then the returned Series will have a
          MultiIndex with one level per input column.
        - If the groupby as_index is False then the returned DataFrame will have an
          additional column with the value_counts. The column is labelled 'count' or
          'proportion', depending on the ``normalize`` parameter.

        By default, rows that contain any NA values are omitted from
        the result.

        By default, the result will be in descending order so that the
        first element of each group is the most frequently-occurring row.

        Examples
        --------
        >>> df = pd.DataFrame({
        ...    'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
        ...    'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
        ...    'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
        ... })

        >>> df
           gender education country
        0    male       low      US
        1    male    medium      FR
        2  female      high      US
        3    male       low      FR
        4  female      high      FR
        5    male       low      FR

        >>> df.groupby('gender').value_counts()
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        FR         2
                           US         1
                medium     FR         1
        dtype: int64

        >>> df.groupby('gender').value_counts(ascending=True)
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        US         1
                medium     FR         1
                low        FR         2
        dtype: int64

        >>> df.groupby('gender').value_counts(normalize=True)
        gender  education  country
        female  high       FR         0.50
                           US         0.50
        male    low        FR         0.50
                           US         0.25
                medium     FR         0.25
        dtype: float64

        >>> df.groupby('gender', as_index=False).value_counts()
           gender education country  count
        0  female      high      FR      1
        1  female      high      US      1
        2    male       low      FR      2
        3    male       low      US      1
        4    male    medium      FR      1

        >>> df.groupby('gender', as_index=False).value_counts(normalize=True)
           gender education country  proportion
        0  female      high      FR        0.50
        1  female      high      US        0.50
        2    male       low      FR        0.50
        3    male       low      US        0.25
        4    male    medium      FR        0.25
        """
        if self.axis == 1:
            raise NotImplementedError(
                "DataFrameGroupBy.value_counts only handles axis=0"
            )

        with self._group_selection_context():
            df = self.obj

            in_axis_names = {
                grouping.name for grouping in self.grouper.groupings if grouping.in_axis
            }
            if isinstance(self._selected_obj, Series):
                name = self._selected_obj.name
                keys = [] if name in in_axis_names else [self._selected_obj]
            else:
                unique_cols = set(self._selected_obj.columns)
                if subset is not None:
                    subsetted = set(subset)
                    clashing = subsetted & set(in_axis_names)
                    if clashing:
                        raise ValueError(
                            f"Keys {clashing} in subset cannot be in "
                            "the groupby column keys."
                        )
                    doesnt_exist = subsetted - unique_cols
                    if doesnt_exist:
                        raise ValueError(
                            f"Keys {doesnt_exist} in subset do not "
                            "exist in the DataFrame."
                        )
                else:
                    subsetted = unique_cols

                keys = [
                    # Can't use .values because the column label needs to be preserved
                    self._selected_obj.iloc[:, idx]
                    for idx, name in enumerate(self._selected_obj.columns)
                    if name not in in_axis_names and name in subsetted
                ]

            groupings = list(self.grouper.groupings)
            for key in keys:
                grouper, _, _ = get_grouper(
                    df,
                    key=key,
                    axis=self.axis,
                    sort=self.sort,
                    dropna=dropna,
                )
                groupings += list(grouper.groupings)

            # Take the size of the overall columns
            gb = df.groupby(
                groupings,
                sort=self.sort,
                observed=self.observed,
                dropna=self.dropna,
            )
            result = cast(Series, gb.size())

            if normalize:
                # Normalize the results by dividing by the original group sizes.
                # We are guaranteed to have the first N levels be the
                # user-requested grouping.
                levels = list(range(len(self.grouper.groupings), result.index.nlevels))
                indexed_group_size = result.groupby(
                    result.index.droplevel(levels),
                    sort=self.sort,
                    observed=self.observed,
                    dropna=self.dropna,
                ).transform("sum")

                result /= indexed_group_size

            if sort:
                # Sort the values and then resort by the main grouping
                index_level = range(len(self.grouper.groupings))
                result = result.sort_values(ascending=ascending).sort_index(
                    level=index_level, sort_remaining=False
                )

            if not self.as_index:
                # Convert to frame
                result = result.reset_index(name="proportion" if normalize else "count")
            return result.__finalize__(self.obj, method="value_counts")


def _wrap_transform_general_frame(
    obj: DataFrame, group: DataFrame, res: DataFrame | Series
) -> DataFrame:
    from pandas import concat

    if isinstance(res, Series):
        # we need to broadcast across the
        # other dimension; this will preserve dtypes
        # GH14457
        if res.index.is_(obj.index):
            res_frame = concat([res] * len(group.columns), axis=1)
            res_frame.columns = group.columns
            res_frame.index = group.index
        else:
            res_frame = obj._constructor(
                np.concatenate([res.values] * len(group.index)).reshape(group.shape),
                columns=group.columns,
                index=group.index,
            )
        assert isinstance(res_frame, DataFrame)
        return res_frame
    else:
        return res
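

# Hedged sketch of the broadcast above (assumed example): when a transform
# UDF reduces each group to a Series of per-column aggregates (so its index
# matches ``group.columns`` rather than ``obj.index``), the helper tiles that
# Series across the group's rows (GH14457):
#
#     res = group.mean()  # Series indexed by group.columns
#     res_frame = _wrap_transform_general_frame(obj, group, res)
#     # res_frame has group.shape; every row equals ``res``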