843 lines
		
	
	
		
			26 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			843 lines
		
	
	
		
			26 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from __future__ import annotations
 | 
						|
 | 
						|
from typing import (
 | 
						|
    TYPE_CHECKING,
 | 
						|
    Callable,
 | 
						|
    Hashable,
 | 
						|
    Sequence,
 | 
						|
    cast,
 | 
						|
)
 | 
						|
 | 
						|
import numpy as np
 | 
						|
 | 
						|
from pandas._typing import (
 | 
						|
    AggFuncType,
 | 
						|
    AggFuncTypeBase,
 | 
						|
    AggFuncTypeDict,
 | 
						|
    IndexLabel,
 | 
						|
)
 | 
						|
from pandas.util._decorators import (
 | 
						|
    Appender,
 | 
						|
    Substitution,
 | 
						|
)
 | 
						|
 | 
						|
from pandas.core.dtypes.cast import maybe_downcast_to_dtype
 | 
						|
from pandas.core.dtypes.common import (
 | 
						|
    is_integer_dtype,
 | 
						|
    is_list_like,
 | 
						|
    is_nested_list_like,
 | 
						|
    is_scalar,
 | 
						|
)
 | 
						|
from pandas.core.dtypes.generic import (
 | 
						|
    ABCDataFrame,
 | 
						|
    ABCSeries,
 | 
						|
)
 | 
						|
 | 
						|
import pandas.core.common as com
 | 
						|
from pandas.core.frame import _shared_docs
 | 
						|
from pandas.core.groupby import Grouper
 | 
						|
from pandas.core.indexes.api import (
 | 
						|
    Index,
 | 
						|
    MultiIndex,
 | 
						|
    get_objs_combined_axis,
 | 
						|
)
 | 
						|
from pandas.core.reshape.concat import concat
 | 
						|
from pandas.core.reshape.util import cartesian_product
 | 
						|
from pandas.core.series import Series
 | 
						|
 | 
						|
if TYPE_CHECKING:
 | 
						|
    from pandas import DataFrame
 | 
						|
 | 
						|
 | 
						|
# Note: We need to make sure `frame` is imported before `pivot`, otherwise
 | 
						|
# _shared_docs['pivot_table'] will not yet exist.  TODO: Fix this dependency
 | 
						|
@Substitution("\ndata : DataFrame")
 | 
						|
@Appender(_shared_docs["pivot_table"], indents=1)
 | 
						|
def pivot_table(
 | 
						|
    data: DataFrame,
 | 
						|
    values=None,
 | 
						|
    index=None,
 | 
						|
    columns=None,
 | 
						|
    aggfunc: AggFuncType = "mean",
 | 
						|
    fill_value=None,
 | 
						|
    margins: bool = False,
 | 
						|
    dropna: bool = True,
 | 
						|
    margins_name: str = "All",
 | 
						|
    observed: bool = False,
 | 
						|
    sort: bool = True,
 | 
						|
) -> DataFrame:
 | 
						|
    index = _convert_by(index)
 | 
						|
    columns = _convert_by(columns)
 | 
						|
 | 
						|
    if isinstance(aggfunc, list):
 | 
						|
        pieces: list[DataFrame] = []
 | 
						|
        keys = []
 | 
						|
        for func in aggfunc:
 | 
						|
            _table = __internal_pivot_table(
 | 
						|
                data,
 | 
						|
                values=values,
 | 
						|
                index=index,
 | 
						|
                columns=columns,
 | 
						|
                fill_value=fill_value,
 | 
						|
                aggfunc=func,
 | 
						|
                margins=margins,
 | 
						|
                dropna=dropna,
 | 
						|
                margins_name=margins_name,
 | 
						|
                observed=observed,
 | 
						|
                sort=sort,
 | 
						|
            )
 | 
						|
            pieces.append(_table)
 | 
						|
            keys.append(getattr(func, "__name__", func))
 | 
						|
 | 
						|
        table = concat(pieces, keys=keys, axis=1)
 | 
						|
        return table.__finalize__(data, method="pivot_table")
 | 
						|
 | 
						|
    table = __internal_pivot_table(
 | 
						|
        data,
 | 
						|
        values,
 | 
						|
        index,
 | 
						|
        columns,
 | 
						|
        aggfunc,
 | 
						|
        fill_value,
 | 
						|
        margins,
 | 
						|
        dropna,
 | 
						|
        margins_name,
 | 
						|
        observed,
 | 
						|
        sort,
 | 
						|
    )
 | 
						|
    return table.__finalize__(data, method="pivot_table")
 | 
						|
 | 
						|
 | 
						|
def __internal_pivot_table(
 | 
						|
    data: DataFrame,
 | 
						|
    values,
 | 
						|
    index,
 | 
						|
    columns,
 | 
						|
    aggfunc: AggFuncTypeBase | AggFuncTypeDict,
 | 
						|
    fill_value,
 | 
						|
    margins: bool,
 | 
						|
    dropna: bool,
 | 
						|
    margins_name: str,
 | 
						|
    observed: bool,
 | 
						|
    sort: bool,
 | 
						|
) -> DataFrame:
 | 
						|
    """
 | 
						|
    Helper of :func:`pandas.pivot_table` for any non-list ``aggfunc``.
 | 
						|
    """
 | 
						|
    keys = index + columns
 | 
						|
 | 
						|
    values_passed = values is not None
 | 
						|
    if values_passed:
 | 
						|
        if is_list_like(values):
 | 
						|
            values_multi = True
 | 
						|
            values = list(values)
 | 
						|
        else:
 | 
						|
            values_multi = False
 | 
						|
            values = [values]
 | 
						|
 | 
						|
        # GH14938 Make sure value labels are in data
 | 
						|
        for i in values:
 | 
						|
            if i not in data:
 | 
						|
                raise KeyError(i)
 | 
						|
 | 
						|
        to_filter = []
 | 
						|
        for x in keys + values:
 | 
						|
            if isinstance(x, Grouper):
 | 
						|
                x = x.key
 | 
						|
            try:
 | 
						|
                if x in data:
 | 
						|
                    to_filter.append(x)
 | 
						|
            except TypeError:
 | 
						|
                pass
 | 
						|
        if len(to_filter) < len(data.columns):
 | 
						|
            data = data[to_filter]
 | 
						|
 | 
						|
    else:
 | 
						|
        values = data.columns
 | 
						|
        for key in keys:
 | 
						|
            try:
 | 
						|
                values = values.drop(key)
 | 
						|
            except (TypeError, ValueError, KeyError):
 | 
						|
                pass
 | 
						|
        values = list(values)
 | 
						|
 | 
						|
    grouped = data.groupby(keys, observed=observed, sort=sort)
 | 
						|
    agged = grouped.agg(aggfunc)
 | 
						|
    if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
 | 
						|
        agged = agged.dropna(how="all")
 | 
						|
 | 
						|
        # gh-21133
 | 
						|
        # we want to down cast if
 | 
						|
        # the original values are ints
 | 
						|
        # as we grouped with a NaN value
 | 
						|
        # and then dropped, coercing to floats
 | 
						|
        for v in values:
 | 
						|
            if (
 | 
						|
                v in data
 | 
						|
                and is_integer_dtype(data[v])
 | 
						|
                and v in agged
 | 
						|
                and not is_integer_dtype(agged[v])
 | 
						|
            ):
 | 
						|
                if not isinstance(agged[v], ABCDataFrame):
 | 
						|
                    # exclude DataFrame case bc maybe_downcast_to_dtype expects
 | 
						|
                    #  ArrayLike
 | 
						|
                    # e.g. test_pivot_table_multiindex_columns_doctest_case
 | 
						|
                    #  agged.columns is a MultiIndex and 'v' is indexing only
 | 
						|
                    #  on its first level.
 | 
						|
                    agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype)
 | 
						|
 | 
						|
    table = agged
 | 
						|
 | 
						|
    # GH17038, this check should only happen if index is defined (not None)
 | 
						|
    if table.index.nlevels > 1 and index:
 | 
						|
        # Related GH #17123
 | 
						|
        # If index_names are integers, determine whether the integers refer
 | 
						|
        # to the level position or name.
 | 
						|
        index_names = agged.index.names[: len(index)]
 | 
						|
        to_unstack = []
 | 
						|
        for i in range(len(index), len(keys)):
 | 
						|
            name = agged.index.names[i]
 | 
						|
            if name is None or name in index_names:
 | 
						|
                to_unstack.append(i)
 | 
						|
            else:
 | 
						|
                to_unstack.append(name)
 | 
						|
        table = agged.unstack(to_unstack)
 | 
						|
 | 
						|
    if not dropna:
 | 
						|
        if isinstance(table.index, MultiIndex):
 | 
						|
            m = MultiIndex.from_arrays(
 | 
						|
                cartesian_product(table.index.levels), names=table.index.names
 | 
						|
            )
 | 
						|
            table = table.reindex(m, axis=0)
 | 
						|
 | 
						|
        if isinstance(table.columns, MultiIndex):
 | 
						|
            m = MultiIndex.from_arrays(
 | 
						|
                cartesian_product(table.columns.levels), names=table.columns.names
 | 
						|
            )
 | 
						|
            table = table.reindex(m, axis=1)
 | 
						|
 | 
						|
    if isinstance(table, ABCDataFrame):
 | 
						|
        table = table.sort_index(axis=1)
 | 
						|
 | 
						|
    if fill_value is not None:
 | 
						|
        table = table.fillna(fill_value, downcast="infer")
 | 
						|
 | 
						|
    if margins:
 | 
						|
        if dropna:
 | 
						|
            data = data[data.notna().all(axis=1)]
 | 
						|
        table = _add_margins(
 | 
						|
            table,
 | 
						|
            data,
 | 
						|
            values,
 | 
						|
            rows=index,
 | 
						|
            cols=columns,
 | 
						|
            aggfunc=aggfunc,
 | 
						|
            observed=dropna,
 | 
						|
            margins_name=margins_name,
 | 
						|
            fill_value=fill_value,
 | 
						|
        )
 | 
						|
 | 
						|
    # discard the top level
 | 
						|
    if values_passed and not values_multi and table.columns.nlevels > 1:
 | 
						|
        table = table.droplevel(0, axis=1)
 | 
						|
    if len(index) == 0 and len(columns) > 0:
 | 
						|
        table = table.T
 | 
						|
 | 
						|
    # GH 15193 Make sure empty columns are removed if dropna=True
 | 
						|
    if isinstance(table, ABCDataFrame) and dropna:
 | 
						|
        table = table.dropna(how="all", axis=1)
 | 
						|
 | 
						|
    return table
 | 
						|
 | 
						|
 | 
						|
def _add_margins(
 | 
						|
    table: DataFrame | Series,
 | 
						|
    data: DataFrame,
 | 
						|
    values,
 | 
						|
    rows,
 | 
						|
    cols,
 | 
						|
    aggfunc,
 | 
						|
    observed=None,
 | 
						|
    margins_name: str = "All",
 | 
						|
    fill_value=None,
 | 
						|
):
 | 
						|
    if not isinstance(margins_name, str):
 | 
						|
        raise ValueError("margins_name argument must be a string")
 | 
						|
 | 
						|
    msg = f'Conflicting name "{margins_name}" in margins'
 | 
						|
    for level in table.index.names:
 | 
						|
        if margins_name in table.index.get_level_values(level):
 | 
						|
            raise ValueError(msg)
 | 
						|
 | 
						|
    grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)
 | 
						|
 | 
						|
    if table.ndim == 2:
 | 
						|
        # i.e. DataFrame
 | 
						|
        for level in table.columns.names[1:]:
 | 
						|
            if margins_name in table.columns.get_level_values(level):
 | 
						|
                raise ValueError(msg)
 | 
						|
 | 
						|
    key: str | tuple[str, ...]
 | 
						|
    if len(rows) > 1:
 | 
						|
        key = (margins_name,) + ("",) * (len(rows) - 1)
 | 
						|
    else:
 | 
						|
        key = margins_name
 | 
						|
 | 
						|
    if not values and isinstance(table, ABCSeries):
 | 
						|
        # If there are no values and the table is a series, then there is only
 | 
						|
        # one column in the data. Compute grand margin and return it.
 | 
						|
        return table._append(Series({key: grand_margin[margins_name]}))
 | 
						|
 | 
						|
    elif values:
 | 
						|
        marginal_result_set = _generate_marginal_results(
 | 
						|
            table, data, values, rows, cols, aggfunc, observed, margins_name
 | 
						|
        )
 | 
						|
        if not isinstance(marginal_result_set, tuple):
 | 
						|
            return marginal_result_set
 | 
						|
        result, margin_keys, row_margin = marginal_result_set
 | 
						|
    else:
 | 
						|
        # no values, and table is a DataFrame
 | 
						|
        assert isinstance(table, ABCDataFrame)
 | 
						|
        marginal_result_set = _generate_marginal_results_without_values(
 | 
						|
            table, data, rows, cols, aggfunc, observed, margins_name
 | 
						|
        )
 | 
						|
        if not isinstance(marginal_result_set, tuple):
 | 
						|
            return marginal_result_set
 | 
						|
        result, margin_keys, row_margin = marginal_result_set
 | 
						|
 | 
						|
    row_margin = row_margin.reindex(result.columns, fill_value=fill_value)
 | 
						|
    # populate grand margin
 | 
						|
    for k in margin_keys:
 | 
						|
        if isinstance(k, str):
 | 
						|
            row_margin[k] = grand_margin[k]
 | 
						|
        else:
 | 
						|
            row_margin[k] = grand_margin[k[0]]
 | 
						|
 | 
						|
    from pandas import DataFrame
 | 
						|
 | 
						|
    margin_dummy = DataFrame(row_margin, columns=[key]).T
 | 
						|
 | 
						|
    row_names = result.index.names
 | 
						|
    # check the result column and leave floats
 | 
						|
    for dtype in set(result.dtypes):
 | 
						|
        cols = result.select_dtypes([dtype]).columns
 | 
						|
        margin_dummy[cols] = margin_dummy[cols].apply(
 | 
						|
            maybe_downcast_to_dtype, args=(dtype,)
 | 
						|
        )
 | 
						|
    result = result._append(margin_dummy)
 | 
						|
    result.index.names = row_names
 | 
						|
 | 
						|
    return result
 | 
						|
 | 
						|
 | 
						|
def _compute_grand_margin(data: DataFrame, values, aggfunc, margins_name: str = "All"):
 | 
						|
 | 
						|
    if values:
 | 
						|
        grand_margin = {}
 | 
						|
        for k, v in data[values].items():
 | 
						|
            try:
 | 
						|
                if isinstance(aggfunc, str):
 | 
						|
                    grand_margin[k] = getattr(v, aggfunc)()
 | 
						|
                elif isinstance(aggfunc, dict):
 | 
						|
                    if isinstance(aggfunc[k], str):
 | 
						|
                        grand_margin[k] = getattr(v, aggfunc[k])()
 | 
						|
                    else:
 | 
						|
                        grand_margin[k] = aggfunc[k](v)
 | 
						|
                else:
 | 
						|
                    grand_margin[k] = aggfunc(v)
 | 
						|
            except TypeError:
 | 
						|
                pass
 | 
						|
        return grand_margin
 | 
						|
    else:
 | 
						|
        return {margins_name: aggfunc(data.index)}
 | 
						|
 | 
						|
 | 
						|
def _generate_marginal_results(
 | 
						|
    table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All"
 | 
						|
):
 | 
						|
    if len(cols) > 0:
 | 
						|
        # need to "interleave" the margins
 | 
						|
        table_pieces = []
 | 
						|
        margin_keys = []
 | 
						|
 | 
						|
        def _all_key(key):
 | 
						|
            return (key, margins_name) + ("",) * (len(cols) - 1)
 | 
						|
 | 
						|
        if len(rows) > 0:
 | 
						|
            margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc)
 | 
						|
            cat_axis = 1
 | 
						|
 | 
						|
            for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed):
 | 
						|
                all_key = _all_key(key)
 | 
						|
 | 
						|
                # we are going to mutate this, so need to copy!
 | 
						|
                piece = piece.copy()
 | 
						|
                piece[all_key] = margin[key]
 | 
						|
 | 
						|
                table_pieces.append(piece)
 | 
						|
                margin_keys.append(all_key)
 | 
						|
        else:
 | 
						|
            from pandas import DataFrame
 | 
						|
 | 
						|
            cat_axis = 0
 | 
						|
            for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed):
 | 
						|
                if len(cols) > 1:
 | 
						|
                    all_key = _all_key(key)
 | 
						|
                else:
 | 
						|
                    all_key = margins_name
 | 
						|
                table_pieces.append(piece)
 | 
						|
                # GH31016 this is to calculate margin for each group, and assign
 | 
						|
                # corresponded key as index
 | 
						|
                transformed_piece = DataFrame(piece.apply(aggfunc)).T
 | 
						|
                transformed_piece.index = Index([all_key], name=piece.index.name)
 | 
						|
 | 
						|
                # append piece for margin into table_piece
 | 
						|
                table_pieces.append(transformed_piece)
 | 
						|
                margin_keys.append(all_key)
 | 
						|
 | 
						|
        result = concat(table_pieces, axis=cat_axis)
 | 
						|
 | 
						|
        if len(rows) == 0:
 | 
						|
            return result
 | 
						|
    else:
 | 
						|
        result = table
 | 
						|
        margin_keys = table.columns
 | 
						|
 | 
						|
    if len(cols) > 0:
 | 
						|
        row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc)
 | 
						|
        row_margin = row_margin.stack()
 | 
						|
 | 
						|
        # slight hack
 | 
						|
        new_order = [len(cols)] + list(range(len(cols)))
 | 
						|
        row_margin.index = row_margin.index.reorder_levels(new_order)
 | 
						|
    else:
 | 
						|
        row_margin = Series(np.nan, index=result.columns)
 | 
						|
 | 
						|
    return result, margin_keys, row_margin
 | 
						|
 | 
						|
 | 
						|
def _generate_marginal_results_without_values(
 | 
						|
    table: DataFrame, data, rows, cols, aggfunc, observed, margins_name: str = "All"
 | 
						|
):
 | 
						|
    if len(cols) > 0:
 | 
						|
        # need to "interleave" the margins
 | 
						|
        margin_keys: list | Index = []
 | 
						|
 | 
						|
        def _all_key():
 | 
						|
            if len(cols) == 1:
 | 
						|
                return margins_name
 | 
						|
            return (margins_name,) + ("",) * (len(cols) - 1)
 | 
						|
 | 
						|
        if len(rows) > 0:
 | 
						|
            margin = data[rows].groupby(rows, observed=observed).apply(aggfunc)
 | 
						|
            all_key = _all_key()
 | 
						|
            table[all_key] = margin
 | 
						|
            result = table
 | 
						|
            margin_keys.append(all_key)
 | 
						|
 | 
						|
        else:
 | 
						|
            margin = data.groupby(level=0, axis=0, observed=observed).apply(aggfunc)
 | 
						|
            all_key = _all_key()
 | 
						|
            table[all_key] = margin
 | 
						|
            result = table
 | 
						|
            margin_keys.append(all_key)
 | 
						|
            return result
 | 
						|
    else:
 | 
						|
        result = table
 | 
						|
        margin_keys = table.columns
 | 
						|
 | 
						|
    if len(cols):
 | 
						|
        row_margin = data[cols].groupby(cols, observed=observed).apply(aggfunc)
 | 
						|
    else:
 | 
						|
        row_margin = Series(np.nan, index=result.columns)
 | 
						|
 | 
						|
    return result, margin_keys, row_margin
 | 
						|
 | 
						|
 | 
						|
def _convert_by(by):
 | 
						|
    if by is None:
 | 
						|
        by = []
 | 
						|
    elif (
 | 
						|
        is_scalar(by)
 | 
						|
        or isinstance(by, (np.ndarray, Index, ABCSeries, Grouper))
 | 
						|
        or callable(by)
 | 
						|
    ):
 | 
						|
        by = [by]
 | 
						|
    else:
 | 
						|
        by = list(by)
 | 
						|
    return by
 | 
						|
 | 
						|
 | 
						|
@Substitution("\ndata : DataFrame")
 | 
						|
@Appender(_shared_docs["pivot"], indents=1)
 | 
						|
def pivot(
 | 
						|
    data: DataFrame,
 | 
						|
    index: IndexLabel | None = None,
 | 
						|
    columns: IndexLabel | None = None,
 | 
						|
    values: IndexLabel | None = None,
 | 
						|
) -> DataFrame:
 | 
						|
    if columns is None:
 | 
						|
        raise TypeError("pivot() missing 1 required argument: 'columns'")
 | 
						|
 | 
						|
    columns_listlike = com.convert_to_list_like(columns)
 | 
						|
 | 
						|
    if values is None:
 | 
						|
        if index is not None:
 | 
						|
            cols = com.convert_to_list_like(index)
 | 
						|
        else:
 | 
						|
            cols = []
 | 
						|
 | 
						|
        append = index is None
 | 
						|
        # error: Unsupported operand types for + ("List[Any]" and "ExtensionArray")
 | 
						|
        # error: Unsupported left operand type for + ("ExtensionArray")
 | 
						|
        indexed = data.set_index(
 | 
						|
            cols + columns_listlike, append=append  # type: ignore[operator]
 | 
						|
        )
 | 
						|
    else:
 | 
						|
        if index is None:
 | 
						|
            if isinstance(data.index, MultiIndex):
 | 
						|
                # GH 23955
 | 
						|
                index_list = [
 | 
						|
                    data.index.get_level_values(i) for i in range(data.index.nlevels)
 | 
						|
                ]
 | 
						|
            else:
 | 
						|
                index_list = [Series(data.index, name=data.index.name)]
 | 
						|
        else:
 | 
						|
            index_list = [data[idx] for idx in com.convert_to_list_like(index)]
 | 
						|
 | 
						|
        data_columns = [data[col] for col in columns_listlike]
 | 
						|
        index_list.extend(data_columns)
 | 
						|
        multiindex = MultiIndex.from_arrays(index_list)
 | 
						|
 | 
						|
        if is_list_like(values) and not isinstance(values, tuple):
 | 
						|
            # Exclude tuple because it is seen as a single column name
 | 
						|
            values = cast(Sequence[Hashable], values)
 | 
						|
            indexed = data._constructor(
 | 
						|
                data[values]._values, index=multiindex, columns=values
 | 
						|
            )
 | 
						|
        else:
 | 
						|
            indexed = data._constructor_sliced(data[values]._values, index=multiindex)
 | 
						|
    return indexed.unstack(columns_listlike)
 | 
						|
 | 
						|
 | 
						|
def crosstab(
 | 
						|
    index,
 | 
						|
    columns,
 | 
						|
    values=None,
 | 
						|
    rownames=None,
 | 
						|
    colnames=None,
 | 
						|
    aggfunc=None,
 | 
						|
    margins: bool = False,
 | 
						|
    margins_name: str = "All",
 | 
						|
    dropna: bool = True,
 | 
						|
    normalize=False,
 | 
						|
) -> DataFrame:
 | 
						|
    """
 | 
						|
    Compute a simple cross tabulation of two (or more) factors. By default
 | 
						|
    computes a frequency table of the factors unless an array of values and an
 | 
						|
    aggregation function are passed.
 | 
						|
 | 
						|
    Parameters
 | 
						|
    ----------
 | 
						|
    index : array-like, Series, or list of arrays/Series
 | 
						|
        Values to group by in the rows.
 | 
						|
    columns : array-like, Series, or list of arrays/Series
 | 
						|
        Values to group by in the columns.
 | 
						|
    values : array-like, optional
 | 
						|
        Array of values to aggregate according to the factors.
 | 
						|
        Requires `aggfunc` be specified.
 | 
						|
    rownames : sequence, default None
 | 
						|
        If passed, must match number of row arrays passed.
 | 
						|
    colnames : sequence, default None
 | 
						|
        If passed, must match number of column arrays passed.
 | 
						|
    aggfunc : function, optional
 | 
						|
        If specified, requires `values` be specified as well.
 | 
						|
    margins : bool, default False
 | 
						|
        Add row/column margins (subtotals).
 | 
						|
    margins_name : str, default 'All'
 | 
						|
        Name of the row/column that will contain the totals
 | 
						|
        when margins is True.
 | 
						|
    dropna : bool, default True
 | 
						|
        Do not include columns whose entries are all NaN.
 | 
						|
    normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False
 | 
						|
        Normalize by dividing all values by the sum of values.
 | 
						|
 | 
						|
        - If passed 'all' or `True`, will normalize over all values.
 | 
						|
        - If passed 'index' will normalize over each row.
 | 
						|
        - If passed 'columns' will normalize over each column.
 | 
						|
        - If margins is `True`, will also normalize margin values.
 | 
						|
 | 
						|
    Returns
 | 
						|
    -------
 | 
						|
    DataFrame
 | 
						|
        Cross tabulation of the data.
 | 
						|
 | 
						|
    See Also
 | 
						|
    --------
 | 
						|
    DataFrame.pivot : Reshape data based on column values.
 | 
						|
    pivot_table : Create a pivot table as a DataFrame.
 | 
						|
 | 
						|
    Notes
 | 
						|
    -----
 | 
						|
    Any Series passed will have their name attributes used unless row or column
 | 
						|
    names for the cross-tabulation are specified.
 | 
						|
 | 
						|
    Any input passed containing Categorical data will have **all** of its
 | 
						|
    categories included in the cross-tabulation, even if the actual data does
 | 
						|
    not contain any instances of a particular category.
 | 
						|
 | 
						|
    In the event that there aren't overlapping indexes an empty DataFrame will
 | 
						|
    be returned.
 | 
						|
 | 
						|
    Reference :ref:`the user guide <reshaping.crosstabulations>` for more examples.
 | 
						|
 | 
						|
    Examples
 | 
						|
    --------
 | 
						|
    >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
 | 
						|
    ...               "bar", "bar", "foo", "foo", "foo"], dtype=object)
 | 
						|
    >>> b = np.array(["one", "one", "one", "two", "one", "one",
 | 
						|
    ...               "one", "two", "two", "two", "one"], dtype=object)
 | 
						|
    >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
 | 
						|
    ...               "shiny", "dull", "shiny", "shiny", "shiny"],
 | 
						|
    ...              dtype=object)
 | 
						|
    >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
 | 
						|
    b   one        two
 | 
						|
    c   dull shiny dull shiny
 | 
						|
    a
 | 
						|
    bar    1     2    1     0
 | 
						|
    foo    2     2    1     2
 | 
						|
 | 
						|
    Here 'c' and 'f' are not represented in the data and will not be
 | 
						|
    shown in the output because dropna is True by default. Set
 | 
						|
    dropna=False to preserve categories with no data.
 | 
						|
 | 
						|
    >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
 | 
						|
    >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
 | 
						|
    >>> pd.crosstab(foo, bar)
 | 
						|
    col_0  d  e
 | 
						|
    row_0
 | 
						|
    a      1  0
 | 
						|
    b      0  1
 | 
						|
    >>> pd.crosstab(foo, bar, dropna=False)
 | 
						|
    col_0  d  e  f
 | 
						|
    row_0
 | 
						|
    a      1  0  0
 | 
						|
    b      0  1  0
 | 
						|
    c      0  0  0
 | 
						|
    """
 | 
						|
    if values is None and aggfunc is not None:
 | 
						|
        raise ValueError("aggfunc cannot be used without values.")
 | 
						|
 | 
						|
    if values is not None and aggfunc is None:
 | 
						|
        raise ValueError("values cannot be used without an aggfunc.")
 | 
						|
 | 
						|
    if not is_nested_list_like(index):
 | 
						|
        index = [index]
 | 
						|
    if not is_nested_list_like(columns):
 | 
						|
        columns = [columns]
 | 
						|
 | 
						|
    common_idx = None
 | 
						|
    pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))]
 | 
						|
    if pass_objs:
 | 
						|
        common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False)
 | 
						|
 | 
						|
    rownames = _get_names(index, rownames, prefix="row")
 | 
						|
    colnames = _get_names(columns, colnames, prefix="col")
 | 
						|
 | 
						|
    # duplicate names mapped to unique names for pivot op
 | 
						|
    (
 | 
						|
        rownames_mapper,
 | 
						|
        unique_rownames,
 | 
						|
        colnames_mapper,
 | 
						|
        unique_colnames,
 | 
						|
    ) = _build_names_mapper(rownames, colnames)
 | 
						|
 | 
						|
    from pandas import DataFrame
 | 
						|
 | 
						|
    data = {
 | 
						|
        **dict(zip(unique_rownames, index)),
 | 
						|
        **dict(zip(unique_colnames, columns)),
 | 
						|
    }
 | 
						|
    df = DataFrame(data, index=common_idx)
 | 
						|
 | 
						|
    if values is None:
 | 
						|
        df["__dummy__"] = 0
 | 
						|
        kwargs = {"aggfunc": len, "fill_value": 0}
 | 
						|
    else:
 | 
						|
        df["__dummy__"] = values
 | 
						|
        kwargs = {"aggfunc": aggfunc}
 | 
						|
 | 
						|
    table = df.pivot_table(
 | 
						|
        "__dummy__",
 | 
						|
        index=unique_rownames,
 | 
						|
        columns=unique_colnames,
 | 
						|
        margins=margins,
 | 
						|
        margins_name=margins_name,
 | 
						|
        dropna=dropna,
 | 
						|
        **kwargs,
 | 
						|
    )
 | 
						|
 | 
						|
    # Post-process
 | 
						|
    if normalize is not False:
 | 
						|
        table = _normalize(
 | 
						|
            table, normalize=normalize, margins=margins, margins_name=margins_name
 | 
						|
        )
 | 
						|
 | 
						|
    table = table.rename_axis(index=rownames_mapper, axis=0)
 | 
						|
    table = table.rename_axis(columns=colnames_mapper, axis=1)
 | 
						|
 | 
						|
    return table
 | 
						|
 | 
						|
 | 
						|
def _normalize(
 | 
						|
    table: DataFrame, normalize, margins: bool, margins_name="All"
 | 
						|
) -> DataFrame:
 | 
						|
 | 
						|
    if not isinstance(normalize, (bool, str)):
 | 
						|
        axis_subs = {0: "index", 1: "columns"}
 | 
						|
        try:
 | 
						|
            normalize = axis_subs[normalize]
 | 
						|
        except KeyError as err:
 | 
						|
            raise ValueError("Not a valid normalize argument") from err
 | 
						|
 | 
						|
    if margins is False:
 | 
						|
 | 
						|
        # Actual Normalizations
 | 
						|
        normalizers: dict[bool | str, Callable] = {
 | 
						|
            "all": lambda x: x / x.sum(axis=1).sum(axis=0),
 | 
						|
            "columns": lambda x: x / x.sum(),
 | 
						|
            "index": lambda x: x.div(x.sum(axis=1), axis=0),
 | 
						|
        }
 | 
						|
 | 
						|
        normalizers[True] = normalizers["all"]
 | 
						|
 | 
						|
        try:
 | 
						|
            f = normalizers[normalize]
 | 
						|
        except KeyError as err:
 | 
						|
            raise ValueError("Not a valid normalize argument") from err
 | 
						|
 | 
						|
        table = f(table)
 | 
						|
        table = table.fillna(0)
 | 
						|
 | 
						|
    elif margins is True:
 | 
						|
        # keep index and column of pivoted table
 | 
						|
        table_index = table.index
 | 
						|
        table_columns = table.columns
 | 
						|
        last_ind_or_col = table.iloc[-1, :].name
 | 
						|
 | 
						|
        # check if margin name is not in (for MI cases) and not equal to last
 | 
						|
        # index/column and save the column and index margin
 | 
						|
        if (margins_name not in last_ind_or_col) & (margins_name != last_ind_or_col):
 | 
						|
            raise ValueError(f"{margins_name} not in pivoted DataFrame")
 | 
						|
        column_margin = table.iloc[:-1, -1]
 | 
						|
        index_margin = table.iloc[-1, :-1]
 | 
						|
 | 
						|
        # keep the core table
 | 
						|
        table = table.iloc[:-1, :-1]
 | 
						|
 | 
						|
        # Normalize core
 | 
						|
        table = _normalize(table, normalize=normalize, margins=False)
 | 
						|
 | 
						|
        # Fix Margins
 | 
						|
        if normalize == "columns":
 | 
						|
            column_margin = column_margin / column_margin.sum()
 | 
						|
            table = concat([table, column_margin], axis=1)
 | 
						|
            table = table.fillna(0)
 | 
						|
            table.columns = table_columns
 | 
						|
 | 
						|
        elif normalize == "index":
 | 
						|
            index_margin = index_margin / index_margin.sum()
 | 
						|
            table = table._append(index_margin)
 | 
						|
            table = table.fillna(0)
 | 
						|
            table.index = table_index
 | 
						|
 | 
						|
        elif normalize == "all" or normalize is True:
 | 
						|
            column_margin = column_margin / column_margin.sum()
 | 
						|
            index_margin = index_margin / index_margin.sum()
 | 
						|
            index_margin.loc[margins_name] = 1
 | 
						|
            table = concat([table, column_margin], axis=1)
 | 
						|
            table = table._append(index_margin)
 | 
						|
 | 
						|
            table = table.fillna(0)
 | 
						|
            table.index = table_index
 | 
						|
            table.columns = table_columns
 | 
						|
 | 
						|
        else:
 | 
						|
            raise ValueError("Not a valid normalize argument")
 | 
						|
 | 
						|
    else:
 | 
						|
        raise ValueError("Not a valid margins argument")
 | 
						|
 | 
						|
    return table
 | 
						|
 | 
						|
 | 
						|
def _get_names(arrs, names, prefix: str = "row"):
 | 
						|
    if names is None:
 | 
						|
        names = []
 | 
						|
        for i, arr in enumerate(arrs):
 | 
						|
            if isinstance(arr, ABCSeries) and arr.name is not None:
 | 
						|
                names.append(arr.name)
 | 
						|
            else:
 | 
						|
                names.append(f"{prefix}_{i}")
 | 
						|
    else:
 | 
						|
        if len(names) != len(arrs):
 | 
						|
            raise AssertionError("arrays and names must have the same length")
 | 
						|
        if not isinstance(names, list):
 | 
						|
            names = list(names)
 | 
						|
 | 
						|
    return names
 | 
						|
 | 
						|
 | 
						|
def _build_names_mapper(
 | 
						|
    rownames: list[str], colnames: list[str]
 | 
						|
) -> tuple[dict[str, str], list[str], dict[str, str], list[str]]:
 | 
						|
    """
 | 
						|
    Given the names of a DataFrame's rows and columns, returns a set of unique row
 | 
						|
    and column names and mappers that convert to original names.
 | 
						|
 | 
						|
    A row or column name is replaced if it is duplicate among the rows of the inputs,
 | 
						|
    among the columns of the inputs or between the rows and the columns.
 | 
						|
 | 
						|
    Parameters
 | 
						|
    ----------
 | 
						|
    rownames: list[str]
 | 
						|
    colnames: list[str]
 | 
						|
 | 
						|
    Returns
 | 
						|
    -------
 | 
						|
    Tuple(Dict[str, str], List[str], Dict[str, str], List[str])
 | 
						|
 | 
						|
    rownames_mapper: dict[str, str]
 | 
						|
        a dictionary with new row names as keys and original rownames as values
 | 
						|
    unique_rownames: list[str]
 | 
						|
        a list of rownames with duplicate names replaced by dummy names
 | 
						|
    colnames_mapper: dict[str, str]
 | 
						|
        a dictionary with new column names as keys and original column names as values
 | 
						|
    unique_colnames: list[str]
 | 
						|
        a list of column names with duplicate names replaced by dummy names
 | 
						|
 | 
						|
    """
 | 
						|
 | 
						|
    def get_duplicates(names):
 | 
						|
        seen: set = set()
 | 
						|
        return {name for name in names if name not in seen}
 | 
						|
 | 
						|
    shared_names = set(rownames).intersection(set(colnames))
 | 
						|
    dup_names = get_duplicates(rownames) | get_duplicates(colnames) | shared_names
 | 
						|
 | 
						|
    rownames_mapper = {
 | 
						|
        f"row_{i}": name for i, name in enumerate(rownames) if name in dup_names
 | 
						|
    }
 | 
						|
    unique_rownames = [
 | 
						|
        f"row_{i}" if name in dup_names else name for i, name in enumerate(rownames)
 | 
						|
    ]
 | 
						|
 | 
						|
    colnames_mapper = {
 | 
						|
        f"col_{i}": name for i, name in enumerate(colnames) if name in dup_names
 | 
						|
    }
 | 
						|
    unique_colnames = [
 | 
						|
        f"col_{i}" if name in dup_names else name for i, name in enumerate(colnames)
 | 
						|
    ]
 | 
						|
 | 
						|
    return rownames_mapper, unique_rownames, colnames_mapper, unique_colnames
 |