from __future__ import annotations

import itertools
from typing import TYPE_CHECKING
import warnings

import numpy as np

import pandas._libs.reshape as libreshape
from pandas._libs.sparse import IntIndex
from pandas._typing import (
    Dtype,
    npt,
)
from pandas.errors import PerformanceWarning
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.common import (
    ensure_platform_int,
    is_1d_only_ea_dtype,
    is_extension_array_dtype,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    needs_i8_conversion,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import notna

import pandas.core.algorithms as algos
from pandas.core.arrays import SparseArray
from pandas.core.arrays.categorical import factorize_from_iterable
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
)
from pandas.core.indexes.frozen import FrozenList
from pandas.core.series import Series
from pandas.core.sorting import (
    compress_group_index,
    decons_obs_group_ids,
    get_compressed_ids,
    get_group_index,
    get_group_index_sorter,
)

if TYPE_CHECKING:
    from pandas.core.arrays import ExtensionArray


class _Unstacker:
    """
    Helper class to unstack data / pivot with multi-level index.

    Parameters
    ----------
    index : MultiIndex
    level : int or str, default last level
        Level to "unstack". Accepts a name for the level.
    fill_value : scalar, optional
        Default value to fill in missing values if subgroups do not have the
        same set of labels. By default, missing values will be replaced with
        the default fill value for that data type, NaN for float, NaT for
        datetimelike, etc. For integer types, by default data will be
        converted to float and missing values will be set to NaN.
    constructor : object
        Pandas ``DataFrame`` or subclass used to create unstacked
        response. If None, DataFrame will be used.

    Returns
    -------
    unstacked : DataFrame

    Examples
    --------
    >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
    ...                                    ('two', 'a'), ('two', 'b')])
    >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
    >>> s
    one  a    1
         b    2
    two  a    3
         b    4
    dtype: int64

    >>> s.unstack(level=-1)
         a  b
    one  1  2
    two  3  4

    >>> s.unstack(level=0)
       one  two
    a    1    3
    b    2    4
    """

    def __init__(self, index: MultiIndex, level=-1, constructor=None):

        if constructor is None:
            constructor = DataFrame
        self.constructor = constructor

        self.index = index.remove_unused_levels()

        self.level = self.index._get_level_number(level)

        # when index includes `nan`, need to lift levels/strides by 1
        self.lift = 1 if -1 in self.index.codes[self.level] else 0

        # Note: the "pop" below alters these in-place.
        self.new_index_levels = list(self.index.levels)
        self.new_index_names = list(self.index.names)

        self.removed_name = self.new_index_names.pop(self.level)
        self.removed_level = self.new_index_levels.pop(self.level)
        self.removed_level_full = index.levels[self.level]

        # Bug fix GH 20601
        # If the data frame is too big, the number of unique index combinations
        # can cause int32 overflow on Windows environments.
        # We want to check and raise an error before this happens.
        num_rows = np.max([index_level.size for index_level in self.new_index_levels])
        num_columns = self.removed_level.size

        # GH20601: This forces an overflow if the number of cells is too high.
        num_cells = num_rows * num_columns

        # GH 26314: Previous ValueError raised was too restrictive for many users.
        if num_cells > np.iinfo(np.int32).max:
            warnings.warn(
                f"The following operation may generate {num_cells} cells "
                f"in the resulting pandas object.",
                PerformanceWarning,
            )
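        # Worked example (illustrative): with ~70_000 rows remaining in the
        # index and ~70_000 unique values in the unstacked level, num_cells
        # is ~4.9e9, which exceeds np.iinfo(np.int32).max (2_147_483_647),
        # so the PerformanceWarning above is emitted.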

        self._make_selectors()

    @cache_readonly
    def _indexer_and_to_sort(
        self,
    ) -> tuple[
        npt.NDArray[np.intp],
        list[np.ndarray],  # each has _some_ signed integer dtype
    ]:
        v = self.level

        codes = list(self.index.codes)
        levs = list(self.index.levels)
        to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
        sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = get_group_index_sorter(comp_index, ngroups)
        return indexer, to_sort

    @cache_readonly
    def sorted_labels(self):
        indexer, to_sort = self._indexer_and_to_sort
        return [line.take(indexer) for line in to_sort]

    def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
        indexer, _ = self._indexer_and_to_sort

        sorted_values = algos.take_nd(values, indexer, axis=0)
        return sorted_values

    def _make_selectors(self):
        new_levels = self.new_index_levels

        # make the mask
        remaining_labels = self.sorted_labels[:-1]
        level_sizes = tuple(len(x) for x in new_levels)

        comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
        ngroups = len(obs_ids)

        comp_index = ensure_platform_int(comp_index)
        stride = self.index.levshape[self.level] + self.lift
        self.full_shape = ngroups, stride

        selector = self.sorted_labels[-1] + stride * comp_index + self.lift
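        # Worked example (illustrative): with full_shape == (2, 3) and
        # lift == 0, a row whose remaining-levels group is comp_index == 1
        # and whose unstacked-level code is 2 maps to flat position
        # 2 + 3 * 1 == 5 in the row-major (ngroups x stride) result.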
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        if mask.sum() < len(self.index):
            raise ValueError("Index contains duplicate entries, cannot reshape")

        self.group_index = comp_index
        self.mask = mask
        self.unique_groups = obs_ids
        self.compressor = comp_index.searchsorted(np.arange(ngroups))

    @cache_readonly
    def mask_all(self) -> bool:
        return bool(self.mask.all())

    @cache_readonly
    def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]:
        # We cache this for re-use in ExtensionBlock._unstack
        dummy_arr = np.arange(len(self.index), dtype=np.intp)
        new_values, mask = self.get_new_values(dummy_arr, fill_value=-1)
        return new_values, mask.any(0)
        # TODO: in all tests we have mask.any(0).all(); can we rely on that?

    def get_result(self, values, value_columns, fill_value):

        if values.ndim == 1:
            values = values[:, np.newaxis]

        if value_columns is None and values.shape[1] != 1:  # pragma: no cover
            raise ValueError("must pass column labels for multi-column data")

        values, _ = self.get_new_values(values, fill_value)
        columns = self.get_new_columns(value_columns)
        index = self.new_index

        return self.constructor(
            values, index=index, columns=columns, dtype=values.dtype
        )

    def get_new_values(self, values, fill_value=None):

        if values.ndim == 1:
            values = values[:, np.newaxis]

        sorted_values = self._make_sorted_values(values)

        # place the values
        length, width = self.full_shape
        stride = values.shape[1]
        result_width = width * stride
        result_shape = (length, result_width)
        mask = self.mask
        mask_all = self.mask_all

        # we can simply reshape if we don't have a mask
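        # Sketch of the fast path below: sorted_values has shape
        # (length * width, stride); reshaping to (length, width, stride) and
        # swapping the last two axes yields (length, stride, width), which
        # flattens to the (length, stride * width) result, grouping the
        # columns of each input value together without any per-cell take.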
        if mask_all and len(values):
            # TODO: Under what circumstances can we rely on sorted_values
            #  matching values? When that holds, we can slice instead
            #  of take (in particular for EAs)
            new_values = (
                sorted_values.reshape(length, width, stride)
                .swapaxes(1, 2)
                .reshape(result_shape)
            )
            new_mask = np.ones(result_shape, dtype=bool)
            return new_values, new_mask

        dtype = values.dtype

        # if our mask is all True, then we can use our existing dtype
        if mask_all:
            dtype = values.dtype
            new_values = np.empty(result_shape, dtype=dtype)
        else:
            if isinstance(dtype, ExtensionDtype):
                # GH#41875
                cls = dtype.construct_array_type()
                new_values = cls._empty(result_shape, dtype=dtype)
                new_values[:] = fill_value
            else:
                dtype, fill_value = maybe_promote(dtype, fill_value)
                new_values = np.empty(result_shape, dtype=dtype)
                new_values.fill(fill_value)

        name = dtype.name
        new_mask = np.zeros(result_shape, dtype=bool)

        # we need to convert to a basic dtype
        # and possibly coerce an input to our output dtype
        # e.g. ints -> floats
        if needs_i8_conversion(values.dtype):
            sorted_values = sorted_values.view("i8")
            new_values = new_values.view("i8")
        else:
            sorted_values = sorted_values.astype(name, copy=False)

        # fill in our values & mask
        libreshape.unstack(
            sorted_values,
            mask.view("u1"),
            stride,
            length,
            width,
            new_values,
            new_mask.view("u1"),
        )

        # reconstruct dtype if needed
        if needs_i8_conversion(values.dtype):
            # view as datetime64 so we can wrap in DatetimeArray and use
            #  DTA's view method
            new_values = new_values.view("M8[ns]")
            new_values = ensure_wrapped_if_datetimelike(new_values)
            new_values = new_values.view(values.dtype)

        return new_values, new_mask

    def get_new_columns(self, value_columns: Index | None):
        if value_columns is None:
            if self.lift == 0:
                return self.removed_level._rename(name=self.removed_name)

            lev = self.removed_level.insert(0, item=self.removed_level._na_value)
            return lev.rename(self.removed_name)

        stride = len(self.removed_level) + self.lift
        width = len(value_columns)
        propagator = np.repeat(np.arange(width), stride)
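        # Worked example (illustrative): with width == 2 value columns and
        # stride == 3 unstacked labels, propagator is [0, 0, 0, 1, 1, 1],
        # repeating each value-column code once per new sub-column, while
        # np.tile(repeater, width) below cycles the unstacked-level codes
        # as [0, 1, 2, 0, 1, 2].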

        new_levels: FrozenList | list[Index]

        if isinstance(value_columns, MultiIndex):
            new_levels = value_columns.levels + (self.removed_level_full,)
            new_names = value_columns.names + (self.removed_name,)

            new_codes = [lab.take(propagator) for lab in value_columns.codes]
        else:
            new_levels = [
                value_columns,
                self.removed_level_full,
            ]
            new_names = [value_columns.name, self.removed_name]
            new_codes = [propagator]

        repeater = self._repeater

        # The entire level is then just a repetition of the single chunk:
        new_codes.append(np.tile(repeater, width))
        return MultiIndex(
            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
        )

    @cache_readonly
    def _repeater(self) -> np.ndarray:
        # The two indices differ only if the unstacked level had unused items:
        if len(self.removed_level_full) != len(self.removed_level):
            # In this case, we remap the new codes to the original level:
            repeater = self.removed_level_full.get_indexer(self.removed_level)
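            # Worked example (illustrative): if the full level is
            # ['a', 'b', 'c'] but only ['a', 'c'] is observed, repeater is
            # [0, 2], so new codes point back at positions in the full level.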
            if self.lift:
                repeater = np.insert(repeater, 0, -1)
        else:
            # Otherwise, we just use each level item exactly once:
            stride = len(self.removed_level) + self.lift
            repeater = np.arange(stride) - self.lift

        return repeater

    @cache_readonly
    def new_index(self):
        # Does not depend on values or value_columns
        result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]

        # construct the new index
        if len(self.new_index_levels) == 1:
            level, level_codes = self.new_index_levels[0], result_codes[0]
            if (level_codes == -1).any():
                level = level.insert(len(level), level._na_value)
            return level.take(level_codes).rename(self.new_index_names[0])

        return MultiIndex(
            levels=self.new_index_levels,
            codes=result_codes,
            names=self.new_index_names,
            verify_integrity=False,
        )


def _unstack_multiple(data, clocs, fill_value=None):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    # GH 19966: if the MultiIndex has a tuple as a name, make sure it is
    # recognised as a whole rather than as a list of separate level names
    if clocs in index.names:
        clocs = [clocs]
    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    ccodes = [index.codes[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rcodes = [index.codes[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = tuple(len(x) for x in clevels)
    group_index = get_group_index(ccodes, shape, sort=False, xnull=False)

    comp_ids, obs_ids = compress_group_index(group_index, sort=False)
    recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False)
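    # Sketch of the bookkeeping above (illustrative): get_group_index fuses
    # the selected level codes into a single flat code per row (akin to
    # np.ravel_multi_index), compress_group_index relabels the observed flat
    # codes as 0..n-1, and decons_obs_group_ids recovers the per-level codes
    # so the original levels can be reattached to the unstacked columns.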

    if not rlocs:
        # Everything is in clocs, so the dummy df has a regular index
        dummy_index = Index(obs_ids, name="__placeholder__")
    else:
        dummy_index = MultiIndex(
            levels=rlevels + [obs_ids],
            codes=rcodes + [comp_ids],
            names=rnames + ["__placeholder__"],
            verify_integrity=False,
        )

    if isinstance(data, Series):
        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
        new_levels = clevels
        new_names = cnames
        new_codes = recons_codes
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val, fill_value=fill_value)
                clocs = [v if v < val else v - 1 for v in clocs]

            return result

        # GH#42579 deep=False to avoid consolidating
        dummy = data.copy(deep=False)
        dummy.index = dummy_index

        unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        assert isinstance(unstcols, MultiIndex)  # for mypy
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_codes = [unstcols.codes[0]]
        for rec in recons_codes:
            new_codes.append(rec.take(unstcols.codes[-1]))

    new_columns = MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked


def unstack(obj, level, fill_value=None):

    if isinstance(level, (tuple, list)):
        if len(level) != 1:
            # _unstack_multiple only handles MultiIndexes,
            # and isn't needed for a single level
            return _unstack_multiple(obj, level, fill_value=fill_value)
        else:
            level = level[0]

    # Prioritize integer interpretation (GH #21677):
    if not is_integer(level) and not level == "__placeholder__":
        level = obj.index._get_level_number(level)

    if isinstance(obj, DataFrame):
        if isinstance(obj.index, MultiIndex):
            return _unstack_frame(obj, level, fill_value=fill_value)
        else:
            return obj.T.stack(dropna=False)
    elif not isinstance(obj.index, MultiIndex):
        # GH 36113
        # Give nicer error messages when unstacking a Series whose
        # Index is not a MultiIndex.
        raise ValueError(
            f"index must be a MultiIndex to unstack, {type(obj.index)} was passed"
        )
    else:
        if is_1d_only_ea_dtype(obj.dtype):
            return _unstack_extension_series(obj, level, fill_value)
        unstacker = _Unstacker(
            obj.index, level=level, constructor=obj._constructor_expanddim
        )
        return unstacker.get_result(
            obj._values, value_columns=None, fill_value=fill_value
        )


def _unstack_frame(obj, level, fill_value=None):
    if not obj._can_fast_transpose:
        unstacker = _Unstacker(obj.index, level=level)
        mgr = obj._mgr.unstack(unstacker, fill_value=fill_value)
        return obj._constructor(mgr)
    else:
        unstacker = _Unstacker(obj.index, level=level, constructor=obj._constructor)
        return unstacker.get_result(
            obj._values, value_columns=obj.columns, fill_value=fill_value
        )


def _unstack_extension_series(series, level, fill_value):
    """
    Unstack an ExtensionArray-backed Series.

    The ExtensionDtype is preserved.

    Parameters
    ----------
    series : Series
        A Series with an ExtensionArray for values.
    level : Any
        The level name or number.
    fill_value : Any
        The user-level (not physical storage) fill value to use for
        missing values introduced by the reshape. Passed to
        ``series.values.take``.

    Returns
    -------
    DataFrame
        Each column of the DataFrame will have the same dtype as
        the input Series.
    """
    # Defer to the logic in ExtensionBlock._unstack
    df = series.to_frame()
    result = df.unstack(level=level, fill_value=fill_value)

    # equiv: result.droplevel(level=0, axis=1)
    #  but this avoids an extra copy
    result.columns = result.columns.droplevel(0)
    return result


def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index.

    Returns
    -------
    stacked : Series
    """

    def factorize(index):
        if index.is_unique:
            return index, np.arange(len(index))
        codes, categories = factorize_from_iterable(index)
        return categories, codes

    N, K = frame.shape

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(frame, level_num=level_num, dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_codes = [lab.repeat(K) for lab in frame.index.codes]

        clev, clab = factorize(frame.columns)
        new_levels.append(clev)
        new_codes.append(np.tile(clab, N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)
        new_index = MultiIndex(
            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
        )
    else:
        levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns)))
        codes = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(
            levels=levels,
            codes=codes,
            names=[frame.index.name, frame.columns.name],
            verify_integrity=False,
        )

    if not frame.empty and frame._is_homogeneous_type:
        # For homogeneous EAs, frame._values will coerce to object. So
        # we concatenate instead.
        dtypes = list(frame.dtypes._values)
        dtype = dtypes[0]

        if is_extension_array_dtype(dtype):
            arr = dtype.construct_array_type()
            new_values = arr._concat_same_type(
                [col._values for _, col in frame.items()]
            )
            new_values = _reorder_for_extension_array_stack(new_values, N, K)
        else:
            # homogeneous, non-EA
            new_values = frame._values.ravel()

    else:
        # non-homogeneous
        new_values = frame._values.ravel()

    if dropna:
        mask = notna(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return frame._constructor_sliced(new_values, index=new_index)


def stack_multiple(frame, level, dropna=True):
    # If all passed levels match up to column names, no
    # ambiguity about what to do
    if all(lev in frame.columns.names for lev in level):
        result = frame
        for lev in level:
            result = stack(result, lev, dropna=dropna)

    # Otherwise, level numbers may change as each successive level is stacked
    elif all(isinstance(lev, int) for lev in level):
        # As each stack is done, the level numbers decrease, so we need
        #  to account for that when level is a sequence of ints
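        # Worked example (illustrative): stacking levels [0, 2] of a
        # three-level column index: after level 0 is stacked, the old
        # level 2 has shifted down to position 1, so the pending entry
        # is decremented before the next pass.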
        result = frame
        # _get_level_number() checks level numbers are in range and converts
        # negative numbers to positive
        level = [frame.columns._get_level_number(lev) for lev in level]

        # Can't iterate directly through level as we might need to change
        # values as we go
        for index in range(len(level)):
            lev = level[index]
            result = stack(result, lev, dropna=dropna)
            # Decrement all level numbers greater than current, as these
            # have now shifted down by one
            updated_level = []
            for other in level:
                if other > lev:
                    updated_level.append(other - 1)
                else:
                    updated_level.append(other)
            level = updated_level

    else:
        raise ValueError(
            "level should contain all level names or all level "
            "numbers, not a mixture of the two."
        )

    return result


def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex:
    """Creates a MultiIndex from the first N-1 levels of this MultiIndex."""
    if len(columns.levels) <= 2:
        return columns.levels[0]._rename(name=columns.names[0])

    levs = [
        [lev[c] if c >= 0 else None for c in codes]
        for lev, codes in zip(columns.levels[:-1], columns.codes[:-1])
    ]

    # Remove duplicate tuples in the MultiIndex.
    tuples = zip(*levs)
    unique_tuples = (key for key, _ in itertools.groupby(tuples))
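    # Note: itertools.groupby only collapses *adjacent* duplicates, so this
    # relies on the columns having been sorted (see _stack_multi_columns,
    # which lexsorts before calling this helper).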
    new_levs = zip(*unique_tuples)

    # The dtype of each level must be explicitly set to avoid inferring the wrong type.
    # See GH-36991.
    return MultiIndex.from_arrays(
        [
            # Not all indices can accept None values.
            Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev
            for new_lev, lev in zip(new_levs, columns.levels)
        ],
        names=columns.names[:-1],
    )


def _stack_multi_columns(frame, level_num=-1, dropna=True):
    def _convert_level_number(level_num: int, columns):
        """
        Logic for converting the level number to something we can safely pass
        to swaplevel.

        If `level_num` matches a column name return the name from
        position `level_num`, otherwise return `level_num`.
        """
        if level_num in columns.names:
            return columns.names[level_num]

        return level_num

    this = frame.copy()

    # this makes life much simpler
    if level_num != frame.columns.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = this.columns
        for i in range(level_num, frame.columns.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = roll_columns

    if not this.columns._is_lexsorted():
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, this.columns)
        this = this.sort_index(level=level_to_sort, axis=1)

    new_columns = _stack_multi_column_index(this.columns)

    # time to ravel the values
    new_data = {}
    level_vals = this.columns.levels[-1]
    level_codes = sorted(set(this.columns.codes[-1]))
    level_vals_nan = level_vals.insert(len(level_vals), None)

    level_vals_used = np.take(level_vals_nan, level_codes)
    levsize = len(level_codes)
    drop_cols = []
    for key in new_columns:
        try:
            loc = this.columns.get_loc(key)
        except KeyError:
            drop_cols.append(key)
            continue

        # get_loc almost always returns a slice, but if the columns are
        # unsorted it can return a boolean indexer; can this be made more
        # efficient?
        if not isinstance(loc, slice):
            slice_len = len(loc)
        else:
            slice_len = loc.stop - loc.start

        if slice_len != levsize:
            chunk = this.loc[:, this.columns[loc]]
            chunk.columns = level_vals_nan.take(chunk.columns.codes[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            if frame._is_homogeneous_type and is_extension_array_dtype(
                frame.dtypes.iloc[0]
            ):
                # TODO(EA2D): won't need special case, can go through .values
                #  paths below (might change to ._values)
                dtype = this[this.columns[loc]].dtypes.iloc[0]
                subset = this[this.columns[loc]]

                value_slice = dtype.construct_array_type()._concat_same_type(
                    [x._values for _, x in subset.items()]
                )
                N, K = subset.shape
                idx = np.arange(N * K).reshape(K, N).T.ravel()
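                # Same row-major -> column-major reordering as
                # _reorder_for_extension_array_stack below; e.g. with
                # N == 2 rows and K == 3 columns, idx is [0, 2, 4, 1, 3, 5].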
                value_slice = value_slice.take(idx)

            elif frame._is_mixed_type:
                value_slice = this[this.columns[loc]].values
            else:
                value_slice = this.values[:, loc]

        if value_slice.ndim > 1:
            # i.e. not extension
            value_slice = value_slice.ravel()

        new_data[key] = value_slice

    if len(drop_cols) > 0:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_codes = [lab.repeat(levsize) for lab in this.index.codes]
    else:
        old_codes, old_levels = factorize_from_iterable(this.index)
        new_levels = [old_levels]
        new_codes = [old_codes.repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(level_vals)
    new_codes.append(np.tile(level_codes, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )

    result = frame._constructor(new_data, index=new_index, columns=new_columns)

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how="all")

    return result


def get_dummies(
    data,
    prefix=None,
    prefix_sep="_",
    dummy_na: bool = False,
    columns=None,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: Dtype | None = None,
) -> DataFrame:
    """
    Convert categorical variable into dummy/indicator variables.

    Parameters
    ----------
    data : array-like, Series, or DataFrame
        Data of which to get dummy indicators.
    prefix : str, list of str, or dict of str, default None
        String to prepend to the generated dummy column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : str, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs; if False, NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object` or `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True) or a regular NumPy array (False).
    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.
    dtype : dtype, default np.uint8
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    DataFrame
        Dummy-coded data.

    See Also
    --------
    Series.str.get_dummies : Convert Series to dummy codes.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.dummies>` for more examples.

    Examples
    --------
    >>> s = pd.Series(list('abca'))

    >>> pd.get_dummies(s)
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> s1 = ['a', 'b', np.nan]

    >>> pd.get_dummies(s1)
       a  b
    0  1  0
    1  0  1
    2  0  0

    >>> pd.get_dummies(s1, dummy_na=True)
       a  b  NaN
    0  1  0    0
    1  0  1    0
    2  0  0    1

    >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
    ...                    'C': [1, 2, 3]})

    >>> pd.get_dummies(df, prefix=['col1', 'col2'])
       C  col1_a  col1_b  col2_a  col2_b  col2_c
    0  1       1       0       0       1       0
    1  2       0       1       1       0       0
    2  3       1       0       0       0       1

    >>> pd.get_dummies(pd.Series(list('abcaa')))
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0
    4  1  0  0

    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
       b  c
    0  0  0
    1  1  0
    2  0  1
    3  0  0
    4  0  0

    >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
         a    b    c
    0  1.0  0.0  0.0
    1  0.0  1.0  0.0
    2  0.0  0.0  1.0
    """
    from pandas.core.reshape.concat import concat

    dtypes_to_encode = ["object", "category"]

    if isinstance(data, DataFrame):
        # determine columns being encoded
        if columns is None:
            data_to_encode = data.select_dtypes(include=dtypes_to_encode)
        elif not is_list_like(columns):
            raise TypeError("Input must be a list-like for parameter `columns`")
        else:
            data_to_encode = data[columns]

        # validate prefixes and separator to avoid silently dropping cols
        def check_len(item, name):

            if is_list_like(item):
                if not len(item) == data_to_encode.shape[1]:
                    len_msg = (
                        f"Length of '{name}' ({len(item)}) did not match the "
                        "length of the columns being encoded "
                        f"({data_to_encode.shape[1]})."
                    )
                    raise ValueError(len_msg)

        check_len(prefix, "prefix")
        check_len(prefix_sep, "prefix_sep")

        if isinstance(prefix, str):
            prefix = itertools.cycle([prefix])
        if isinstance(prefix, dict):
            prefix = [prefix[col] for col in data_to_encode.columns]

        if prefix is None:
            prefix = data_to_encode.columns

        # validate separators
        if isinstance(prefix_sep, str):
            prefix_sep = itertools.cycle([prefix_sep])
        elif isinstance(prefix_sep, dict):
            prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]

        with_dummies: list[DataFrame]
        if data_to_encode.shape == data.shape:
            # Encoding the entire df, do not prepend any dropped columns
            with_dummies = []
        elif columns is not None:
            # Encoding only cols specified in columns. Get all cols not in
            # columns to prepend to result.
            with_dummies = [data.drop(columns, axis=1)]
        else:
            # Encoding only object and category dtype columns. Get remaining
            # columns to prepend to result.
            with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]

        for (col, pre, sep) in zip(data_to_encode.items(), prefix, prefix_sep):
            # col is (column_name, column), use just column data here
            dummy = _get_dummies_1d(
                col[1],
                prefix=pre,
                prefix_sep=sep,
                dummy_na=dummy_na,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            )
            with_dummies.append(dummy)
        result = concat(with_dummies, axis=1)
    else:
        result = _get_dummies_1d(
            data,
            prefix,
            prefix_sep,
            dummy_na,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
        )
    return result


def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: Dtype | None = None,
) -> DataFrame:
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.dtype(np.uint8)
    # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
    # dtype[Any], Type[object]]"; expected "Type[Any]"
    dtype = np.dtype(dtype)  # type: ignore[arg-type]

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        index: Index | np.ndarray
        if isinstance(data, Series):
            index = data.index
        else:
            index = Index(range(len(data)))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = levels.insert(len(levels), np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])

    index: Index | None
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        fill_value: bool | float | int
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == np.dtype(bool):
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)
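        # Worked example (illustrative): codes == [0, 1, 0, 2] produces
        # sp_indices == [[0, 2], [1], [3]], i.e. for each dummy column the
        # row positions whose value is 1; everything else is the fill_value.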

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        return concat(sparse_series, axis=1, copy=False)

    else:
        # take on axis=1 + transpose to ensure ndarray layout is column-major
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=1).T
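        # Worked example (illustrative): with number_of_cols == 3 and
        # codes == [0, 1, 2, 0], selecting columns of the 3x3 identity and
        # transposing gives a 4x3 matrix whose rows are the one-hot vectors
        # [1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0].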

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)


def _reorder_for_extension_array_stack(
    arr: ExtensionArray, n_rows: int, n_columns: int
) -> ExtensionArray:
    """
    Re-orders the values when stacking multiple extension-arrays.

    The indirect stacking method used for EAs requires a follow-up
    take to get the order correct.

    Parameters
    ----------
    arr : ExtensionArray
    n_rows, n_columns : int
        The number of rows and columns in the original DataFrame.

    Returns
    -------
    taken : ExtensionArray
        The original `arr` with elements re-ordered appropriately.

    Examples
    --------
    >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
    >>> _reorder_for_extension_array_stack(arr, 2, 3)
    array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1')

    >>> _reorder_for_extension_array_stack(arr, 3, 2)
    array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1')
    """
    # final take to get the order correct.
    # idx is an indexer like
    # [c0r0, c1r0, c2r0, ...,
    #  c0r1, c1r1, c2r1, ...]
    idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
    return arr.take(idx)