647 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			647 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
"""
 | 
						|
Binning and quantile-based discretization functions (``cut``, ``qcut``) and related helpers
 | 
						|
"""
 | 
						|
from __future__ import annotations
 | 
						|
 | 
						|
from typing import (
 | 
						|
    Any,
 | 
						|
    Callable,
 | 
						|
    Literal,
 | 
						|
)
 | 
						|
 | 
						|
import numpy as np
 | 
						|
 | 
						|
from pandas._libs import (
 | 
						|
    Timedelta,
 | 
						|
    Timestamp,
 | 
						|
)
 | 
						|
from pandas._libs.lib import infer_dtype
 | 
						|
 | 
						|
from pandas.core.dtypes.common import (
 | 
						|
    DT64NS_DTYPE,
 | 
						|
    ensure_platform_int,
 | 
						|
    is_bool_dtype,
 | 
						|
    is_categorical_dtype,
 | 
						|
    is_datetime64_dtype,
 | 
						|
    is_datetime64tz_dtype,
 | 
						|
    is_datetime_or_timedelta_dtype,
 | 
						|
    is_extension_array_dtype,
 | 
						|
    is_integer,
 | 
						|
    is_list_like,
 | 
						|
    is_numeric_dtype,
 | 
						|
    is_scalar,
 | 
						|
    is_timedelta64_dtype,
 | 
						|
)
 | 
						|
from pandas.core.dtypes.generic import ABCSeries
 | 
						|
from pandas.core.dtypes.missing import isna
 | 
						|
 | 
						|
from pandas import (
 | 
						|
    Categorical,
 | 
						|
    Index,
 | 
						|
    IntervalIndex,
 | 
						|
    to_datetime,
 | 
						|
    to_timedelta,
 | 
						|
)
 | 
						|
import pandas.core.algorithms as algos
 | 
						|
import pandas.core.nanops as nanops
 | 
						|
 | 
						|
 | 
						|
def cut(
    x,
    bins,
    right: bool = True,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    include_lowest: bool = False,
    duplicates: str = "raise",
    ordered: bool = True,
):
    """
    Bin values into discrete intervals.

    Use `cut` when you need to segment and sort data values into bins. This
    function is also useful for going from a continuous variable to a
    categorical variable. For example, `cut` could convert ages to groups of
    age ranges. Supports binning into an equal number of bins, or a
    pre-specified array of bins.

    Parameters
    ----------
    x : array-like
        The input array to be binned. Must be 1-dimensional.
    bins : int, sequence of scalars, or IntervalIndex
        The criteria to bin by.

        * int : Defines the number of equal-width bins in the range of `x`. The
          range of `x` is extended by .1% on each side to include the minimum
          and maximum values of `x`.
        * sequence of scalars : Defines the bin edges allowing for non-uniform
          width. No extension of the range of `x` is done.
        * IntervalIndex : Defines the exact bins to be used. Note that
          IntervalIndex for `bins` must be non-overlapping.

    right : bool, default True
        Indicates whether `bins` includes the rightmost edge or not. If
        ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
        indicate (1,2], (2,3], (3,4]. This argument is ignored when
        `bins` is an IntervalIndex.
    labels : array or False, default None
        Specifies the labels for the returned bins. Must be the same length as
        the resulting bins. If False, returns only integer indicators of the
        bins. This affects the type of the output container (see below).
        This argument is ignored when `bins` is an IntervalIndex. If True,
        raises an error. When `ordered=False`, labels must be provided.
    retbins : bool, default False
        Whether to return the bins or not. Useful when bins is provided
        as a scalar.
    precision : int, default 3
        The precision at which to store and display the bins labels.
    include_lowest : bool, default False
        Whether the first interval should be left-inclusive or not.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.
    ordered : bool, default True
        Whether the labels are ordered or not. Applies to returned types
        Categorical and Series (with Categorical dtype). If True,
        the resulting categorical will be ordered. If False, the resulting
        categorical will be unordered (labels must be provided).

        .. versionadded:: 1.1.0

    Returns
    -------
    out : Categorical, Series, or ndarray
        An array-like object representing the respective bin for each value
        of `x`. The type depends on the value of `labels`.

        * None (default) : returns a Series for Series `x` or a
          Categorical for all other inputs. The values stored within
          are Interval dtype.

        * sequence of scalars : returns a Series for Series `x` or a
          Categorical for all other inputs. The values stored within
          are whatever the type in the sequence is.

        * False : returns an ndarray of integers.

    bins : numpy.ndarray or IntervalIndex.
        The computed or specified bins. Only returned when `retbins=True`.
        For scalar or sequence `bins`, this is an ndarray with the computed
        bins. If ``duplicates='drop'`` is set, non-unique bin edges are
        dropped from `bins`. For an IntervalIndex `bins`, this is equal to
        `bins`.

    Raises
    ------
    ValueError
        If a scalar `bins` is smaller than 1, if `x` is empty or contains
        infinity while `bins` is not iterable, if an IntervalIndex `bins`
        is overlapping, or if a sequence of `bins` is not monotonically
        increasing.

    See Also
    --------
    qcut : Discretize variable into equal-sized buckets based on rank
        or based on sample quantiles.
    Categorical : Array type for storing data that come from a
        fixed set of values.
    Series : One-dimensional array with axis labels (including time series).
    IntervalIndex : Immutable Index implementing an ordered, sliceable set.

    Notes
    -----
    Any NA values will be NA in the result. Out of bounds values will be NA in
    the resulting Series or Categorical object.

    Reference :ref:`the user guide <reshaping.tile.cut>` for more examples.

    Examples
    --------
    Discretize into three equal-sized bins.

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
    ... # doctest: +ELLIPSIS
    [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
    Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
    ... # doctest: +ELLIPSIS
    ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
    Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...
    array([0.994, 3.   , 5.   , 7.   ]))

    Discover the same bins, but assign them specific labels. Notice that
    the returned Categorical's categories are `labels` and it is ordered.

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]),
    ...        3, labels=["bad", "medium", "good"])
    ['bad', 'good', 'medium', 'medium', 'good', 'bad']
    Categories (3, object): ['bad' < 'medium' < 'good']

    ``ordered=False`` will result in unordered categories when labels are passed.
    This parameter can be used to allow non-unique labels:

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
    ...        labels=["B", "A", "B"], ordered=False)
    ['B', 'B', 'A', 'A', 'B', 'B']
    Categories (2, object): ['A', 'B']

    ``labels=False`` implies you just want the bins back.

    >>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
    array([0, 1, 1, 3])

    Passing a Series as an input returns a Series with categorical dtype:

    >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
    ...               index=['a', 'b', 'c', 'd', 'e'])
    >>> pd.cut(s, 3)
    ... # doctest: +ELLIPSIS
    a    (1.992, 4.667]
    b    (1.992, 4.667]
    c    (4.667, 7.333]
    d     (7.333, 10.0]
    e     (7.333, 10.0]
    dtype: category
    Categories (3, interval[float64, right]): [(1.992, 4.667] < (4.667, ...

    Passing a Series as an input returns a Series with mapping value.
    It is used to map numerically to intervals based on bins.

    >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
    ...               index=['a', 'b', 'c', 'd', 'e'])
    >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)
    ... # doctest: +ELLIPSIS
    (a    1.0
     b    2.0
     c    3.0
     d    4.0
     e    NaN
     dtype: float64,
     array([ 0,  2,  4,  6,  8, 10]))

    Use the ``duplicates='drop'`` option when the bin edges are not unique.

    >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,
    ...        right=False, duplicates='drop')
    ... # doctest: +ELLIPSIS
    (a    1.0
     b    2.0
     c    3.0
     d    3.0
     e    NaN
     dtype: float64,
     array([ 0,  2,  4,  6, 10]))

    Passing an IntervalIndex for `bins` results in those categories exactly.
    Notice that values not covered by the IntervalIndex are set to NaN. 0
    is to the left of the first bin (which is closed on the right), and 1.5
    falls between two bins.

    >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
    >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
    [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]]
    Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]]
    """
    # NOTE: this binning code is changed a bit from histogram for var(x) == 0

    # Keep the caller's object so a Series input can get its index back later.
    original = x
    x = _preprocess_for_cut(x)
    x, dtype = _coerce_to_type(x)

    if not np.iterable(bins):
        # Non-iterable `bins` is treated as a bin count; edges are computed
        # from the observed range of `x`.
        if is_scalar(bins) and bins < 1:
            raise ValueError("`bins` should be a positive integer.")

        try:  # for array-like
            sz = x.size
        except AttributeError:
            x = np.asarray(x)
            sz = x.size

        if sz == 0:
            raise ValueError("Cannot cut empty array")

        rng = (nanops.nanmin(x), nanops.nanmax(x))
        # Adding 0.0 forces float arithmetic for the edge adjustments below.
        mn, mx = (mi + 0.0 for mi in rng)

        if np.isinf(mn) or np.isinf(mx):
            # GH 24314
            raise ValueError(
                "cannot specify integer `bins` when input data contains infinity"
            )
        elif mn == mx:  # adjust end points before binning
            # Zero-width range: widen it by 0.1% of the value (or by an
            # absolute 0.001 around zero) so linspace yields distinct edges.
            mn -= 0.001 * abs(mn) if mn != 0 else 0.001
            mx += 0.001 * abs(mx) if mx != 0 else 0.001
            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
        else:  # adjust end points after binning
            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
            adj = (mx - mn) * 0.001  # 0.1% of the range
            # Nudge the open end outward so the extreme value lands in a bin.
            if right:
                bins[0] -= adj
            else:
                bins[-1] += adj

    elif isinstance(bins, IntervalIndex):
        if bins.is_overlapping:
            raise ValueError("Overlapping IntervalIndex is not accepted.")

    else:
        # Explicit sequence of edges: convert to an ndarray comparable with
        # the (possibly integer-coerced) data.
        if is_datetime64tz_dtype(bins):
            bins = np.asarray(bins, dtype=DT64NS_DTYPE)
        else:
            bins = np.asarray(bins)
        bins = _convert_bin_to_numeric_type(bins, dtype)

        # GH 26045: cast to float64 to avoid an overflow
        if (np.diff(bins.astype("float64")) < 0).any():
            raise ValueError("bins must increase monotonically.")

    fac, bins = _bins_to_cuts(
        x,
        bins,
        right=right,
        labels=labels,
        precision=precision,
        include_lowest=include_lowest,
        dtype=dtype,
        duplicates=duplicates,
        ordered=ordered,
    )

    return _postprocess_for_cut(fac, bins, retbins, dtype, original)
 | 
						|
 | 
						|
 | 
						|
def qcut(
    x,
    q,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    duplicates: str = "raise",
):
    """
    Quantile-based discretization function.

    Discretize variable into equal-sized buckets based on rank or based
    on sample quantiles. For example 1000 values for 10 quantiles would
    produce a Categorical object indicating quantile membership for each data point.

    Parameters
    ----------
    x : 1d ndarray or Series
    q : int or list-like of float
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
    labels : array or False, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins. If True, raises an error.
    retbins : bool, optional
        Whether to return the (bins, labels) or not. Can be useful if bins
        is given as a scalar.
    precision : int, optional
        The precision at which to store and display the bins labels.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.

    Returns
    -------
    out : Categorical or Series or array of integers if labels is False
        The return type (Categorical or Series) depends on the input: a Series
        of type category if input is a Series else Categorical. Bins are
        represented as categories when categorical data is returned.
    bins : ndarray of floats
        Returned only if `retbins` is True.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object.

    Examples
    --------
    >>> pd.qcut(range(5), 4)
    ... # doctest: +ELLIPSIS
    [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
    Categories (4, interval[float64, right]): [(-0.001, 1.0] < (1.0, 2.0] ...

    >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
    ... # doctest: +SKIP
    [good, good, medium, bad, bad]
    Categories (3, object): [good < medium < bad]

    >>> pd.qcut(range(5), 4, labels=False)
    array([0, 0, 1, 2, 3])
    """
    # Keep the caller's object so a Series input can get its index back later.
    original = x
    x = _preprocess_for_cut(x)
    x, dtype = _coerce_to_type(x)

    # An integer `q` means that many equal-probability quantiles; anything
    # else is used directly as the quantile levels.
    quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q

    # NaN entries are excluded from the quantile computation only; the full
    # `x` (NaNs included) is what gets assigned to bins below.
    x_np = np.asarray(x)
    x_np = x_np[~np.isnan(x_np)]
    bins = np.quantile(x_np, quantiles)

    fac, bins = _bins_to_cuts(
        x,
        bins,
        labels=labels,
        precision=precision,
        include_lowest=True,
        dtype=dtype,
        duplicates=duplicates,
    )

    return _postprocess_for_cut(fac, bins, retbins, dtype, original)
 | 
						|
 | 
						|
 | 
						|
def _bins_to_cuts(
    x,
    bins: np.ndarray,
    right: bool = True,
    labels=None,
    precision: int = 3,
    include_lowest: bool = False,
    dtype=None,
    duplicates: str = "raise",
    ordered: bool = True,
):
    """
    Assign each value of `x` to a bin and build the labelled result.

    Parameters
    ----------
    x : array-like
        Values to bin, as prepared by the callers in this module.
    bins : np.ndarray or IntervalIndex
        Bin edges, or the exact intervals to use.
    right : bool, default True
        Whether intervals are closed on the right.
    labels : list-like, False or None, default None
        Labels for the bins; None generates interval labels, False returns
        integer bin indicators instead.
    precision : int, default 3
        Precision forwarded to the label formatter when `labels` is None.
    include_lowest : bool, default False
        Whether values equal to the first edge belong to the first bin.
    dtype : dtype, optional
        Original datetime-like dtype of the data, forwarded to the label
        formatter.
    duplicates : {'raise', 'drop'}, default 'raise'
        What to do when bin edges are not unique.
    ordered : bool, default True
        Whether the resulting Categorical is ordered.

    Returns
    -------
    result : Categorical or ndarray
        Labelled bins, or integer bin indicators when `labels` is False
        (float with NaN if any value is missing or out of bounds).
    bins : np.ndarray or IntervalIndex
        The edges actually used (deduplicated when `duplicates='drop'`).

    Raises
    ------
    ValueError
        If `ordered` is False without labels, `duplicates` is not a valid
        option, edges are duplicated with `duplicates='raise'`, or `labels`
        is malformed (wrong type, wrong length, or non-unique while
        `ordered` is True).
    """
    if not ordered and labels is None:
        raise ValueError("'labels' must be provided if 'ordered = False'")

    if duplicates not in ["raise", "drop"]:
        raise ValueError(
            "invalid value for 'duplicates' parameter, valid options are: raise, drop"
        )

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        # `labels`, `right` and `ordered` are not consulted on this path.
        ids = bins.get_indexer(x)
        result = Categorical.from_codes(ids, categories=bins, ordered=True)
        return result, bins

    unique_bins = algos.unique(bins)
    # The duplicate check is skipped when there are exactly two edges.
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == "raise":
            raise ValueError(
                f"Bin edges must be unique: {repr(bins)}.\n"
                "You can drop duplicate edges by setting the 'duplicates' kwarg"
            )
        else:
            bins = unique_bins

    # Right-closed intervals need the left insertion point so that a value
    # equal to an edge falls into the bin ending at that edge.
    side: Literal["left", "right"] = "left" if right else "right"
    ids = ensure_platform_int(bins.searchsorted(x, side=side))

    if include_lowest:
        # Values equal to the first edge are forced into the first bin.
        ids[np.asarray(x) == bins[0]] = 1

    # Position 0 is below the first edge and len(bins) is above the last one;
    # both count as missing, as do NA inputs.
    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if not (labels is None or is_list_like(labels)):
            raise ValueError(
                "Bin labels must either be False, None or passed in as a "
                "list-like argument"
            )

        elif labels is None:
            labels = _format_labels(
                bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
            )
        elif ordered and len(set(labels)) != len(labels):
            raise ValueError(
                "labels must be unique if ordered=True; pass ordered=False "
                "for duplicate labels"
            )
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError(
                    "Bin labels must be one fewer than the number of bin edges"
                )
        if not is_categorical_dtype(labels):
            # Explicit categories keep the label order; with duplicate labels
            # the categories are left for Categorical to infer.
            labels = Categorical(
                labels,
                categories=labels if len(set(labels)) == len(labels) else None,
                ordered=ordered,
            )
        # TODO: handle mismatch between categorical label order and pandas.cut order.
        # Masked positions become 0 so that `ids - 1` is -1 for them.
        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)

    else:
        result = ids - 1
        if has_nas:
            # An integer array cannot hold NaN, so switch to float first.
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins
 | 
						|
 | 
						|
 | 
						|
def _coerce_to_type(x):
    """
    Convert datetime/timedelta, bool or nullable numeric data to plain numbers.

    If the passed data is of datetime/timedelta, bool or nullable int type,
    this method converts it to numeric so that the cut or qcut method can
    handle it.

    Parameters
    ----------
    x : array-like exposing a ``dtype`` attribute

    Returns
    -------
    x : array-like
        The converted data, or the input unchanged when no branch applies.
    dtype : dtype or None
        The original datetime-like dtype of the data (timezone-aware
        datetime, ``datetime64[ns]`` or ``timedelta64[ns]``), or None when
        the data was not datetime-like.
    """
    dtype = None

    if is_datetime64tz_dtype(x.dtype):
        dtype = x.dtype
    elif is_datetime64_dtype(x.dtype):
        x = to_datetime(x)
        dtype = np.dtype("datetime64[ns]")
    elif is_timedelta64_dtype(x.dtype):
        x = to_timedelta(x)
        dtype = np.dtype("timedelta64[ns]")
    elif is_bool_dtype(x.dtype):
        # GH 20303
        x = x.astype(np.int64)
    # To support cut and qcut for IntegerArray we convert to float dtype.
    # Will properly support in the future.
    # https://github.com/pandas-dev/pandas/pull/31290
    # https://github.com/pandas-dev/pandas/issues/31389
    elif is_extension_array_dtype(x.dtype) and is_numeric_dtype(x.dtype):
        x = x.to_numpy(dtype=np.float64, na_value=np.nan)

    if dtype is not None:
        # GH 19768: force NaT to NaN during integer conversion
        x = np.where(x.notna(), x.view(np.int64), np.nan)

    return x, dtype
 | 
						|
 | 
						|
 | 
						|
def _convert_bin_to_numeric_type(bins, dtype):
    """
    Convert datetime/timedelta bins to their int64 representation.

    If `dtype` is not datetime-like, `bins` is returned unchanged.

    Parameters
    ----------
    bins : list-like of bins
    dtype : dtype of data

    Returns
    -------
    bins : list-like
        The int64 view of the bins for datetime-like `dtype`, otherwise the
        input `bins`.

    Raises
    ------
    ValueError if bins are not of a compat dtype to dtype
    """
    bins_dtype = infer_dtype(bins, skipna=False)
    if is_timedelta64_dtype(dtype):
        if bins_dtype in ["timedelta", "timedelta64"]:
            bins = to_timedelta(bins).view(np.int64)
        else:
            raise ValueError("bins must be of timedelta64 dtype")
    elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
        if bins_dtype in ["datetime", "datetime64"]:
            bins = to_datetime(bins).view(np.int64)
        else:
            raise ValueError("bins must be of datetime64 dtype")

    return bins
 | 
						|
 | 
						|
 | 
						|
def _convert_bin_to_datelike_type(bins, dtype):
    """
    Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is
    datelike.

    Parameters
    ----------
    bins : list-like of bins
    dtype : dtype of data

    Returns
    -------
    bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is
           datelike
    """
    if is_datetime64tz_dtype(dtype):
        # The integer values are read as UTC, then shown in the data's zone.
        bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz)
    elif is_datetime_or_timedelta_dtype(dtype):
        bins = Index(bins.astype(np.int64), dtype=dtype)
    return bins
 | 
						|
 | 
						|
 | 
						|
def _format_labels(
    bins, precision: int, right: bool = True, include_lowest: bool = False, dtype=None
):
    """
    Build the IntervalIndex used as default labels for the given bin edges.

    Parameters
    ----------
    bins : array-like of bin edges
    precision : int
        Requested rounding precision for non-datetime-like edges.
    right : bool, default True
        Whether the intervals are closed on the right (else on the left).
    include_lowest : bool, default False
        Whether the first interval must contain its left edge.
    dtype : dtype, optional
        Original dtype of the data; selects how each edge is formatted.

    Returns
    -------
    IntervalIndex
        Intervals built from the formatted edges.
    """
    closed = "right" if right else "left"

    formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]

    # `formatter` turns a raw edge into its display value; `adjust` shifts
    # the first edge down by the smallest representable step.
    if is_datetime64tz_dtype(dtype):
        formatter = lambda x: Timestamp(x, tz=dtype.tz)
        adjust = lambda x: x - Timedelta("1ns")
    elif is_datetime64_dtype(dtype):
        formatter = Timestamp
        adjust = lambda x: x - Timedelta("1ns")
    elif is_timedelta64_dtype(dtype):
        formatter = Timedelta
        adjust = lambda x: x - Timedelta("1ns")
    else:
        precision = _infer_precision(precision, bins)
        formatter = lambda x: _round_frac(x, precision)
        adjust = lambda x: x - 10 ** (-precision)

    breaks = [formatter(b) for b in bins]
    if right and include_lowest:
        # adjust lhs of first interval by precision to account for being right closed
        breaks[0] = adjust(breaks[0])

    return IntervalIndex.from_breaks(breaks, closed=closed)
 | 
						|
 | 
						|
 | 
						|
def _preprocess_for_cut(x):
    """
    Validate the input for cut/qcut and make sure it is array-like.

    Objects without an ``ndim`` attribute are converted with ``np.asarray``;
    objects that already have one are returned as they are.

    Parameters
    ----------
    x : array-like

    Returns
    -------
    x : array-like
        The input itself, or its ndarray conversion.

    Raises
    ------
    ValueError
        If the (possibly converted) input is not 1-dimensional.
    """
    # Check that the passed array is a Pandas or Numpy object
    # We don't want to strip away a Pandas data-type here (e.g. datetimetz)
    ndim = getattr(x, "ndim", None)
    if ndim is None:
        x = np.asarray(x)
    if x.ndim != 1:
        raise ValueError("Input array must be 1 dimensional")

    return x
 | 
						|
 | 
						|
 | 
						|
def _postprocess_for_cut(fac, bins, retbins: bool, dtype, original):
    """
    Finish a cut/qcut result.

    When the originally passed data was a Series, the result is wrapped in
    the same kind of object with the original index and name. When `retbins`
    is True the bins are returned as well, converted back to a datetime-like
    type if `dtype` calls for it.

    Parameters
    ----------
    fac : array-like
        The binned result.
    bins : array-like
        The bin edges that were used.
    retbins : bool
        Whether to return the bins together with the result.
    dtype : dtype or None
        Original datetime-like dtype of the data, if any.
    original : object
        The data as passed in by the caller.

    Returns
    -------
    fac, or the tuple ``(fac, bins)`` when `retbins` is True.
    """
    if isinstance(original, ABCSeries):
        fac = original._constructor(fac, index=original.index, name=original.name)

    if not retbins:
        return fac

    bins = _convert_bin_to_datelike_type(bins, dtype)

    return fac, bins
 | 
						|
 | 
						|
 | 
						|
def _round_frac(x, precision: int):
    """
    Round the fractional part of the given number.

    Zero and non-finite values are returned unchanged. For numbers whose
    whole part is zero, `precision` counts digits after the leading zeros of
    the fraction; otherwise it counts decimal places.

    Parameters
    ----------
    x : scalar number
    precision : int

    Returns
    -------
    The rounded value, or `x` itself for zero and non-finite input.
    """
    if not np.isfinite(x) or x == 0:
        return x
    else:
        frac, whole = np.modf(x)
        if whole == 0:
            # Number of leading zeros after the decimal point, plus precision.
            digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision
        else:
            digits = precision
        return np.around(x, digits)
 | 
						|
 | 
						|
 | 
						|
def _infer_precision(base_precision: int, bins) -> int:
    """
    Infer an appropriate precision for _round_frac.

    Tries precisions from `base_precision` up to 19 and returns the first
    one at which the rounded bin edges are all distinct.

    Parameters
    ----------
    base_precision : int
        The precision to start from, and the fallback.
    bins : array of bin edges exposing ``size``

    Returns
    -------
    int
        The first precision that keeps the edges distinct, or
        `base_precision` if none below 20 does.
    """
    for precision in range(base_precision, 20):
        levels = [_round_frac(b, precision) for b in bins]
        if algos.unique(levels).size == bins.size:
            return precision
    return base_precision  # default
 |