Files
Feature-Extraction/dist/client/pandas/core/internals/blocks.py

2259 lines
72 KiB
Python

from __future__ import annotations
from functools import wraps
import re
from typing import (
TYPE_CHECKING,
Any,
Callable,
Iterable,
Sequence,
cast,
final,
)
import warnings
import numpy as np
from pandas._libs import (
Timestamp,
algos as libalgos,
internals as libinternals,
lib,
writers,
)
from pandas._libs.internals import BlockPlacement
from pandas._typing import (
ArrayLike,
DtypeObj,
F,
Shape,
npt,
)
from pandas.compat import np_version_under1p20
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg
from pandas.core.dtypes.cast import (
astype_array_safe,
can_hold_element,
find_common_type,
infer_dtype_from,
maybe_downcast_numeric,
maybe_downcast_to_dtype,
maybe_upcast,
soft_convert_objects,
)
from pandas.core.dtypes.common import (
ensure_platform_int,
is_1d_only_ea_dtype,
is_1d_only_ea_obj,
is_dtype_equal,
is_extension_array_dtype,
is_interval_dtype,
is_list_like,
is_string_dtype,
)
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
ExtensionDtype,
PandasDtype,
PeriodDtype,
)
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCIndex,
ABCPandasArray,
ABCSeries,
)
from pandas.core.dtypes.inference import is_inferred_bool_dtype
from pandas.core.dtypes.missing import (
is_valid_na_for_dtype,
isna,
na_value_for_dtype,
)
import pandas.core.algorithms as algos
from pandas.core.array_algos.putmask import (
extract_bool_array,
putmask_inplace,
putmask_smart,
putmask_without_repeat,
setitem_datetimelike_compat,
validate_putmask,
)
from pandas.core.array_algos.quantile import quantile_compat
from pandas.core.array_algos.replace import (
compare_or_regex_search,
replace_regex,
should_use_regex,
)
from pandas.core.array_algos.take import take_nd
from pandas.core.array_algos.transforms import shift
from pandas.core.arrays import (
Categorical,
DatetimeArray,
ExtensionArray,
IntervalArray,
PandasArray,
PeriodArray,
TimedeltaArray,
)
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.base import PandasObject
import pandas.core.common as com
import pandas.core.computation.expressions as expressions
from pandas.core.construction import (
ensure_wrapped_if_datetimelike,
extract_array,
)
from pandas.core.indexers import (
check_setitem_lengths,
is_empty_indexer,
is_scalar_indexer,
)
import pandas.core.missing as missing
if TYPE_CHECKING:
from pandas import (
Float64Index,
Index,
)
# comparison is faster than is_object_dtype
_dtype_obj = np.dtype("object")
def maybe_split(meth: F) -> F:
"""
If we have a multi-column block, split and operate block-wise. Otherwise
use the original method.
"""
@wraps(meth)
def newfunc(self, *args, **kwargs) -> list[Block]:
if self.ndim == 1 or self.shape[0] == 1:
return meth(self, *args, **kwargs)
else:
# Split and operate column-by-column
return self.split_and_operate(meth, *args, **kwargs)
return cast(F, newfunc)
class Block(PandasObject):
"""
Canonical n-dimensional unit of homogeneous dtype contained in a pandas
data structure
Index-ignorant; let the container take care of that
"""
values: np.ndarray | ExtensionArray
ndim: int
__init__: Callable
__slots__ = ()
is_numeric = False
is_object = False
is_extension = False
_can_consolidate = True
_validate_ndim = True
@final
@cache_readonly
def _consolidate_key(self):
return self._can_consolidate, self.dtype.name
@property
def is_view(self) -> bool:
"""return a boolean if I am possibly a view"""
values = self.values
values = cast(np.ndarray, values)
return values.base is not None
@final
@cache_readonly
def _can_hold_na(self) -> bool:
"""
Can we store NA values in this Block?
"""
dtype = self.dtype
if isinstance(dtype, np.dtype):
return dtype.kind not in ["b", "i", "u"]
return dtype._can_hold_na
@final
@cache_readonly
def is_categorical(self) -> bool:
warnings.warn(
"Block.is_categorical is deprecated and will be removed in a "
"future version. Use isinstance(block.values, Categorical) "
"instead. See https://github.com/pandas-dev/pandas/issues/40226",
DeprecationWarning,
stacklevel=find_stack_level(),
)
return isinstance(self.values, Categorical)
@final
@property
def is_bool(self) -> bool:
"""
We can be bool if a) we are bool dtype or b) object dtype with bool objects.
"""
return is_inferred_bool_dtype(self.values)
@final
def external_values(self):
return external_values(self.values)
@property
def array_values(self) -> ExtensionArray:
"""
The array that Series.array returns. Always an ExtensionArray.
"""
# error: Argument 1 to "PandasArray" has incompatible type "Union[ndarray,
# ExtensionArray]"; expected "Union[ndarray, PandasArray]"
return PandasArray(self.values) # type: ignore[arg-type]
def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
"""
return an internal format, currently just the ndarray
this is often overridden to handle to_dense like operations
"""
if dtype == _dtype_obj:
return self.values.astype(_dtype_obj)
# error: Incompatible return value type (got "Union[ndarray, ExtensionArray]",
# expected "ndarray")
return self.values # type: ignore[return-value]
def values_for_json(self) -> np.ndarray:
# Incompatible return value type (got "Union[ndarray[Any, Any],
# ExtensionArray]", expected "ndarray[Any, Any]")
return self.values # type: ignore[return-value]
@final
@cache_readonly
def fill_value(self):
# Used in reindex_indexer
return na_value_for_dtype(self.dtype, compat=False)
@property
def mgr_locs(self) -> BlockPlacement:
return self._mgr_locs
@mgr_locs.setter
def mgr_locs(self, new_mgr_locs: BlockPlacement):
self._mgr_locs = new_mgr_locs
@final
def make_block(self, values, placement=None) -> Block:
"""
Create a new block, with type inference propagate any values that are
not specified
"""
if placement is None:
placement = self._mgr_locs
if self.is_extension:
values = ensure_block_shape(values, ndim=self.ndim)
# TODO: perf by not going through new_block
# We assume maybe_coerce_values has already been called
return new_block(values, placement=placement, ndim=self.ndim)
@final
def make_block_same_class(
self, values, placement: BlockPlacement | None = None
) -> Block:
"""Wrap given values in a block of same type as self."""
if placement is None:
placement = self._mgr_locs
if values.dtype.kind in ["m", "M"]:
new_values = ensure_wrapped_if_datetimelike(values)
if new_values is not values:
# TODO(2.0): remove once fastparquet has stopped relying on it
warnings.warn(
"In a future version, Block.make_block_same_class will "
"assume that datetime64 and timedelta64 ndarrays have "
"already been cast to DatetimeArray and TimedeltaArray, "
"respectively.",
DeprecationWarning,
stacklevel=find_stack_level(),
)
values = new_values
# We assume maybe_coerce_values has already been called
return type(self)(values, placement=placement, ndim=self.ndim)
@final
def __repr__(self) -> str:
# don't want to print out all of the items here
name = type(self).__name__
if self.ndim == 1:
result = f"{name}: {len(self)} dtype: {self.dtype}"
else:
shape = " x ".join([str(s) for s in self.shape])
result = f"{name}: {self.mgr_locs.indexer}, {shape}, dtype: {self.dtype}"
return result
@final
def __len__(self) -> int:
return len(self.values)
def _slice(self, slicer) -> ArrayLike:
"""return a slice of my values"""
return self.values[slicer]
@final
def getitem_block(self, slicer: slice | npt.NDArray[np.intp]) -> Block:
"""
Perform __getitem__-like, return result as block.
Only supports slices that preserve dimensionality.
"""
axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer
new_mgr_locs = self._mgr_locs[axis0_slicer]
new_values = self._slice(slicer)
if new_values.ndim != self.values.ndim:
raise ValueError("Only same dim slicing is allowed")
return type(self)(new_values, new_mgr_locs, self.ndim)
@final
def getitem_block_columns(
self, slicer: slice, new_mgr_locs: BlockPlacement
) -> Block:
"""
Perform __getitem__-like, return result as block.
Only supports slices that preserve dimensionality.
"""
new_values = self._slice(slicer)
if new_values.ndim != self.values.ndim:
raise ValueError("Only same dim slicing is allowed")
return type(self)(new_values, new_mgr_locs, self.ndim)
# NB: this cannot be made cache_readonly because in libreduction we pin
# new .values that can have different shape GH#42631
@property
def shape(self) -> Shape:
return self.values.shape
@cache_readonly
def dtype(self) -> DtypeObj:
return self.values.dtype
def iget(self, i: int | tuple[int, int] | tuple[slice, int]):
# In the case where we have a tuple[slice, int], the slice will always
# be slice(None)
# Note: only reached with self.ndim == 2
# Invalid index type "Union[int, Tuple[int, int], Tuple[slice, int]]"
# for "Union[ndarray[Any, Any], ExtensionArray]"; expected type
# "Union[int, integer[Any]]"
return self.values[i] # type: ignore[index]
def set_inplace(self, locs, values) -> None:
"""
Modify block values in-place with new item value.
Notes
-----
`set` never creates a new array or new Block, whereas `setitem` _may_
create a new array and always creates a new Block.
"""
self.values[locs] = values
def delete(self, loc) -> None:
"""
Delete given loc(-s) from block in-place.
"""
# Argument 1 to "delete" has incompatible type "Union[ndarray[Any, Any],
# ExtensionArray]"; expected "Union[_SupportsArray[dtype[Any]],
# Sequence[_SupportsArray[dtype[Any]]], Sequence[Sequence
# [_SupportsArray[dtype[Any]]]], Sequence[Sequence[Sequence[
# _SupportsArray[dtype[Any]]]]], Sequence[Sequence[Sequence[Sequence[
# _SupportsArray[dtype[Any]]]]]]]" [arg-type]
self.values = np.delete(self.values, loc, 0) # type: ignore[arg-type]
self.mgr_locs = self._mgr_locs.delete(loc)
try:
self._cache.clear()
except AttributeError:
# _cache not yet initialized
pass
@final
def apply(self, func, **kwargs) -> list[Block]:
"""
apply the function to my values; return a block if we are not
one
"""
result = func(self.values, **kwargs)
return self._split_op_result(result)
def reduce(self, func, ignore_failures: bool = False) -> list[Block]:
# We will apply the function and reshape the result into a single-row
# Block with the same mgr_locs; squeezing will be done at a higher level
assert self.ndim == 2
try:
result = func(self.values)
except (TypeError, NotImplementedError):
if ignore_failures:
return []
raise
if self.values.ndim == 1:
# TODO(EA2D): special case not needed with 2D EAs
res_values = np.array([[result]])
else:
res_values = result.reshape(-1, 1)
nb = self.make_block(res_values)
return [nb]
@final
def _split_op_result(self, result: ArrayLike) -> list[Block]:
# See also: split_and_operate
if result.ndim > 1 and isinstance(result.dtype, ExtensionDtype):
# TODO(EA2D): unnecessary with 2D EAs
# if we get a 2D ExtensionArray, we need to split it into 1D pieces
nbs = []
for i, loc in enumerate(self._mgr_locs):
if not is_1d_only_ea_obj(result):
vals = result[i : i + 1]
else:
vals = result[i]
block = self.make_block(values=vals, placement=loc)
nbs.append(block)
return nbs
nb = self.make_block(result)
return [nb]
def fillna(
self, value, limit=None, inplace: bool = False, downcast=None
) -> list[Block]:
"""
fillna on the block with the value. If we fail, then convert to
ObjectBlock and try again
"""
inplace = validate_bool_kwarg(inplace, "inplace")
mask = isna(self.values)
mask, noop = validate_putmask(self.values, mask)
if limit is not None:
limit = libalgos.validate_limit(None, limit=limit)
mask[mask.cumsum(self.ndim - 1) > limit] = False
if not self._can_hold_na:
if inplace:
return [self]
else:
return [self.copy()]
if self._can_hold_element(value):
nb = self if inplace else self.copy()
putmask_inplace(nb.values, mask, value)
return nb._maybe_downcast([nb], downcast)
if noop:
# we can't process the value, but nothing to do
return [self] if inplace else [self.copy()]
elif self.ndim == 1 or self.shape[0] == 1:
blk = self.coerce_to_target_dtype(value)
# bc we have already cast, inplace=True may avoid an extra copy
return blk.fillna(value, limit=limit, inplace=True, downcast=None)
else:
# operate column-by-column
return self.split_and_operate(
type(self).fillna, value, limit=limit, inplace=inplace, downcast=None
)
@final
def _split(self) -> list[Block]:
"""
Split a block into a list of single-column blocks.
"""
assert self.ndim == 2
new_blocks = []
for i, ref_loc in enumerate(self._mgr_locs):
vals = self.values[slice(i, i + 1)]
bp = BlockPlacement(ref_loc)
nb = type(self)(vals, placement=bp, ndim=2)
new_blocks.append(nb)
return new_blocks
@final
def split_and_operate(self, func, *args, **kwargs) -> list[Block]:
"""
Split the block and apply func column-by-column.
Parameters
----------
func : Block method
*args
**kwargs
Returns
-------
List[Block]
"""
assert self.ndim == 2 and self.shape[0] != 1
res_blocks = []
for nb in self._split():
rbs = func(nb, *args, **kwargs)
res_blocks.extend(rbs)
return res_blocks
@final
def _maybe_downcast(self, blocks: list[Block], downcast=None) -> list[Block]:
if downcast is False:
return blocks
if self.dtype == _dtype_obj:
# GH#44241 We downcast regardless of the argument;
# respecting 'downcast=None' may be worthwhile at some point,
# but ATM it breaks too much existing code.
# split and convert the blocks
return extend_blocks(
[blk.convert(datetime=True, numeric=False) for blk in blocks]
)
if downcast is None:
return blocks
return extend_blocks([b._downcast_2d(downcast) for b in blocks])
@final
@maybe_split
def _downcast_2d(self, dtype) -> list[Block]:
"""
downcast specialized to 2D case post-validation.
Refactored to allow use of maybe_split.
"""
new_values = maybe_downcast_to_dtype(self.values, dtype=dtype)
return [self.make_block(new_values)]
@final
def astype(self, dtype: DtypeObj, copy: bool = False, errors: str = "raise"):
"""
Coerce to the new dtype.
Parameters
----------
dtype : np.dtype or ExtensionDtype
copy : bool, default False
copy if indicated
errors : str, {'raise', 'ignore'}, default 'raise'
- ``raise`` : allow exceptions to be raised
- ``ignore`` : suppress exceptions. On error return original object
Returns
-------
Block
"""
values = self.values
new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
new_values = maybe_coerce_values(new_values)
newb = self.make_block(new_values)
if newb.shape != self.shape:
raise TypeError(
f"cannot set astype for copy = [{copy}] for dtype "
f"({self.dtype.name} [{self.shape}]) to different shape "
f"({newb.dtype.name} [{newb.shape}])"
)
return newb
def convert(
self,
copy: bool = True,
datetime: bool = True,
numeric: bool = True,
timedelta: bool = True,
) -> list[Block]:
"""
attempt to coerce any object types to better types return a copy
of the block (if copy = True) by definition we are not an ObjectBlock
here!
"""
return [self.copy()] if copy else [self]
@final
def _can_hold_element(self, element: Any) -> bool:
"""require the same dtype as ourselves"""
element = extract_array(element, extract_numpy=True)
return can_hold_element(self.values, element)
@final
def should_store(self, value: ArrayLike) -> bool:
"""
Should we set self.values[indexer] = value inplace or do we need to cast?
Parameters
----------
value : np.ndarray or ExtensionArray
Returns
-------
bool
"""
# faster equivalent to is_dtype_equal(value.dtype, self.dtype)
try:
return value.dtype == self.dtype
except TypeError:
return False
@final
def to_native_types(self, na_rep="nan", quoting=None, **kwargs):
"""convert to our native types format"""
result = to_native_types(self.values, na_rep=na_rep, quoting=quoting, **kwargs)
return self.make_block(result)
# block actions #
@final
def copy(self, deep: bool = True):
"""copy constructor"""
values = self.values
if deep:
values = values.copy()
return type(self)(values, placement=self._mgr_locs, ndim=self.ndim)
# ---------------------------------------------------------------------
# Replace
@final
def replace(
self,
to_replace,
value,
inplace: bool = False,
# mask may be pre-computed if we're called from replace_list
mask: npt.NDArray[np.bool_] | None = None,
) -> list[Block]:
"""
replace the to_replace value with value, possible to create new
blocks here this is just a call to putmask.
"""
# Note: the checks we do in NDFrame.replace ensure we never get
# here with listlike to_replace or value, as those cases
# go through replace_list
values = self.values
if isinstance(values, Categorical):
# TODO: avoid special-casing
blk = self if inplace else self.copy()
blk.values._replace(to_replace=to_replace, value=value, inplace=True)
return [blk]
if not self._can_hold_element(to_replace):
# We cannot hold `to_replace`, so we know immediately that
# replacing it is a no-op.
# Note: If to_replace were a list, NDFrame.replace would call
# replace_list instead of replace.
return [self] if inplace else [self.copy()]
if mask is None:
mask = missing.mask_missing(values, to_replace)
if not mask.any():
# Note: we get here with test_replace_extension_other incorrectly
# bc _can_hold_element is incorrect.
return [self] if inplace else [self.copy()]
elif self._can_hold_element(value):
blk = self if inplace else self.copy()
putmask_inplace(blk.values, mask, value)
if not (self.is_object and value is None):
# if the user *explicitly* gave None, we keep None, otherwise
# may downcast to NaN
blocks = blk.convert(numeric=False, copy=False)
else:
blocks = [blk]
return blocks
elif self.ndim == 1 or self.shape[0] == 1:
blk = self.coerce_to_target_dtype(value)
return blk.replace(
to_replace=to_replace,
value=value,
inplace=True,
mask=mask,
)
else:
# split so that we only upcast where necessary
blocks = []
for i, nb in enumerate(self._split()):
blocks.extend(
type(self).replace(
nb,
to_replace=to_replace,
value=value,
inplace=True,
mask=mask[i : i + 1],
)
)
return blocks
@final
def _replace_regex(
self,
to_replace,
value,
inplace: bool = False,
convert: bool = True,
mask=None,
) -> list[Block]:
"""
Replace elements by the given value.
Parameters
----------
to_replace : object or pattern
Scalar to replace or regular expression to match.
value : object
Replacement object.
inplace : bool, default False
Perform inplace modification.
convert : bool, default True
If true, try to coerce any object types to better types.
mask : array-like of bool, optional
True indicate corresponding element is ignored.
Returns
-------
List[Block]
"""
if not self._can_hold_element(to_replace):
# i.e. only ObjectBlock, but could in principle include a
# String ExtensionBlock
return [self] if inplace else [self.copy()]
rx = re.compile(to_replace)
new_values = self.values if inplace else self.values.copy()
replace_regex(new_values, rx, value, mask)
block = self.make_block(new_values)
return block.convert(numeric=False, copy=False)
@final
def replace_list(
self,
src_list: Iterable[Any],
dest_list: Sequence[Any],
inplace: bool = False,
regex: bool = False,
) -> list[Block]:
"""
See BlockManager.replace_list docstring.
"""
values = self.values
# Exclude anything that we know we won't contain
pairs = [
(x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)
]
if not len(pairs):
# shortcut, nothing to replace
return [self] if inplace else [self.copy()]
src_len = len(pairs) - 1
if is_string_dtype(values.dtype):
# Calculate the mask once, prior to the call of comp
# in order to avoid repeating the same computations
mask = ~isna(values)
masks = [
compare_or_regex_search(values, s[0], regex=regex, mask=mask)
for s in pairs
]
else:
# GH#38086 faster if we know we dont need to check for regex
masks = [missing.mask_missing(values, s[0]) for s in pairs]
# error: Argument 1 to "extract_bool_array" has incompatible type
# "Union[ExtensionArray, ndarray, bool]"; expected "Union[ExtensionArray,
# ndarray]"
masks = [extract_bool_array(x) for x in masks] # type: ignore[arg-type]
rb = [self if inplace else self.copy()]
for i, (src, dest) in enumerate(pairs):
convert = i == src_len # only convert once at the end
new_rb: list[Block] = []
# GH-39338: _replace_coerce can split a block into
# single-column blocks, so track the index so we know
# where to index into the mask
for blk_num, blk in enumerate(rb):
if len(rb) == 1:
m = masks[i]
else:
mib = masks[i]
assert not isinstance(mib, bool)
m = mib[blk_num : blk_num + 1]
result = blk._replace_coerce(
to_replace=src,
value=dest,
mask=m,
inplace=inplace,
regex=regex,
)
if convert and blk.is_object and not all(x is None for x in dest_list):
# GH#44498 avoid unwanted cast-back
result = extend_blocks(
[b.convert(numeric=False, copy=True) for b in result]
)
new_rb.extend(result)
rb = new_rb
return rb
@final
def _replace_coerce(
self,
to_replace,
value,
mask: np.ndarray,
inplace: bool = True,
regex: bool = False,
) -> list[Block]:
"""
Replace value corresponding to the given boolean array with another
value.
Parameters
----------
to_replace : object or pattern
Scalar to replace or regular expression to match.
value : object
Replacement object.
mask : np.ndarray[bool]
True indicate corresponding element is ignored.
inplace : bool, default True
Perform inplace modification.
regex : bool, default False
If true, perform regular expression substitution.
Returns
-------
List[Block]
"""
if should_use_regex(regex, to_replace):
return self._replace_regex(
to_replace,
value,
inplace=inplace,
convert=False,
mask=mask,
)
else:
if value is None:
# gh-45601, gh-45836, gh-46634
if mask.any():
nb = self.astype(np.dtype(object), copy=False)
if nb is self and not inplace:
nb = nb.copy()
putmask_inplace(nb.values, mask, value)
return [nb]
return [self] if inplace else [self.copy()]
return self.replace(
to_replace=to_replace, value=value, inplace=inplace, mask=mask
)
# ---------------------------------------------------------------------
def _maybe_squeeze_arg(self, arg: np.ndarray) -> np.ndarray:
"""
For compatibility with 1D-only ExtensionArrays.
"""
return arg
def setitem(self, indexer, value):
"""
Attempt self.values[indexer] = value, possibly creating a new array.
Parameters
----------
indexer : tuple, list-like, array-like, slice, int
The subset of self.values to set
value : object
The value being set
Returns
-------
Block
Notes
-----
`indexer` is a direct slice/positional indexer. `value` must
be a compatible shape.
"""
transpose = self.ndim == 2
if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
raise ValueError(f"Cannot set values with ndim > {self.ndim}")
# coerce None values, if appropriate
if value is None:
if self.is_numeric:
value = np.nan
# coerce if block dtype can store value
values = cast(np.ndarray, self.values)
if not self._can_hold_element(value):
# current dtype cannot store value, coerce to common dtype
return self.coerce_to_target_dtype(value).setitem(indexer, value)
# value must be storable at this moment
if is_extension_array_dtype(getattr(value, "dtype", None)):
# We need to be careful not to allow through strings that
# can be parsed to EADtypes
arr_value = value
else:
arr_value = np.asarray(value)
if transpose:
values = values.T
# length checking
check_setitem_lengths(indexer, value, values)
if is_empty_indexer(indexer, arr_value):
# GH#8669 empty indexers, test_loc_setitem_boolean_mask_allfalse
pass
elif is_scalar_indexer(indexer, self.ndim):
# setting a single element for each dim and with a rhs that could
# be e.g. a list; see GH#6043
values[indexer] = value
else:
value = setitem_datetimelike_compat(values, len(values[indexer]), value)
values[indexer] = value
return self
def putmask(self, mask, new) -> list[Block]:
"""
putmask the data to the block; it is possible that we may create a
new dtype of block
Return the resulting block(s).
Parameters
----------
mask : np.ndarray[bool], SparseArray[bool], or BooleanArray
new : a ndarray/object
Returns
-------
List[Block]
"""
orig_mask = mask
values = cast(np.ndarray, self.values)
mask, noop = validate_putmask(values.T, mask)
assert not isinstance(new, (ABCIndex, ABCSeries, ABCDataFrame))
if new is lib.no_default:
new = self.fill_value
# if we are passed a scalar None, convert it here
if not self.is_object and is_valid_na_for_dtype(new, self.dtype):
new = self.fill_value
if self._can_hold_element(new):
putmask_without_repeat(values.T, mask, new)
return [self]
elif np_version_under1p20 and infer_dtype_from(new)[0].kind in ["m", "M"]:
# using putmask with object dtype will incorrectly cast to object
# Having excluded self._can_hold_element, we know we cannot operate
# in-place, so we are safe using `where`
return self.where(new, ~mask)
elif noop:
return [self]
elif self.ndim == 1 or self.shape[0] == 1:
# no need to split columns
if not is_list_like(new):
# putmask_smart can't save us the need to cast
return self.coerce_to_target_dtype(new).putmask(mask, new)
# This differs from
# `self.coerce_to_target_dtype(new).putmask(mask, new)`
# because putmask_smart will check if new[mask] may be held
# by our dtype.
nv = putmask_smart(values.T, mask, new).T
return [self.make_block(nv)]
else:
is_array = isinstance(new, np.ndarray)
res_blocks = []
nbs = self._split()
for i, nb in enumerate(nbs):
n = new
if is_array:
# we have a different value per-column
n = new[:, i : i + 1]
submask = orig_mask[:, i : i + 1]
rbs = nb.putmask(submask, n)
res_blocks.extend(rbs)
return res_blocks
@final
def coerce_to_target_dtype(self, other) -> Block:
"""
coerce the current block to a dtype compat for other
we will return a block, possibly object, and not raise
we can also safely try to coerce to the same dtype
and will receive the same block
"""
# if we cannot then coerce to object
dtype, _ = infer_dtype_from(other, pandas_dtype=True)
new_dtype = find_common_type([self.dtype, dtype])
return self.astype(new_dtype, copy=False)
def interpolate(
self,
method: str = "pad",
axis: int = 0,
index: Index | None = None,
inplace: bool = False,
limit: int | None = None,
limit_direction: str = "forward",
limit_area: str | None = None,
fill_value: Any | None = None,
coerce: bool = False,
downcast: str | None = None,
**kwargs,
) -> list[Block]:
inplace = validate_bool_kwarg(inplace, "inplace")
if not self._can_hold_na:
# If there are no NAs, then interpolate is a no-op
return [self] if inplace else [self.copy()]
if self.is_object and self.ndim == 2 and self.shape[0] != 1 and axis == 0:
# split improves performance in ndarray.copy()
return self.split_and_operate(
type(self).interpolate,
method,
axis,
index,
inplace,
limit,
limit_direction,
limit_area,
fill_value,
coerce,
downcast,
**kwargs,
)
try:
m = missing.clean_fill_method(method)
except ValueError:
m = None
if m is None and self.dtype.kind != "f":
# only deal with floats
# bc we already checked that can_hold_na, we dont have int dtype here
# TODO: make a copy if not inplace?
return [self]
data = self.values if inplace else self.values.copy()
data = cast(np.ndarray, data) # bc overridden by ExtensionBlock
missing.interpolate_array_2d(
data,
method=method,
axis=axis,
index=index,
limit=limit,
limit_direction=limit_direction,
limit_area=limit_area,
fill_value=fill_value,
**kwargs,
)
nb = self.make_block_same_class(data)
return nb._maybe_downcast([nb], downcast)
def take_nd(
self,
indexer,
axis: int,
new_mgr_locs: BlockPlacement | None = None,
fill_value=lib.no_default,
) -> Block:
"""
Take values according to indexer and return them as a block.bb
"""
# algos.take_nd dispatches for DatetimeTZBlock, CategoricalBlock
# so need to preserve types
# sparse is treated like an ndarray, but needs .get_values() shaping
values = self.values
if fill_value is lib.no_default:
fill_value = self.fill_value
allow_fill = False
else:
allow_fill = True
new_values = algos.take_nd(
values, indexer, axis=axis, allow_fill=allow_fill, fill_value=fill_value
)
# Called from three places in managers, all of which satisfy
# this assertion
assert not (axis == 0 and new_mgr_locs is None)
if new_mgr_locs is None:
new_mgr_locs = self._mgr_locs
if not is_dtype_equal(new_values.dtype, self.dtype):
return self.make_block(new_values, new_mgr_locs)
else:
return self.make_block_same_class(new_values, new_mgr_locs)
def diff(self, n: int, axis: int = 1) -> list[Block]:
"""return block for the diff of the values"""
new_values = algos.diff(self.values, n, axis=axis)
return [self.make_block(values=new_values)]
def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> list[Block]:
"""shift the block by periods, possibly upcast"""
# convert integer to float if necessary. need to do a lot more than
# that, handle boolean etc also
values = cast(np.ndarray, self.values)
new_values, fill_value = maybe_upcast(values, fill_value)
new_values = shift(new_values, periods, axis, fill_value)
return [self.make_block(new_values)]
def where(self, other, cond) -> list[Block]:
"""
evaluate the block; return result block(s) from the result
Parameters
----------
other : a ndarray/object
cond : np.ndarray[bool], SparseArray[bool], or BooleanArray
Returns
-------
List[Block]
"""
assert cond.ndim == self.ndim
assert not isinstance(other, (ABCIndex, ABCSeries, ABCDataFrame))
transpose = self.ndim == 2
# EABlocks override where
values = cast(np.ndarray, self.values)
orig_other = other
if transpose:
values = values.T
icond, noop = validate_putmask(values, ~cond)
if noop:
# GH-39595: Always return a copy; short-circuit up/downcasting
return self.copy()
if other is lib.no_default:
other = self.fill_value
if is_valid_na_for_dtype(other, self.dtype) and self.dtype != _dtype_obj:
other = self.fill_value
if not self._can_hold_element(other):
# we cannot coerce, return a compat dtype
block = self.coerce_to_target_dtype(other)
blocks = block.where(orig_other, cond)
return self._maybe_downcast(blocks, "infer")
else:
alt = setitem_datetimelike_compat(values, icond.sum(), other)
if alt is not other:
if is_list_like(other) and len(other) < len(values):
# call np.where with other to get the appropriate ValueError
np.where(~icond, values, other)
raise NotImplementedError(
"This should not be reached; call to np.where above is "
"expected to raise ValueError. Please report a bug at "
"github.com/pandas-dev/pandas"
)
result = values.copy()
np.putmask(result, icond, alt)
else:
# By the time we get here, we should have all Series/Index
# args extracted to ndarray
if (
is_list_like(other)
and not isinstance(other, np.ndarray)
and len(other) == self.shape[-1]
):
# If we don't do this broadcasting here, then expressions.where
# will broadcast a 1D other to be row-like instead of
# column-like.
other = np.array(other).reshape(values.shape)
# If lengths don't match (or len(other)==1), we will raise
# inside expressions.where, see test_series_where
# Note: expressions.where may upcast.
result = expressions.where(~icond, values, other)
if self._can_hold_na or self.ndim == 1:
if transpose:
result = result.T
return [self.make_block(result)]
# might need to separate out blocks
cond = ~icond
axis = cond.ndim - 1
cond = cond.swapaxes(axis, 0)
mask = cond.all(axis=1)
result_blocks: list[Block] = []
for m in [mask, ~mask]:
if m.any():
taken = result.take(m.nonzero()[0], axis=axis)
r = maybe_downcast_numeric(taken, self.dtype)
if r.dtype != taken.dtype:
warnings.warn(
"Downcasting integer-dtype results in .where is "
"deprecated and will change in a future version. "
"To retain the old behavior, explicitly cast the results "
"to the desired dtype.",
FutureWarning,
stacklevel=find_stack_level(),
)
nb = self.make_block(r.T, placement=self._mgr_locs[m])
result_blocks.append(nb)
return result_blocks
def _unstack(
self,
unstacker,
fill_value,
new_placement: npt.NDArray[np.intp],
needs_masking: npt.NDArray[np.bool_],
):
"""
Return a list of unstacked blocks of self
Parameters
----------
unstacker : reshape._Unstacker
fill_value : int
Only used in ExtensionBlock._unstack
new_placement : np.ndarray[np.intp]
allow_fill : bool
needs_masking : np.ndarray[bool]
Returns
-------
blocks : list of Block
New blocks of unstacked values.
mask : array-like of bool
The mask of columns of `blocks` we should keep.
"""
new_values, mask = unstacker.get_new_values(
self.values.T, fill_value=fill_value
)
mask = mask.any(0)
# TODO: in all tests we have mask.all(); can we rely on that?
# Note: these next two lines ensure that
# mask.sum() == sum(len(nb.mgr_locs) for nb in blocks)
# which the calling function needs in order to pass verify_integrity=False
# to the BlockManager constructor
new_values = new_values.T[mask]
new_placement = new_placement[mask]
bp = BlockPlacement(new_placement)
blocks = [new_block_2d(new_values, placement=bp)]
return blocks, mask
@final
def quantile(
self, qs: Float64Index, interpolation="linear", axis: int = 0
) -> Block:
"""
compute the quantiles of the
Parameters
----------
qs : Float64Index
List of the quantiles to be computed.
interpolation : str, default 'linear'
Type of interpolation.
axis : int, default 0
Axis to compute.
Returns
-------
Block
"""
# We should always have ndim == 2 because Series dispatches to DataFrame
assert self.ndim == 2
assert axis == 1 # only ever called this way
assert is_list_like(qs) # caller is responsible for this
result = quantile_compat(self.values, np.asarray(qs._values), interpolation)
# ensure_block_shape needed for cases where we start with EA and result
# is ndarray, e.g. IntegerArray, SparseArray
result = ensure_block_shape(result, ndim=2)
return new_block_2d(result, placement=self._mgr_locs)
class EABackedBlock(Block):
"""
Mixin for Block subclasses backed by ExtensionArray.
"""
values: ExtensionArray
def where(self, other, cond) -> list[Block]:
arr = self.values.T
cond = extract_bool_array(cond)
other = self._maybe_squeeze_arg(other)
cond = self._maybe_squeeze_arg(cond)
if other is lib.no_default:
other = self.fill_value
icond, noop = validate_putmask(arr, ~cond)
if noop:
# GH#44181, GH#45135
# Avoid a) raising for Interval/PeriodDtype and b) unnecessary object upcast
return self.copy()
try:
res_values = arr._where(cond, other).T
except (ValueError, TypeError) as err:
_catch_deprecated_value_error(err)
if is_interval_dtype(self.dtype):
# TestSetitemFloatIntervalWithIntIntervalValues
blk = self.coerce_to_target_dtype(other)
if blk.dtype == _dtype_obj:
# For now at least only support casting e.g.
# Interval[int64]->Interval[float64]
raise
return blk.where(other, cond)
elif isinstance(self, NDArrayBackedExtensionBlock):
# NB: not (yet) the same as
# isinstance(values, NDArrayBackedExtensionArray)
if isinstance(self.dtype, PeriodDtype):
# TODO: don't special-case
raise
blk = self.coerce_to_target_dtype(other)
nbs = blk.where(other, cond)
return self._maybe_downcast(nbs, "infer")
else:
raise
nb = self.make_block_same_class(res_values)
return [nb]
def putmask(self, mask, new) -> list[Block]:
"""
See Block.putmask.__doc__
"""
mask = extract_bool_array(mask)
values = self.values
mask = self._maybe_squeeze_arg(mask)
try:
# Caller is responsible for ensuring matching lengths
values._putmask(mask, new)
except (TypeError, ValueError) as err:
_catch_deprecated_value_error(err)
if is_interval_dtype(self.dtype):
# Discussion about what we want to support in the general
# case GH#39584
blk = self.coerce_to_target_dtype(new)
if blk.dtype == _dtype_obj:
# For now at least, only support casting e.g.
# Interval[int64]->Interval[float64],
raise
return blk.putmask(mask, new)
elif isinstance(self, NDArrayBackedExtensionBlock):
# NB: not (yet) the same as
# isinstance(values, NDArrayBackedExtensionArray)
if isinstance(self.dtype, PeriodDtype):
# TODO: don't special-case
raise
blk = self.coerce_to_target_dtype(new)
return blk.putmask(mask, new)
else:
raise
return [self]
def delete(self, loc) -> None:
"""
Delete given loc(-s) from block in-place.
"""
# This will be unnecessary if/when __array_function__ is implemented
self.values = self.values.delete(loc)
self.mgr_locs = self._mgr_locs.delete(loc)
try:
self._cache.clear()
except AttributeError:
# _cache not yet initialized
pass
@cache_readonly
def array_values(self) -> ExtensionArray:
return self.values
def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
"""
return object dtype as boxed values, such as Timestamps/Timedelta
"""
values: ArrayLike = self.values
if dtype == _dtype_obj:
values = values.astype(object)
# TODO(EA2D): reshape not needed with 2D EAs
return np.asarray(values).reshape(self.shape)
def values_for_json(self) -> np.ndarray:
return np.asarray(self.values)
def interpolate(
self, method="pad", axis=0, inplace=False, limit=None, fill_value=None, **kwargs
):
values = self.values
if values.ndim == 2 and axis == 0:
# NDArrayBackedExtensionArray.fillna assumes axis=1
new_values = values.T.fillna(value=fill_value, method=method, limit=limit).T
else:
new_values = values.fillna(value=fill_value, method=method, limit=limit)
return self.make_block_same_class(new_values)
class ExtensionBlock(libinternals.Block, EABackedBlock):
"""
Block for holding extension types.
Notes
-----
This holds all 3rd-party extension array types. It's also the immediate
parent class for our internal extension types' blocks, CategoricalBlock.
ExtensionArrays are limited to 1-D.
"""
_can_consolidate = False
_validate_ndim = False
is_extension = True
values: ExtensionArray
@cache_readonly
def shape(self) -> Shape:
# TODO(EA2D): override unnecessary with 2D EAs
if self.ndim == 1:
return (len(self.values),)
return len(self._mgr_locs), len(self.values)
def iget(self, i: int | tuple[int, int] | tuple[slice, int]):
# In the case where we have a tuple[slice, int], the slice will always
# be slice(None)
# We _could_ make the annotation more specific, but mypy would
# complain about override mismatch:
# Literal[0] | tuple[Literal[0], int] | tuple[slice, int]
# Note: only reached with self.ndim == 2
if isinstance(i, tuple):
# TODO(EA2D): unnecessary with 2D EAs
col, loc = i
if not com.is_null_slice(col) and col != 0:
raise IndexError(f"{self} only contains one item")
elif isinstance(col, slice):
if col != slice(None):
raise NotImplementedError(col)
return self.values[[loc]]
return self.values[loc]
else:
if i != 0:
raise IndexError(f"{self} only contains one item")
return self.values
def set_inplace(self, locs, values) -> None:
# NB: This is a misnomer, is supposed to be inplace but is not,
# see GH#33457
# When an ndarray, we should have locs.tolist() == [0]
# When a BlockPlacement we should have list(locs) == [0]
self.values = values
try:
# TODO(GH33457) this can be removed
self._cache.clear()
except AttributeError:
# _cache not yet initialized
pass
def _maybe_squeeze_arg(self, arg):
"""
If necessary, squeeze a (N, 1) ndarray to (N,)
"""
# e.g. if we are passed a 2D mask for putmask
if isinstance(arg, np.ndarray) and arg.ndim == self.values.ndim + 1:
# TODO(EA2D): unnecessary with 2D EAs
assert arg.shape[1] == 1
arg = arg[:, 0]
return arg
@property
def is_view(self) -> bool:
"""Extension arrays are never treated as views."""
return False
@cache_readonly
def is_numeric(self):
return self.values.dtype._is_numeric
def setitem(self, indexer, value):
"""
Attempt self.values[indexer] = value, possibly creating a new array.
This differs from Block.setitem by not allowing setitem to change
the dtype of the Block.
Parameters
----------
indexer : tuple, list-like, array-like, slice, int
The subset of self.values to set
value : object
The value being set
Returns
-------
Block
Notes
-----
`indexer` is a direct slice/positional indexer. `value` must
be a compatible shape.
"""
if not self._can_hold_element(value):
# see TestSetitemFloatIntervalWithIntIntervalValues
return self.coerce_to_target_dtype(value).setitem(indexer, value)
if isinstance(indexer, tuple):
# TODO(EA2D): not needed with 2D EAs
# we are always 1-D
indexer = indexer[0]
if isinstance(indexer, np.ndarray) and indexer.ndim == 2:
# GH#44703
if indexer.shape[1] != 1:
raise NotImplementedError(
"This should not be reached. Please report a bug at "
"github.com/pandas-dev/pandas/"
)
indexer = indexer[:, 0]
# TODO(EA2D): not needed with 2D EAS
if isinstance(value, (np.ndarray, ExtensionArray)) and value.ndim == 2:
assert value.shape[1] == 1
# error: No overload variant of "__getitem__" of "ExtensionArray"
# matches argument type "Tuple[slice, int]"
value = value[:, 0] # type: ignore[call-overload]
elif isinstance(value, ABCDataFrame):
# TODO: should we avoid getting here with DataFrame?
assert value.shape[1] == 1
value = value._ixs(0, axis=1)._values
check_setitem_lengths(indexer, value, self.values)
self.values[indexer] = value
return self
def take_nd(
self,
indexer,
axis: int = 0,
new_mgr_locs: BlockPlacement | None = None,
fill_value=lib.no_default,
) -> Block:
"""
Take values according to indexer and return them as a block.
"""
if fill_value is lib.no_default:
fill_value = None
# TODO(EA2D): special case not needed with 2D EAs
# axis doesn't matter; we are really a single-dim object
# but are passed the axis depending on the calling routing
# if its REALLY axis 0, then this will be a reindex and not a take
new_values = self.values.take(indexer, fill_value=fill_value, allow_fill=True)
# Called from three places in managers, all of which satisfy
# this assertion
assert not (self.ndim == 1 and new_mgr_locs is None)
if new_mgr_locs is None:
new_mgr_locs = self._mgr_locs
return self.make_block_same_class(new_values, new_mgr_locs)
def _slice(self, slicer) -> ExtensionArray:
"""
Return a slice of my values.
Parameters
----------
slicer : slice, ndarray[int], or a tuple of these
Valid (non-reducing) indexer for self.values.
Returns
-------
ExtensionArray
"""
# return same dims as we currently have
if not isinstance(slicer, tuple) and self.ndim == 2:
# reached via getitem_block via _slice_take_blocks_ax0
# TODO(EA2D): won't be necessary with 2D EAs
slicer = (slicer, slice(None))
if isinstance(slicer, tuple) and len(slicer) == 2:
first = slicer[0]
if not isinstance(first, slice):
raise AssertionError(
"invalid slicing for a 1-ndim ExtensionArray", first
)
# GH#32959 only full-slicers along fake-dim0 are valid
# TODO(EA2D): won't be necessary with 2D EAs
# range(1) instead of self._mgr_locs to avoid exception on [::-1]
# see test_iloc_getitem_slice_negative_step_ea_block
new_locs = range(1)[first]
if len(new_locs):
# effectively slice(None)
slicer = slicer[1]
else:
raise AssertionError(
"invalid slicing for a 1-ndim ExtensionArray", slicer
)
return self.values[slicer]
@final
def getitem_block_index(self, slicer: slice) -> ExtensionBlock:
"""
Perform __getitem__-like specialized to slicing along index.
"""
# GH#42787 in principle this is equivalent to values[..., slicer], but we don't
# require subclasses of ExtensionArray to support that form (for now).
new_values = self.values[slicer]
return type(self)(new_values, self._mgr_locs, ndim=self.ndim)
def fillna(
self, value, limit=None, inplace: bool = False, downcast=None
) -> list[Block]:
values = self.values.fillna(value=value, limit=limit)
return [self.make_block_same_class(values=values)]
def diff(self, n: int, axis: int = 1) -> list[Block]:
if axis == 0 and n != 0:
# n==0 case will be a no-op so let is fall through
# Since we only have one column, the result will be all-NA.
# Create this result by shifting along axis=0 past the length of
# our values.
return super().diff(len(self.values), axis=0)
if axis == 1:
# TODO(EA2D): unnecessary with 2D EAs
# we are by definition 1D.
axis = 0
return super().diff(n, axis)
def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> list[Block]:
"""
Shift the block by `periods`.
Dispatches to underlying ExtensionArray and re-boxes in an
ExtensionBlock.
"""
new_values = self.values.shift(periods=periods, fill_value=fill_value)
return [self.make_block_same_class(new_values)]
def _unstack(
self,
unstacker,
fill_value,
new_placement: npt.NDArray[np.intp],
needs_masking: npt.NDArray[np.bool_],
):
# ExtensionArray-safe unstack.
# We override ObjectBlock._unstack, which unstacks directly on the
# values of the array. For EA-backed blocks, this would require
# converting to a 2-D ndarray of objects.
# Instead, we unstack an ndarray of integer positions, followed by
# a `take` on the actual values.
# Caller is responsible for ensuring self.shape[-1] == len(unstacker.index)
new_values, mask = unstacker.arange_result
# Note: these next two lines ensure that
# mask.sum() == sum(len(nb.mgr_locs) for nb in blocks)
# which the calling function needs in order to pass verify_integrity=False
# to the BlockManager constructor
new_values = new_values.T[mask]
new_placement = new_placement[mask]
# needs_masking[i] calculated once in BlockManager.unstack tells
# us if there are any -1s in the relevant indices. When False,
# that allows us to go through a faster path in 'take', among
# other things avoiding e.g. Categorical._validate_scalar.
blocks = [
# TODO: could cast to object depending on fill_value?
type(self)(
self.values.take(
indices, allow_fill=needs_masking[i], fill_value=fill_value
),
BlockPlacement(place),
ndim=2,
)
for i, (indices, place) in enumerate(zip(new_values, new_placement))
]
return blocks, mask
class NumpyBlock(libinternals.NumpyBlock, Block):
values: np.ndarray
class NumericBlock(NumpyBlock):
__slots__ = ()
is_numeric = True
class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock):
"""
Block backed by an NDArrayBackedExtensionArray
"""
values: NDArrayBackedExtensionArray
# error: Signature of "is_extension" incompatible with supertype "Block"
@cache_readonly
def is_extension(self) -> bool: # type: ignore[override]
# i.e. datetime64tz, PeriodDtype
return not isinstance(self.dtype, np.dtype)
@property
def is_view(self) -> bool:
"""return a boolean if I am possibly a view"""
# check the ndarray values of the DatetimeIndex values
return self.values._ndarray.base is not None
def setitem(self, indexer, value):
if not self._can_hold_element(value):
return self.coerce_to_target_dtype(value).setitem(indexer, value)
values = self.values
if self.ndim > 1:
# Dont transpose with ndim=1 bc we would fail to invalidate
# arr.freq
values = values.T
values[indexer] = value
return self
def diff(self, n: int, axis: int = 0) -> list[Block]:
"""
1st discrete difference.
Parameters
----------
n : int
Number of periods to diff.
axis : int, default 0
Axis to diff upon.
Returns
-------
A list with a new Block.
Notes
-----
The arguments here are mimicking shift so they are called correctly
by apply.
"""
values = self.values
new_values = values - values.shift(n, axis=axis)
return [self.make_block(new_values)]
def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> list[Block]:
values = self.values
new_values = values.shift(periods, fill_value=fill_value, axis=axis)
return [self.make_block_same_class(new_values)]
def fillna(
self, value, limit=None, inplace: bool = False, downcast=None
) -> list[Block]:
if not self._can_hold_element(value) and self.dtype.kind != "m":
# We support filling a DatetimeTZ with a `value` whose timezone
# is different by coercing to object.
# TODO: don't special-case td64
return self.coerce_to_target_dtype(value).fillna(
value, limit, inplace, downcast
)
new_values = self.values.fillna(value=value, limit=limit)
return [self.make_block_same_class(values=new_values)]
def _catch_deprecated_value_error(err: Exception) -> None:
"""
We catch ValueError for now, but only a specific one raised by DatetimeArray
which will no longer be raised in version.2.0.
"""
if isinstance(err, ValueError):
# TODO(2.0): once DTA._validate_setitem_value deprecation
# is enforced, stop catching ValueError here altogether
if "Timezones don't match" not in str(err):
raise
class DatetimeLikeBlock(NDArrayBackedExtensionBlock):
"""Block for datetime64[ns], timedelta64[ns]."""
__slots__ = ()
is_numeric = False
values: DatetimeArray | TimedeltaArray
def values_for_json(self) -> np.ndarray:
# special casing datetimetz to avoid conversion through
# object dtype
return self.values._ndarray
class DatetimeTZBlock(DatetimeLikeBlock):
"""implement a datetime64 block with a tz attribute"""
values: DatetimeArray
__slots__ = ()
is_extension = True
_validate_ndim = True
_can_consolidate = False
class ObjectBlock(NumpyBlock):
__slots__ = ()
is_object = True
@maybe_split
def reduce(self, func, ignore_failures: bool = False) -> list[Block]:
"""
For object-dtype, we operate column-wise.
"""
assert self.ndim == 2
try:
res = func(self.values)
except TypeError:
if not ignore_failures:
raise
return []
assert isinstance(res, np.ndarray)
assert res.ndim == 1
res = res.reshape(1, -1)
return [self.make_block_same_class(res)]
@maybe_split
def convert(
self,
copy: bool = True,
datetime: bool = True,
numeric: bool = True,
timedelta: bool = True,
) -> list[Block]:
"""
attempt to cast any object types to better types return a copy of
the block (if copy = True) by definition we ARE an ObjectBlock!!!!!
"""
values = self.values
if values.ndim == 2:
# maybe_split ensures we only get here with values.shape[0] == 1,
# avoid doing .ravel as that might make a copy
values = values[0]
res_values = soft_convert_objects(
values,
datetime=datetime,
numeric=numeric,
timedelta=timedelta,
copy=copy,
)
res_values = ensure_block_shape(res_values, self.ndim)
return [self.make_block(res_values)]
class CategoricalBlock(ExtensionBlock):
# this Block type is kept for backwards-compatibility
__slots__ = ()
# GH#43232, GH#43334 self.values.dtype can be changed inplace until 2.0,
# so this cannot be cached
@property
def dtype(self) -> DtypeObj:
return self.values.dtype
# -----------------------------------------------------------------
# Constructor Helpers
def maybe_coerce_values(values: ArrayLike) -> ArrayLike:
"""
Input validation for values passed to __init__. Ensure that
any datetime64/timedelta64 dtypes are in nanoseconds. Ensure
that we do not have string dtypes.
Parameters
----------
values : np.ndarray or ExtensionArray
Returns
-------
values : np.ndarray or ExtensionArray
"""
# Caller is responsible for ensuring PandasArray is already extracted.
if isinstance(values, np.ndarray):
values = ensure_wrapped_if_datetimelike(values)
if issubclass(values.dtype.type, str):
values = np.array(values, dtype=object)
if isinstance(values, (DatetimeArray, TimedeltaArray)) and values.freq is not None:
# freq is only stored in DatetimeIndex/TimedeltaIndex, not in Series/DataFrame
values = values._with_freq(None)
return values
def get_block_type(dtype: DtypeObj):
"""
Find the appropriate Block subclass to use for the given values and dtype.
Parameters
----------
dtype : numpy or pandas dtype
Returns
-------
cls : class, subclass of Block
"""
# We use vtype and kind checks because they are much more performant
# than is_foo_dtype
vtype = dtype.type
kind = dtype.kind
cls: type[Block]
if isinstance(dtype, SparseDtype):
# Need this first(ish) so that Sparse[datetime] is sparse
cls = ExtensionBlock
elif isinstance(dtype, CategoricalDtype):
cls = CategoricalBlock
elif vtype is Timestamp:
cls = DatetimeTZBlock
elif isinstance(dtype, PeriodDtype):
cls = NDArrayBackedExtensionBlock
elif isinstance(dtype, ExtensionDtype):
# Note: need to be sure PandasArray is unwrapped before we get here
cls = ExtensionBlock
elif kind in ["M", "m"]:
cls = DatetimeLikeBlock
elif kind in ["f", "c", "i", "u", "b"]:
cls = NumericBlock
else:
cls = ObjectBlock
return cls
def new_block_2d(values: ArrayLike, placement: BlockPlacement):
# new_block specialized to case with
# ndim=2
# isinstance(placement, BlockPlacement)
# check_ndim/ensure_block_shape already checked
klass = get_block_type(values.dtype)
values = maybe_coerce_values(values)
return klass(values, ndim=2, placement=placement)
def new_block(values, placement, *, ndim: int) -> Block:
# caller is responsible for ensuring values is NOT a PandasArray
if not isinstance(placement, BlockPlacement):
placement = BlockPlacement(placement)
check_ndim(values, placement, ndim)
klass = get_block_type(values.dtype)
values = maybe_coerce_values(values)
return klass(values, ndim=ndim, placement=placement)
def check_ndim(values, placement: BlockPlacement, ndim: int):
"""
ndim inference and validation.
Validates that values.ndim and ndim are consistent.
Validates that len(values) and len(placement) are consistent.
Parameters
----------
values : array-like
placement : BlockPlacement
ndim : int
Raises
------
ValueError : the number of dimensions do not match
"""
if values.ndim > ndim:
# Check for both np.ndarray and ExtensionArray
raise ValueError(
"Wrong number of dimensions. "
f"values.ndim > ndim [{values.ndim} > {ndim}]"
)
elif not is_1d_only_ea_dtype(values.dtype):
# TODO(EA2D): special case not needed with 2D EAs
if values.ndim != ndim:
raise ValueError(
"Wrong number of dimensions. "
f"values.ndim != ndim [{values.ndim} != {ndim}]"
)
if len(placement) != len(values):
raise ValueError(
f"Wrong number of items passed {len(values)}, "
f"placement implies {len(placement)}"
)
elif ndim == 2 and len(placement) != 1:
# TODO(EA2D): special case unnecessary with 2D EAs
raise ValueError("need to split")
def extract_pandas_array(
values: np.ndarray | ExtensionArray, dtype: DtypeObj | None, ndim: int
) -> tuple[np.ndarray | ExtensionArray, DtypeObj | None]:
"""
Ensure that we don't allow PandasArray / PandasDtype in internals.
"""
# For now, blocks should be backed by ndarrays when possible.
if isinstance(values, ABCPandasArray):
values = values.to_numpy()
if ndim and ndim > 1:
# TODO(EA2D): special case not needed with 2D EAs
values = np.atleast_2d(values)
if isinstance(dtype, PandasDtype):
dtype = dtype.numpy_dtype
return values, dtype
# -----------------------------------------------------------------
def extend_blocks(result, blocks=None) -> list[Block]:
"""return a new extended blocks, given the result"""
if blocks is None:
blocks = []
if isinstance(result, list):
for r in result:
if isinstance(r, list):
blocks.extend(r)
else:
blocks.append(r)
else:
assert isinstance(result, Block), type(result)
blocks.append(result)
return blocks
def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike:
"""
Reshape if possible to have values.ndim == ndim.
"""
if values.ndim < ndim:
if not is_1d_only_ea_dtype(values.dtype):
# TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023
# block.shape is incorrect for "2D" ExtensionArrays
# We can't, and don't need to, reshape.
values = cast("np.ndarray | DatetimeArray | TimedeltaArray", values)
values = values.reshape(1, -1)
return values
def to_native_types(
values: ArrayLike,
*,
na_rep="nan",
quoting=None,
float_format=None,
decimal=".",
**kwargs,
) -> np.ndarray:
"""convert to our native types format"""
if isinstance(values, Categorical) and values.categories.dtype.kind in "Mm":
# GH#40754 Convert categorical datetimes to datetime array
values = take_nd(
values.categories._values,
ensure_platform_int(values._codes),
fill_value=na_rep,
)
values = ensure_wrapped_if_datetimelike(values)
if isinstance(values, (DatetimeArray, TimedeltaArray)):
if values.ndim == 1:
result = values._format_native_types(na_rep=na_rep, **kwargs)
result = result.astype(object, copy=False)
return result
# GH#21734 Process every column separately, they might have different formats
results_converted = []
for i in range(len(values)):
result = values[i, :]._format_native_types(na_rep=na_rep, **kwargs)
results_converted.append(result.astype(object, copy=False))
return np.vstack(results_converted)
elif isinstance(values, ExtensionArray):
mask = isna(values)
new_values = np.asarray(values.astype(object))
new_values[mask] = na_rep
return new_values
elif values.dtype.kind == "f":
# see GH#13418: no special formatting is desired at the
# output (important for appropriate 'quoting' behaviour),
# so do not pass it through the FloatArrayFormatter
if float_format is None and decimal == ".":
mask = isna(values)
if not quoting:
values = values.astype(str)
else:
values = np.array(values, dtype="object")
values[mask] = na_rep
values = values.astype(object, copy=False)
return values
from pandas.io.formats.format import FloatArrayFormatter
formatter = FloatArrayFormatter(
values,
na_rep=na_rep,
float_format=float_format,
decimal=decimal,
quoting=quoting,
fixed_width=False,
)
res = formatter.get_result_as_array()
res = res.astype(object, copy=False)
return res
else:
mask = isna(values)
itemsize = writers.word_len(na_rep)
if values.dtype != _dtype_obj and not quoting and itemsize:
values = values.astype(str)
if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize:
# enlarge for the na_rep
values = values.astype(f"<U{itemsize}")
else:
values = np.array(values, dtype="object")
values[mask] = na_rep
values = values.astype(object, copy=False)
return values
def external_values(values: ArrayLike) -> ArrayLike:
"""
The array that Series.values returns (public attribute).
This has some historical constraints, and is overridden in block
subclasses to return the correct array (e.g. period returns
object ndarray and datetimetz a datetime64[ns] ndarray instead of
proper extension array).
"""
if isinstance(values, (PeriodArray, IntervalArray)):
return values.astype(object)
elif isinstance(values, (DatetimeArray, TimedeltaArray)):
# NB: for datetime64tz this is different from np.asarray(values), since
# that returns an object-dtype ndarray of Timestamps.
# Avoid FutureWarning in .astype in casting from dt64tz to dt64
return values._data
else:
return values