针对pulse-transit的工具
This commit is contained in:
13
dist/client/pandas/core/arrays/sparse/__init__.py
vendored
Normal file
13
dist/client/pandas/core/arrays/sparse/__init__.py
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
# flake8: noqa: F401
|
||||
|
||||
from pandas.core.arrays.sparse.accessor import (
|
||||
SparseAccessor,
|
||||
SparseFrameAccessor,
|
||||
)
|
||||
from pandas.core.arrays.sparse.array import (
|
||||
BlockIndex,
|
||||
IntIndex,
|
||||
SparseArray,
|
||||
make_sparse_index,
|
||||
)
|
||||
from pandas.core.arrays.sparse.dtype import SparseDtype
|
||||
BIN
dist/client/pandas/core/arrays/sparse/__pycache__/__init__.cpython-310.pyc
vendored
Normal file
BIN
dist/client/pandas/core/arrays/sparse/__pycache__/__init__.cpython-310.pyc
vendored
Normal file
Binary file not shown.
BIN
dist/client/pandas/core/arrays/sparse/__pycache__/accessor.cpython-310.pyc
vendored
Normal file
BIN
dist/client/pandas/core/arrays/sparse/__pycache__/accessor.cpython-310.pyc
vendored
Normal file
Binary file not shown.
BIN
dist/client/pandas/core/arrays/sparse/__pycache__/array.cpython-310.pyc
vendored
Normal file
BIN
dist/client/pandas/core/arrays/sparse/__pycache__/array.cpython-310.pyc
vendored
Normal file
Binary file not shown.
BIN
dist/client/pandas/core/arrays/sparse/__pycache__/dtype.cpython-310.pyc
vendored
Normal file
BIN
dist/client/pandas/core/arrays/sparse/__pycache__/dtype.cpython-310.pyc
vendored
Normal file
Binary file not shown.
BIN
dist/client/pandas/core/arrays/sparse/__pycache__/scipy_sparse.cpython-310.pyc
vendored
Normal file
BIN
dist/client/pandas/core/arrays/sparse/__pycache__/scipy_sparse.cpython-310.pyc
vendored
Normal file
Binary file not shown.
386
dist/client/pandas/core/arrays/sparse/accessor.py
vendored
Normal file
386
dist/client/pandas/core/arrays/sparse/accessor.py
vendored
Normal file
@@ -0,0 +1,386 @@
|
||||
"""Sparse accessor"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
|
||||
from pandas.core.dtypes.cast import find_common_type
|
||||
|
||||
from pandas.core.accessor import (
|
||||
PandasDelegate,
|
||||
delegate_names,
|
||||
)
|
||||
from pandas.core.arrays.sparse.array import SparseArray
|
||||
from pandas.core.arrays.sparse.dtype import SparseDtype
|
||||
|
||||
|
||||
class BaseAccessor:
    """
    Shared base for the ``.sparse`` accessors.

    The constructor stores the accessed object on ``_parent`` and then
    validates it by calling :meth:`_validate`, which subclasses must
    override.
    """

    # Error text for subclasses to use when the data is not sparse.
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None):
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Check ``data``; the base implementation always raises."""
        raise NotImplementedError
|
||||
|
||||
|
||||
@delegate_names(
    SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
)
class SparseAccessor(BaseAccessor, PandasDelegate):
    """
    Accessor for sparse data stored in a Series.

    Exposes selected properties of the backing sparse array and provides
    conversions to and from ``scipy.sparse`` COO matrices.
    """

    def _validate(self, data):
        # The accessor is only meaningful for data carrying a SparseDtype.
        if not isinstance(data.dtype, SparseDtype):
            raise AttributeError(self._validation_msg)

    def _delegate_property_get(self, name, *args, **kwargs):
        # Delegated properties are read straight from the backing array.
        return getattr(self._parent.array, name)

    def _delegate_method(self, name, *args, **kwargs):
        if name == "from_coo":
            return self.from_coo(*args, **kwargs)
        elif name == "to_coo":
            return self.to_coo(*args, **kwargs)
        else:
            # Same exception type as before; the message names the offender.
            raise ValueError(f"Cannot delegate unknown method {name!r}")

    @classmethod
    def from_coo(cls, A, dense_index=False):
        """
        Create a Series with sparse values from a scipy.sparse.coo_matrix.

        Parameters
        ----------
        A : scipy.sparse.coo_matrix
        dense_index : bool, default False
            If False (default), the index of the result consists of only the
            coords of the non-null entries of the original coo_matrix.
            If True, the index of the result consists of the full sorted
            (row, col) coordinates of the coo_matrix.

        Returns
        -------
        s : Series
            A Series with sparse values.

        Examples
        --------
        >>> from scipy import sparse

        >>> A = sparse.coo_matrix(
        ...     ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
        ... )
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
            with 3 stored elements in COOrdinate format>

        >>> A.todense()
        matrix([[0., 0., 1., 2.],
                [3., 0., 0., 0.],
                [0., 0., 0., 0.]])

        >>> ss = pd.Series.sparse.from_coo(A)
        >>> ss
        0  2    1.0
           3    2.0
        1  0    3.0
        dtype: Sparse[float64, nan]
        """
        # Imported lazily, as in the rest of this module.
        from pandas import Series
        from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series

        result = coo_to_sparse_series(A, dense_index=dense_index)
        # Re-wrap the sparse array in a plain Series without copying.
        result = Series(result.array, index=result.index, copy=False)

        return result

    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
        """
        Create a scipy.sparse.coo_matrix from a Series with MultiIndex.

        Use row_levels and column_levels to determine the row and column
        coordinates respectively. row_levels and column_levels are the names
        (labels) or numbers of the levels. {row_levels, column_levels} must be
        a partition of the MultiIndex level names (or numbers).

        Parameters
        ----------
        row_levels : tuple/list
        column_levels : tuple/list
        sort_labels : bool, default False
            Sort the row and column labels before forming the sparse matrix.
            When `row_levels` and/or `column_levels` refer to a single level,
            set to `True` for a faster execution.

        Returns
        -------
        y : scipy.sparse.coo_matrix
        rows : list (row labels)
        columns : list (column labels)

        Examples
        --------
        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
        >>> s.index = pd.MultiIndex.from_tuples(
        ...     [
        ...         (1, 2, "a", 0),
        ...         (1, 2, "a", 1),
        ...         (1, 1, "b", 0),
        ...         (1, 1, "b", 1),
        ...         (2, 1, "b", 0),
        ...         (2, 1, "b", 1)
        ...     ],
        ...     names=["A", "B", "C", "D"],
        ... )
        >>> s
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: float64

        >>> ss = s.astype("Sparse")
        >>> ss
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: Sparse[float64, nan]

        >>> A, rows, columns = ss.sparse.to_coo(
        ...     row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
        ... )
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
            with 3 stored elements in COOrdinate format>
        >>> A.todense()
        matrix([[0., 0., 1., 3.],
                [3., 0., 0., 0.],
                [0., 0., 0., 0.]])

        >>> rows
        [(1, 1), (1, 2), (2, 1)]
        >>> columns
        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
        """
        from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo

        A, rows, columns = sparse_series_to_coo(
            self._parent, row_levels, column_levels, sort_labels=sort_labels
        )
        return A, rows, columns

    def to_dense(self):
        """
        Convert a Series from sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        Series:
            A Series with the same values, stored as a dense array.

        Examples
        --------
        >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))
        >>> series
        0    0
        1    1
        2    0
        dtype: Sparse[int64, 0]

        >>> series.sparse.to_dense()
        0    0
        1    1
        2    0
        dtype: int64
        """
        from pandas import Series

        # Index and name are carried over from the accessed Series.
        return Series(
            self._parent.array.to_dense(),
            index=self._parent.index,
            name=self._parent.name,
        )
|
||||
|
||||
|
||||
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    .. versionadded:: 0.25.0
    """

    def _validate(self, data):
        # Every column must be sparse for the accessor to apply.
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        """
        Create a new DataFrame from a scipy sparse matrix.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`arrays.SparseArray`.

        Raises
        ------
        ValueError
            If the length of ``index`` or ``columns`` does not match the
            shape of ``data``.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas._libs.sparse import IntIndex

        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        n_rows, n_columns = data.shape
        # We need to make sure indices are sorted, as we create
        # IntIndex with no input validation (i.e. check_integrity=False ).
        # Indices may already be sorted in scipy in which case this adds
        # a small overhead.
        data.sort_indices()
        indices = data.indices
        indptr = data.indptr
        array_data = data.data
        dtype = SparseDtype(array_data.dtype, 0)
        arrays = []
        for i in range(n_columns):
            # indptr[i]:indptr[i + 1] delimits column i in the CSC buffers.
            sl = slice(indptr[i], indptr[i + 1])
            idx = IntIndex(n_rows, indices[sl], check_integrity=False)
            arr = SparseArray._simple_new(array_data[sl], idx, dtype)
            arrays.append(arr)
        return DataFrame._from_arrays(
            arrays, columns=columns, index=index, verify_integrity=False
        )

    def to_dense(self):
        """
        Convert a DataFrame with sparse values to dense.

        .. versionadded:: 0.25.0

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        data = {k: v.array.to_dense() for k, v in self._parent.items()}
        return DataFrame(data, index=self._parent.index, columns=self._parent.columns)

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        .. versionadded:: 0.25.0

        Returns
        -------
        coo_matrix : scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Raises
        ------
        ValueError
            If any column has a fill value other than 0.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.
        """
        import_optional_dependency("scipy")
        from scipy.sparse import coo_matrix

        dtype = find_common_type(self._parent.dtypes.to_list())
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        cols, rows, data = [], [], []
        # ``items`` replaces the deprecated ``iteritems`` alias; it is the
        # same iteration the other methods of this accessor already use.
        for col, (_, ser) in enumerate(self._parent.items()):
            sp_arr = ser.array
            if sp_arr.fill_value != 0:
                raise ValueError("fill value must be 0 when converting to COO matrix")

            row = sp_arr.sp_index.indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            data.append(sp_arr.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        data = np.concatenate(data)
        return coo_matrix((data, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self) -> float:
        """
        Ratio of non-sparse points to total (dense) data points.
        """
        # Unweighted mean of the per-column densities.
        tmp = np.mean([column.array.density for _, column in self._parent.items()])
        return tmp

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Build row and column indexes matching ``data.shape``.

        Missing labels default to ``default_index`` of the matching length;
        supplied labels are passed through ``ensure_index``. A ``ValueError``
        is raised when a length does not match the shape.
        """
        from pandas.core.indexes.api import (
            default_index,
            ensure_index,
        )

        N, K = data.shape
        if index is None:
            index = default_index(N)
        else:
            index = ensure_index(index)
        if columns is None:
            columns = default_index(K)
        else:
            columns = ensure_index(columns)

        if len(columns) != K:
            raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")
        if len(index) != N:
            raise ValueError(f"Index length mismatch: {len(index)} vs. {N}")
        return index, columns
|
||||
1853
dist/client/pandas/core/arrays/sparse/array.py
vendored
Normal file
1853
dist/client/pandas/core/arrays/sparse/array.py
vendored
Normal file
File diff suppressed because it is too large
Load Diff
414
dist/client/pandas/core/arrays/sparse/dtype.py
vendored
Normal file
414
dist/client/pandas/core/arrays/sparse/dtype.py
vendored
Normal file
@@ -0,0 +1,414 @@
|
||||
"""Sparse Dtype"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
)
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import (
|
||||
Dtype,
|
||||
DtypeObj,
|
||||
type_t,
|
||||
)
|
||||
from pandas.errors import PerformanceWarning
|
||||
from pandas.util._exceptions import find_stack_level
|
||||
|
||||
from pandas.core.dtypes.base import (
|
||||
ExtensionDtype,
|
||||
register_extension_dtype,
|
||||
)
|
||||
from pandas.core.dtypes.cast import astype_nansafe
|
||||
from pandas.core.dtypes.common import (
|
||||
is_bool_dtype,
|
||||
is_object_dtype,
|
||||
is_scalar,
|
||||
is_string_dtype,
|
||||
pandas_dtype,
|
||||
)
|
||||
from pandas.core.dtypes.missing import (
|
||||
isna,
|
||||
na_value_for_dtype,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pandas.core.arrays.sparse.array import SparseArray
|
||||
|
||||
|
||||
@register_extension_dtype
class SparseDtype(ExtensionDtype):
    """
    Dtype for data stored in :class:`SparseArray`.

    This dtype implements the pandas ExtensionDtype interface.

    Parameters
    ----------
    dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
        The dtype of the underlying array storing the non-fill value values.
    fill_value : scalar, optional
        The scalar value not stored in the SparseArray. By default, this
        depends on `dtype`.

        =========== ==========
        dtype       na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        ``False``
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The default value may be overridden by specifying a `fill_value`.

    Attributes
    ----------
    None

    Methods
    -------
    None
    """

    # We include `_is_na_fill_value` in the metadata to avoid hash collisions
    # between SparseDtype(float, 0.0) and SparseDtype(float, nan).
    # Without is_na_fill_value in the comparison, those would be equal since
    # hash(nan) is (sometimes?) 0.
    _metadata = ("_dtype", "_fill_value", "_is_na_fill_value")

    def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None):

        # Unwrap an existing SparseDtype, inheriting its fill value unless
        # the caller supplied one explicitly.
        if isinstance(dtype, type(self)):
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        dtype = pandas_dtype(dtype)
        # String data is stored with an object subtype.
        if is_string_dtype(dtype):
            dtype = np.dtype("object")

        if fill_value is None:
            fill_value = na_value_for_dtype(dtype)

        self._dtype = dtype
        self._fill_value = fill_value
        self._check_fill_value()

    def __hash__(self):
        # Python3 doesn't inherit __hash__ when a base class overrides
        # __eq__, so we explicitly do it here.
        return super().__hash__()

    def __eq__(self, other: Any) -> bool:
        # We have to override __eq__ to handle NA values in _metadata.
        # The base class does simple == checks, which fail for NA.
        if isinstance(other, str):
            try:
                other = self.construct_from_string(other)
            except TypeError:
                return False

        if isinstance(other, type(self)):
            subtype = self.subtype == other.subtype
            if self._is_na_fill_value:
                # this case is complicated by two things:
                # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
                # SparseDtype(float, np.nan)     != SparseDtype(float, pd.NaT)
                # i.e. we want to treat any floating-point NaN as equal, but
                # not a floating-point NaN and a datetime NaT.
                #
                # The parentheses matter: ``and`` binds tighter than ``or``,
                # so without them an NA fill value compared equal to any
                # non-NA fill value of a compatible type (e.g. nan vs 1.0),
                # making equality asymmetric and inconsistent with __hash__.
                fill_value = other._is_na_fill_value and (
                    isinstance(self.fill_value, type(other.fill_value))
                    or isinstance(other.fill_value, type(self.fill_value))
                )
            else:
                fill_value = self.fill_value == other.fill_value

            return subtype and fill_value
        return False

    @property
    def fill_value(self):
        """
        The fill value of the array.

        Converting the SparseArray to a dense ndarray will fill the
        array with this value.

        .. warning::

           It's possible to end up with a SparseArray that has ``fill_value``
           values in ``sp_values``. This can occur, for example, when setting
           ``SparseArray.fill_value`` directly.
        """
        return self._fill_value

    def _check_fill_value(self):
        """Raise ``ValueError`` if the stored fill value is not a scalar."""
        if not is_scalar(self._fill_value):
            raise ValueError(
                f"fill_value must be a scalar. Got {self._fill_value} instead"
            )
        # TODO: Right now we can use Sparse boolean array
        #       with any fill_value. Here was an attempt
        #       to allow only 3 value: True, False or nan
        #       but plenty test has failed.
        # see pull 44955
        # if self._is_boolean and not (
        #    is_bool(self._fill_value) or isna(self._fill_value)
        # ):
        #    raise ValueError(
        #        "fill_value must be True, False or nan "
        #        f"for boolean type. Got {self._fill_value} instead"
        #    )

    @property
    def _is_na_fill_value(self) -> bool:
        return isna(self.fill_value)

    @property
    def _is_numeric(self) -> bool:
        # Anything that is not object dtype is treated as numeric here.
        return not is_object_dtype(self.subtype)

    @property
    def _is_boolean(self) -> bool:
        return is_bool_dtype(self.subtype)

    @property
    def kind(self):
        """
        The ``kind`` attribute of the subtype.

        This is forwarded from ``self.subtype`` and is unrelated to the
        'integer'/'block' sparse index kinds.
        """
        return self.subtype.kind

    @property
    def type(self):
        return self.subtype.type

    @property
    def subtype(self):
        return self._dtype

    @property
    def name(self):
        return f"Sparse[{self.subtype.name}, {repr(self.fill_value)}]"

    def __repr__(self) -> str:
        return self.name

    @classmethod
    def construct_array_type(cls) -> type_t[SparseArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        # Imported here rather than at module level.
        from pandas.core.arrays.sparse.array import SparseArray

        return SparseArray

    @classmethod
    def construct_from_string(cls, string: str) -> SparseDtype:
        """
        Construct a SparseDtype from a string form.

        Parameters
        ----------
        string : str
            Can take the following forms.

            string           dtype
            ================ ============================
            'int'            SparseDtype[np.int64, 0]
            'Sparse'         SparseDtype[np.float64, nan]
            'Sparse[int]'    SparseDtype[np.int64, 0]
            'Sparse[int, 0]' SparseDtype[np.int64, 0]
            ================ ============================

            It is not possible to specify non-default fill values
            with a string. An argument like ``'Sparse[int, 1]'``
            will raise a ``TypeError`` because the default fill value
            for integers is 0.

        Returns
        -------
        SparseDtype

        Raises
        ------
        TypeError
            If ``string`` is not a str, does not start with ``"Sparse"``,
            cannot be parsed, or names a non-default fill value.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        msg = f"Cannot construct a 'SparseDtype' from '{string}'"
        if string.startswith("Sparse"):
            try:
                sub_type, has_fill_value = cls._parse_subtype(string)
            except ValueError as err:
                raise TypeError(msg) from err
            else:
                result = SparseDtype(sub_type)
                msg = (
                    f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt "
                    "looks like the fill_value in the string is not "
                    "the default for the dtype. Non-default fill_values "
                    "are not supported. Use the 'SparseDtype()' "
                    "constructor instead."
                )
                # A fill value in the string is only accepted when the string
                # round-trips to the default-constructed dtype.
                if has_fill_value and str(result) != string:
                    raise TypeError(msg)
                return result
        else:
            raise TypeError(msg)

    @staticmethod
    def _parse_subtype(dtype: str) -> tuple[str, bool]:
        """
        Parse a string to get the subtype

        Parameters
        ----------
        dtype : str
            A string like

            * Sparse[subtype]
            * Sparse[subtype, fill_value]

        Returns
        -------
        subtype : str
        has_fill_value : bool
            Whether a non-empty fill value was present in the string.

        Raises
        ------
        ValueError
            When the subtype cannot be extracted.
        """
        xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$")
        m = xpr.match(dtype)
        has_fill_value = False
        if m:
            subtype = m.groupdict()["subtype"]
            has_fill_value = bool(m.groupdict()["fill_value"])
        elif dtype == "Sparse":
            subtype = "float64"
        else:
            raise ValueError(f"Cannot parse {dtype}")
        return subtype, has_fill_value

    @classmethod
    def is_dtype(cls, dtype: object) -> bool:
        dtype = getattr(dtype, "dtype", dtype)
        if isinstance(dtype, str) and dtype.startswith("Sparse"):
            sub_type, _ = cls._parse_subtype(dtype)
            dtype = np.dtype(sub_type)
        elif isinstance(dtype, cls):
            return True
        # NOTE(review): this also returns True for a plain numpy dtype passed
        # in directly, not only for one parsed from a "Sparse[...]" string --
        # confirm that this is intended before relying on it.
        return isinstance(dtype, np.dtype) or dtype == "Sparse"

    def update_dtype(self, dtype) -> SparseDtype:
        """
        Convert the SparseDtype to a new dtype.

        This takes care of converting the ``fill_value``.

        Parameters
        ----------
        dtype : Union[str, numpy.dtype, SparseDtype]
            The new dtype to use.

            * For a SparseDtype, it is simply returned
            * For a NumPy dtype (or str), the current fill value
              is converted to the new dtype, and a SparseDtype
              with `dtype` and the new fill value is returned.

        Returns
        -------
        SparseDtype
            A new SparseDtype with the correct `dtype` and fill value
            for that `dtype`.

        Raises
        ------
        ValueError
            When the current fill value cannot be converted to the
            new `dtype` (e.g. trying to convert ``np.nan`` to an
            integer dtype).
        TypeError
            When `dtype` is an extension dtype other than SparseDtype.

        Examples
        --------
        >>> SparseDtype(int, 0).update_dtype(float)
        Sparse[float64, 0.0]

        >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
        Sparse[float64, nan]
        """
        cls = type(self)
        dtype = pandas_dtype(dtype)

        if not isinstance(dtype, cls):
            if not isinstance(dtype, np.dtype):
                raise TypeError("sparse arrays of extension dtypes not supported")

            # Cast the fill value through a 0-d array, then unbox it.
            fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
            dtype = cls(dtype, fill_value=fill_value)

        return dtype

    @property
    def _subtype_with_str(self):
        """
        Whether the SparseDtype's subtype should be considered ``str``.

        Typically, pandas will store string data in an object-dtype array.
        When converting values to a dtype, e.g. in ``.astype``, we need to
        be more specific, we need the actual underlying type.

        Examples
        --------
        >>> SparseDtype(int, 1)._subtype_with_str
        dtype('int64')

        >>> SparseDtype(object, 1)._subtype_with_str
        dtype('O')

        >>> dtype = SparseDtype(str, '')
        >>> dtype.subtype
        dtype('O')

        >>> dtype._subtype_with_str
        <class 'str'>
        """
        # A string fill value is the signal that the data are strings.
        if isinstance(self.fill_value, str):
            return type(self.fill_value)
        return self.subtype

    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
        # TODO for now only handle SparseDtypes and numpy dtypes => extend
        # with other compatible extension dtypes
        if any(
            isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype)
            for x in dtypes
        ):
            return None

        fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
        fill_value = fill_values[0]

        # np.nan isn't a singleton, so we may end up with multiple
        # NaNs here, so we ignore the all NA case too.
        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
            warnings.warn(
                "Concatenating sparse arrays with multiple fill "
                f"values: '{fill_values}'. Picking the first and "
                "converting the rest.",
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )

        np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
        # NOTE(review): np.find_common_type is deprecated in recent NumPy
        # releases -- verify against the NumPy version shipped alongside.
        return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)
|
||||
211
dist/client/pandas/core/arrays/sparse/scipy_sparse.py
vendored
Normal file
211
dist/client/pandas/core/arrays/sparse/scipy_sparse.py
vendored
Normal file
@@ -0,0 +1,211 @@
|
||||
"""
|
||||
Interaction with scipy.sparse matrices.
|
||||
|
||||
Currently only includes to_coo helpers.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Iterable,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import lib
|
||||
from pandas._typing import (
|
||||
IndexLabel,
|
||||
npt,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.missing import notna
|
||||
|
||||
from pandas.core.algorithms import factorize
|
||||
from pandas.core.indexes.api import MultiIndex
|
||||
from pandas.core.series import Series
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import scipy.sparse
|
||||
|
||||
|
||||
def _check_is_partition(parts: Iterable, whole: Iterable):
|
||||
whole = set(whole)
|
||||
parts = [set(x) for x in parts]
|
||||
if set.intersection(*parts) != set():
|
||||
raise ValueError("Is not a partition because intersection is not null.")
|
||||
if set.union(*parts) != whole:
|
||||
raise ValueError("Is not a partition because union is not the whole.")
|
||||
|
||||
|
||||
def _levels_to_axis(
|
||||
ss,
|
||||
levels: tuple[int] | list[int],
|
||||
valid_ilocs: npt.NDArray[np.intp],
|
||||
sort_labels: bool = False,
|
||||
) -> tuple[npt.NDArray[np.intp], list[IndexLabel]]:
|
||||
"""
|
||||
For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`,
|
||||
where `ax_coords` are the coordinates along one of the two axes of the
|
||||
destination sparse matrix, and `ax_labels` are the labels from `ss`' Index
|
||||
which correspond to these coordinates.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ss : Series
|
||||
levels : tuple/list
|
||||
valid_ilocs : numpy.ndarray
|
||||
Array of integer positions of valid values for the sparse matrix in ss.
|
||||
sort_labels : bool, default False
|
||||
Sort the axis labels before forming the sparse matrix. When `levels`
|
||||
refers to a single level, set to True for a faster execution.
|
||||
|
||||
Returns
|
||||
-------
|
||||
ax_coords : numpy.ndarray (axis coordinates)
|
||||
ax_labels : list (axis labels)
|
||||
"""
|
||||
# Since the labels are sorted in `Index.levels`, when we wish to sort and
|
||||
# there is only one level of the MultiIndex for this axis, the desired
|
||||
# output can be obtained in the following simpler, more efficient way.
|
||||
if sort_labels and len(levels) == 1:
|
||||
ax_coords = ss.index.codes[levels[0]][valid_ilocs]
|
||||
ax_labels = ss.index.levels[levels[0]]
|
||||
|
||||
else:
|
||||
levels_values = lib.fast_zip(
|
||||
[ss.index.get_level_values(lvl).values for lvl in levels]
|
||||
)
|
||||
codes, ax_labels = factorize(levels_values, sort=sort_labels)
|
||||
ax_coords = codes[valid_ilocs]
|
||||
|
||||
ax_labels = ax_labels.tolist()
|
||||
return ax_coords, ax_labels
|
||||
|
||||
|
||||
def _to_ijv(
    ss,
    row_levels: tuple[int] | list[int] = (0,),
    column_levels: tuple[int] | list[int] = (1,),
    sort_labels: bool = False,
) -> tuple[
    np.ndarray,
    npt.NDArray[np.intp],
    npt.NDArray[np.intp],
    list[IndexLabel],
    list[IndexLabel],
]:
    """
    For an arbitrary MultiIndexed sparse Series return (v, i, j, ilabels,
    jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo
    constructor, and ilabels and jlabels are the row and column labels
    respectively.

    Parameters
    ----------
    ss : Series
    row_levels : tuple/list
    column_levels : tuple/list
    sort_labels : bool, default False
        Sort the row and column labels before forming the sparse matrix.
        When `row_levels` and/or `column_levels` refer to a single level,
        set to `True` for a faster execution.

    Returns
    -------
    values : numpy.ndarray
        Valid values to populate a sparse matrix, extracted from
        ss.
    i_coords : numpy.ndarray (row coordinates of the values)
    j_coords : numpy.ndarray (column coordinates of the values)
    i_labels : list (row labels)
    j_labels : list (column labels)
    """
    # index and column levels must be a partition of the index
    _check_is_partition([row_levels, column_levels], range(ss.index.nlevels))
    # From the sparse Series, get the integer indices and data for valid sparse
    # entries.
    sp_vals = ss.array.sp_values
    # True where the stored value is not NA; NA entries are dropped.
    valid_mask = notna(sp_vals)
    values = sp_vals[valid_mask]
    valid_ilocs = ss.array.sp_index.indices[valid_mask]

    i_coords, i_labels = _levels_to_axis(
        ss, row_levels, valid_ilocs, sort_labels=sort_labels
    )

    j_coords, j_labels = _levels_to_axis(
        ss, column_levels, valid_ilocs, sort_labels=sort_labels
    )

    return values, i_coords, j_coords, i_labels, j_labels
|
||||
|
||||
|
||||
def sparse_series_to_coo(
    ss: Series,
    row_levels: Iterable[int] = (0,),
    column_levels: Iterable[int] = (1,),
    sort_labels: bool = False,
) -> tuple[scipy.sparse.coo_matrix, list[IndexLabel], list[IndexLabel]]:
    """
    Convert a sparse Series to a scipy.sparse.coo_matrix using index
    levels row_levels, column_levels as the row and column
    labels respectively. Returns the sparse_matrix, row and column labels.

    Raises
    ------
    ValueError
        If the index has fewer than two levels or contains duplicates.
    """
    import scipy.sparse

    if ss.index.nlevels < 2:
        raise ValueError("to_coo requires MultiIndex with nlevels >= 2.")
    if not ss.index.is_unique:
        raise ValueError(
            "Duplicate index entries are not allowed in to_coo transformation."
        )

    # to keep things simple, only rely on integer indexing (not labels)
    row_levels = [ss.index._get_level_number(x) for x in row_levels]
    column_levels = [ss.index._get_level_number(x) for x in column_levels]

    v, i, j, rows, columns = _to_ijv(
        ss, row_levels=row_levels, column_levels=column_levels, sort_labels=sort_labels
    )
    # The matrix shape comes from the number of distinct row/column labels.
    sparse_matrix = scipy.sparse.coo_matrix(
        (v, (i, j)), shape=(len(rows), len(columns))
    )
    return sparse_matrix, rows, columns
|
||||
|
||||
|
||||
def coo_to_sparse_series(
    A: scipy.sparse.coo_matrix, dense_index: bool = False
) -> Series:
    """
    Convert a scipy.sparse.coo_matrix to a Series with sparse values.

    Parameters
    ----------
    A : scipy.sparse.coo_matrix
    dense_index : bool, default False
        If True, reindex the result over the full product of row and
        column positions given by ``A.shape``.

    Returns
    -------
    Series

    Raises
    ------
    TypeError if A is not a coo_matrix
    """
    from pandas import SparseDtype

    try:
        ser = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
    except AttributeError as err:
        # Objects lacking .data/.row/.col are reported as the wrong type.
        raise TypeError(
            f"Expected coo_matrix. Got {type(A).__name__} instead."
        ) from err
    ser = ser.sort_index()
    ser = ser.astype(SparseDtype(ser.dtype))
    if dense_index:
        # is there a better constructor method to use here?
        i = range(A.shape[0])
        j = range(A.shape[1])
        ind = MultiIndex.from_product([i, j])
        ser = ser.reindex(ind)
    return ser
|
||||
Reference in New Issue
Block a user