针对pulse-transit的工具

This commit is contained in:
2025-02-22 16:12:02 +08:00
commit 6bc25b4e3a
7719 changed files with 1530886 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
# flake8: noqa: F401
from pandas.core.arrays.sparse.accessor import (
SparseAccessor,
SparseFrameAccessor,
)
from pandas.core.arrays.sparse.array import (
BlockIndex,
IntIndex,
SparseArray,
make_sparse_index,
)
from pandas.core.arrays.sparse.dtype import SparseDtype

View File

@@ -0,0 +1,386 @@
"""Sparse accessor"""
import numpy as np
from pandas.compat._optional import import_optional_dependency
from pandas.core.dtypes.cast import find_common_type
from pandas.core.accessor import (
PandasDelegate,
delegate_names,
)
from pandas.core.arrays.sparse.array import SparseArray
from pandas.core.arrays.sparse.dtype import SparseDtype
class BaseAccessor:
_validation_msg = "Can only use the '.sparse' accessor with Sparse data."
def __init__(self, data=None):
self._parent = data
self._validate(data)
def _validate(self, data):
raise NotImplementedError
@delegate_names(
SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
)
class SparseAccessor(BaseAccessor, PandasDelegate):
"""
Accessor for SparseSparse from other sparse matrix data types.
"""
def _validate(self, data):
if not isinstance(data.dtype, SparseDtype):
raise AttributeError(self._validation_msg)
def _delegate_property_get(self, name, *args, **kwargs):
return getattr(self._parent.array, name)
def _delegate_method(self, name, *args, **kwargs):
if name == "from_coo":
return self.from_coo(*args, **kwargs)
elif name == "to_coo":
return self.to_coo(*args, **kwargs)
else:
raise ValueError
@classmethod
def from_coo(cls, A, dense_index=False):
"""
Create a Series with sparse values from a scipy.sparse.coo_matrix.
Parameters
----------
A : scipy.sparse.coo_matrix
dense_index : bool, default False
If False (default), the SparseSeries index consists of only the
coords of the non-null entries of the original coo_matrix.
If True, the SparseSeries index consists of the full sorted
(row, col) coordinates of the coo_matrix.
Returns
-------
s : Series
A Series with sparse values.
Examples
--------
>>> from scipy import sparse
>>> A = sparse.coo_matrix(
... ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
... )
>>> A
<3x4 sparse matrix of type '<class 'numpy.float64'>'
with 3 stored elements in COOrdinate format>
>>> A.todense()
matrix([[0., 0., 1., 2.],
[3., 0., 0., 0.],
[0., 0., 0., 0.]])
>>> ss = pd.Series.sparse.from_coo(A)
>>> ss
0 2 1.0
3 2.0
1 0 3.0
dtype: Sparse[float64, nan]
"""
from pandas import Series
from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series
result = coo_to_sparse_series(A, dense_index=dense_index)
result = Series(result.array, index=result.index, copy=False)
return result
def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
"""
Create a scipy.sparse.coo_matrix from a Series with MultiIndex.
Use row_levels and column_levels to determine the row and column
coordinates respectively. row_levels and column_levels are the names
(labels) or numbers of the levels. {row_levels, column_levels} must be
a partition of the MultiIndex level names (or numbers).
Parameters
----------
row_levels : tuple/list
column_levels : tuple/list
sort_labels : bool, default False
Sort the row and column labels before forming the sparse matrix.
When `row_levels` and/or `column_levels` refer to a single level,
set to `True` for a faster execution.
Returns
-------
y : scipy.sparse.coo_matrix
rows : list (row labels)
columns : list (column labels)
Examples
--------
>>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
>>> s.index = pd.MultiIndex.from_tuples(
... [
... (1, 2, "a", 0),
... (1, 2, "a", 1),
... (1, 1, "b", 0),
... (1, 1, "b", 1),
... (2, 1, "b", 0),
... (2, 1, "b", 1)
... ],
... names=["A", "B", "C", "D"],
... )
>>> s
A B C D
1 2 a 0 3.0
1 NaN
1 b 0 1.0
1 3.0
2 1 b 0 NaN
1 NaN
dtype: float64
>>> ss = s.astype("Sparse")
>>> ss
A B C D
1 2 a 0 3.0
1 NaN
1 b 0 1.0
1 3.0
2 1 b 0 NaN
1 NaN
dtype: Sparse[float64, nan]
>>> A, rows, columns = ss.sparse.to_coo(
... row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
... )
>>> A
<3x4 sparse matrix of type '<class 'numpy.float64'>'
with 3 stored elements in COOrdinate format>
>>> A.todense()
matrix([[0., 0., 1., 3.],
[3., 0., 0., 0.],
[0., 0., 0., 0.]])
>>> rows
[(1, 1), (1, 2), (2, 1)]
>>> columns
[('a', 0), ('a', 1), ('b', 0), ('b', 1)]
"""
from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo
A, rows, columns = sparse_series_to_coo(
self._parent, row_levels, column_levels, sort_labels=sort_labels
)
return A, rows, columns
def to_dense(self):
"""
Convert a Series from sparse values to dense.
.. versionadded:: 0.25.0
Returns
-------
Series:
A Series with the same values, stored as a dense array.
Examples
--------
>>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))
>>> series
0 0
1 1
2 0
dtype: Sparse[int64, 0]
>>> series.sparse.to_dense()
0 0
1 1
2 0
dtype: int64
"""
from pandas import Series
return Series(
self._parent.array.to_dense(),
index=self._parent.index,
name=self._parent.name,
)
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
"""
DataFrame accessor for sparse data.
.. versionadded:: 0.25.0
"""
def _validate(self, data):
dtypes = data.dtypes
if not all(isinstance(t, SparseDtype) for t in dtypes):
raise AttributeError(self._validation_msg)
@classmethod
def from_spmatrix(cls, data, index=None, columns=None):
"""
Create a new DataFrame from a scipy sparse matrix.
.. versionadded:: 0.25.0
Parameters
----------
data : scipy.sparse.spmatrix
Must be convertible to csc format.
index, columns : Index, optional
Row and column labels to use for the resulting DataFrame.
Defaults to a RangeIndex.
Returns
-------
DataFrame
Each column of the DataFrame is stored as a
:class:`arrays.SparseArray`.
Examples
--------
>>> import scipy.sparse
>>> mat = scipy.sparse.eye(3)
>>> pd.DataFrame.sparse.from_spmatrix(mat)
0 1 2
0 1.0 0.0 0.0
1 0.0 1.0 0.0
2 0.0 0.0 1.0
"""
from pandas._libs.sparse import IntIndex
from pandas import DataFrame
data = data.tocsc()
index, columns = cls._prep_index(data, index, columns)
n_rows, n_columns = data.shape
# We need to make sure indices are sorted, as we create
# IntIndex with no input validation (i.e. check_integrity=False ).
# Indices may already be sorted in scipy in which case this adds
# a small overhead.
data.sort_indices()
indices = data.indices
indptr = data.indptr
array_data = data.data
dtype = SparseDtype(array_data.dtype, 0)
arrays = []
for i in range(n_columns):
sl = slice(indptr[i], indptr[i + 1])
idx = IntIndex(n_rows, indices[sl], check_integrity=False)
arr = SparseArray._simple_new(array_data[sl], idx, dtype)
arrays.append(arr)
return DataFrame._from_arrays(
arrays, columns=columns, index=index, verify_integrity=False
)
def to_dense(self):
"""
Convert a DataFrame with sparse values to dense.
.. versionadded:: 0.25.0
Returns
-------
DataFrame
A DataFrame with the same values stored as dense arrays.
Examples
--------
>>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})
>>> df.sparse.to_dense()
A
0 0
1 1
2 0
"""
from pandas import DataFrame
data = {k: v.array.to_dense() for k, v in self._parent.items()}
return DataFrame(data, index=self._parent.index, columns=self._parent.columns)
def to_coo(self):
"""
Return the contents of the frame as a sparse SciPy COO matrix.
.. versionadded:: 0.25.0
Returns
-------
coo_matrix : scipy.sparse.spmatrix
If the caller is heterogeneous and contains booleans or objects,
the result will be of dtype=object. See Notes.
Notes
-----
The dtype will be the lowest-common-denominator type (implicit
upcasting); that is to say if the dtypes (even of numeric types)
are mixed, the one that accommodates all will be chosen.
e.g. If the dtypes are float16 and float32, dtype will be upcast to
float32. By numpy.find_common_type convention, mixing int64 and
and uint64 will result in a float64 dtype.
"""
import_optional_dependency("scipy")
from scipy.sparse import coo_matrix
dtype = find_common_type(self._parent.dtypes.to_list())
if isinstance(dtype, SparseDtype):
dtype = dtype.subtype
cols, rows, data = [], [], []
for col, (_, ser) in enumerate(self._parent.iteritems()):
sp_arr = ser.array
if sp_arr.fill_value != 0:
raise ValueError("fill value must be 0 when converting to COO matrix")
row = sp_arr.sp_index.indices
cols.append(np.repeat(col, len(row)))
rows.append(row)
data.append(sp_arr.sp_values.astype(dtype, copy=False))
cols = np.concatenate(cols)
rows = np.concatenate(rows)
data = np.concatenate(data)
return coo_matrix((data, (rows, cols)), shape=self._parent.shape)
@property
def density(self) -> float:
"""
Ratio of non-sparse points to total (dense) data points.
"""
tmp = np.mean([column.array.density for _, column in self._parent.items()])
return tmp
@staticmethod
def _prep_index(data, index, columns):
from pandas.core.indexes.api import (
default_index,
ensure_index,
)
N, K = data.shape
if index is None:
index = default_index(N)
else:
index = ensure_index(index)
if columns is None:
columns = default_index(K)
else:
columns = ensure_index(columns)
if len(columns) != K:
raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")
if len(index) != N:
raise ValueError(f"Index length mismatch: {len(index)} vs. {N}")
return index, columns

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,414 @@
"""Sparse Dtype"""
from __future__ import annotations
import re
from typing import (
TYPE_CHECKING,
Any,
)
import warnings
import numpy as np
from pandas._typing import (
Dtype,
DtypeObj,
type_t,
)
from pandas.errors import PerformanceWarning
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.base import (
ExtensionDtype,
register_extension_dtype,
)
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.common import (
is_bool_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.missing import (
isna,
na_value_for_dtype,
)
if TYPE_CHECKING:
from pandas.core.arrays.sparse.array import SparseArray
@register_extension_dtype
class SparseDtype(ExtensionDtype):
"""
Dtype for data stored in :class:`SparseArray`.
This dtype implements the pandas ExtensionDtype interface.
Parameters
----------
dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
The dtype of the underlying array storing the non-fill value values.
fill_value : scalar, optional
The scalar value not stored in the SparseArray. By default, this
depends on `dtype`.
=========== ==========
dtype na_value
=========== ==========
float ``np.nan``
int ``0``
bool ``False``
datetime64 ``pd.NaT``
timedelta64 ``pd.NaT``
=========== ==========
The default value may be overridden by specifying a `fill_value`.
Attributes
----------
None
Methods
-------
None
"""
# We include `_is_na_fill_value` in the metadata to avoid hash collisions
# between SparseDtype(float, 0.0) and SparseDtype(float, nan).
# Without is_na_fill_value in the comparison, those would be equal since
# hash(nan) is (sometimes?) 0.
_metadata = ("_dtype", "_fill_value", "_is_na_fill_value")
def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None):
if isinstance(dtype, type(self)):
if fill_value is None:
fill_value = dtype.fill_value
dtype = dtype.subtype
dtype = pandas_dtype(dtype)
if is_string_dtype(dtype):
dtype = np.dtype("object")
if fill_value is None:
fill_value = na_value_for_dtype(dtype)
self._dtype = dtype
self._fill_value = fill_value
self._check_fill_value()
def __hash__(self):
# Python3 doesn't inherit __hash__ when a base class overrides
# __eq__, so we explicitly do it here.
return super().__hash__()
def __eq__(self, other: Any) -> bool:
# We have to override __eq__ to handle NA values in _metadata.
# The base class does simple == checks, which fail for NA.
if isinstance(other, str):
try:
other = self.construct_from_string(other)
except TypeError:
return False
if isinstance(other, type(self)):
subtype = self.subtype == other.subtype
if self._is_na_fill_value:
# this case is complicated by two things:
# SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
# SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT)
# i.e. we want to treat any floating-point NaN as equal, but
# not a floating-point NaN and a datetime NaT.
fill_value = (
other._is_na_fill_value
and isinstance(self.fill_value, type(other.fill_value))
or isinstance(other.fill_value, type(self.fill_value))
)
else:
fill_value = self.fill_value == other.fill_value
return subtype and fill_value
return False
@property
def fill_value(self):
"""
The fill value of the array.
Converting the SparseArray to a dense ndarray will fill the
array with this value.
.. warning::
It's possible to end up with a SparseArray that has ``fill_value``
values in ``sp_values``. This can occur, for example, when setting
``SparseArray.fill_value`` directly.
"""
return self._fill_value
def _check_fill_value(self):
if not is_scalar(self._fill_value):
raise ValueError(
f"fill_value must be a scalar. Got {self._fill_value} instead"
)
# TODO: Right now we can use Sparse boolean array
# with any fill_value. Here was an attempt
# to allow only 3 value: True, False or nan
# but plenty test has failed.
# see pull 44955
# if self._is_boolean and not (
# is_bool(self._fill_value) or isna(self._fill_value)
# ):
# raise ValueError(
# "fill_value must be True, False or nan "
# f"for boolean type. Got {self._fill_value} instead"
# )
@property
def _is_na_fill_value(self) -> bool:
return isna(self.fill_value)
@property
def _is_numeric(self) -> bool:
return not is_object_dtype(self.subtype)
@property
def _is_boolean(self) -> bool:
return is_bool_dtype(self.subtype)
@property
def kind(self):
"""
The sparse kind. Either 'integer', or 'block'.
"""
return self.subtype.kind
@property
def type(self):
return self.subtype.type
@property
def subtype(self):
return self._dtype
@property
def name(self):
return f"Sparse[{self.subtype.name}, {repr(self.fill_value)}]"
def __repr__(self) -> str:
return self.name
@classmethod
def construct_array_type(cls) -> type_t[SparseArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
from pandas.core.arrays.sparse.array import SparseArray
return SparseArray
@classmethod
def construct_from_string(cls, string: str) -> SparseDtype:
"""
Construct a SparseDtype from a string form.
Parameters
----------
string : str
Can take the following forms.
string dtype
================ ============================
'int' SparseDtype[np.int64, 0]
'Sparse' SparseDtype[np.float64, nan]
'Sparse[int]' SparseDtype[np.int64, 0]
'Sparse[int, 0]' SparseDtype[np.int64, 0]
================ ============================
It is not possible to specify non-default fill values
with a string. An argument like ``'Sparse[int, 1]'``
will raise a ``TypeError`` because the default fill value
for integers is 0.
Returns
-------
SparseDtype
"""
if not isinstance(string, str):
raise TypeError(
f"'construct_from_string' expects a string, got {type(string)}"
)
msg = f"Cannot construct a 'SparseDtype' from '{string}'"
if string.startswith("Sparse"):
try:
sub_type, has_fill_value = cls._parse_subtype(string)
except ValueError as err:
raise TypeError(msg) from err
else:
result = SparseDtype(sub_type)
msg = (
f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt "
"looks like the fill_value in the string is not "
"the default for the dtype. Non-default fill_values "
"are not supported. Use the 'SparseDtype()' "
"constructor instead."
)
if has_fill_value and str(result) != string:
raise TypeError(msg)
return result
else:
raise TypeError(msg)
@staticmethod
def _parse_subtype(dtype: str) -> tuple[str, bool]:
"""
Parse a string to get the subtype
Parameters
----------
dtype : str
A string like
* Sparse[subtype]
* Sparse[subtype, fill_value]
Returns
-------
subtype : str
Raises
------
ValueError
When the subtype cannot be extracted.
"""
xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$")
m = xpr.match(dtype)
has_fill_value = False
if m:
subtype = m.groupdict()["subtype"]
has_fill_value = bool(m.groupdict()["fill_value"])
elif dtype == "Sparse":
subtype = "float64"
else:
raise ValueError(f"Cannot parse {dtype}")
return subtype, has_fill_value
@classmethod
def is_dtype(cls, dtype: object) -> bool:
dtype = getattr(dtype, "dtype", dtype)
if isinstance(dtype, str) and dtype.startswith("Sparse"):
sub_type, _ = cls._parse_subtype(dtype)
dtype = np.dtype(sub_type)
elif isinstance(dtype, cls):
return True
return isinstance(dtype, np.dtype) or dtype == "Sparse"
def update_dtype(self, dtype) -> SparseDtype:
"""
Convert the SparseDtype to a new dtype.
This takes care of converting the ``fill_value``.
Parameters
----------
dtype : Union[str, numpy.dtype, SparseDtype]
The new dtype to use.
* For a SparseDtype, it is simply returned
* For a NumPy dtype (or str), the current fill value
is converted to the new dtype, and a SparseDtype
with `dtype` and the new fill value is returned.
Returns
-------
SparseDtype
A new SparseDtype with the correct `dtype` and fill value
for that `dtype`.
Raises
------
ValueError
When the current fill value cannot be converted to the
new `dtype` (e.g. trying to convert ``np.nan`` to an
integer dtype).
Examples
--------
>>> SparseDtype(int, 0).update_dtype(float)
Sparse[float64, 0.0]
>>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
Sparse[float64, nan]
"""
cls = type(self)
dtype = pandas_dtype(dtype)
if not isinstance(dtype, cls):
if not isinstance(dtype, np.dtype):
raise TypeError("sparse arrays of extension dtypes not supported")
fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
dtype = cls(dtype, fill_value=fill_value)
return dtype
@property
def _subtype_with_str(self):
"""
Whether the SparseDtype's subtype should be considered ``str``.
Typically, pandas will store string data in an object-dtype array.
When converting values to a dtype, e.g. in ``.astype``, we need to
be more specific, we need the actual underlying type.
Returns
-------
>>> SparseDtype(int, 1)._subtype_with_str
dtype('int64')
>>> SparseDtype(object, 1)._subtype_with_str
dtype('O')
>>> dtype = SparseDtype(str, '')
>>> dtype.subtype
dtype('O')
>>> dtype._subtype_with_str
<class 'str'>
"""
if isinstance(self.fill_value, str):
return type(self.fill_value)
return self.subtype
def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
# TODO for now only handle SparseDtypes and numpy dtypes => extend
# with other compatible extension dtypes
if any(
isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype)
for x in dtypes
):
return None
fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
fill_value = fill_values[0]
# np.nan isn't a singleton, so we may end up with multiple
# NaNs here, so we ignore the all NA case too.
if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
warnings.warn(
"Concatenating sparse arrays with multiple fill "
f"values: '{fill_values}'. Picking the first and "
"converting the rest.",
PerformanceWarning,
stacklevel=find_stack_level(),
)
np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)

View File

@@ -0,0 +1,211 @@
"""
Interaction with scipy.sparse matrices.
Currently only includes to_coo helpers.
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Iterable,
)
import numpy as np
from pandas._libs import lib
from pandas._typing import (
IndexLabel,
npt,
)
from pandas.core.dtypes.missing import notna
from pandas.core.algorithms import factorize
from pandas.core.indexes.api import MultiIndex
from pandas.core.series import Series
if TYPE_CHECKING:
import scipy.sparse
def _check_is_partition(parts: Iterable, whole: Iterable):
whole = set(whole)
parts = [set(x) for x in parts]
if set.intersection(*parts) != set():
raise ValueError("Is not a partition because intersection is not null.")
if set.union(*parts) != whole:
raise ValueError("Is not a partition because union is not the whole.")
def _levels_to_axis(
ss,
levels: tuple[int] | list[int],
valid_ilocs: npt.NDArray[np.intp],
sort_labels: bool = False,
) -> tuple[npt.NDArray[np.intp], list[IndexLabel]]:
"""
For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`,
where `ax_coords` are the coordinates along one of the two axes of the
destination sparse matrix, and `ax_labels` are the labels from `ss`' Index
which correspond to these coordinates.
Parameters
----------
ss : Series
levels : tuple/list
valid_ilocs : numpy.ndarray
Array of integer positions of valid values for the sparse matrix in ss.
sort_labels : bool, default False
Sort the axis labels before forming the sparse matrix. When `levels`
refers to a single level, set to True for a faster execution.
Returns
-------
ax_coords : numpy.ndarray (axis coordinates)
ax_labels : list (axis labels)
"""
# Since the labels are sorted in `Index.levels`, when we wish to sort and
# there is only one level of the MultiIndex for this axis, the desired
# output can be obtained in the following simpler, more efficient way.
if sort_labels and len(levels) == 1:
ax_coords = ss.index.codes[levels[0]][valid_ilocs]
ax_labels = ss.index.levels[levels[0]]
else:
levels_values = lib.fast_zip(
[ss.index.get_level_values(lvl).values for lvl in levels]
)
codes, ax_labels = factorize(levels_values, sort=sort_labels)
ax_coords = codes[valid_ilocs]
ax_labels = ax_labels.tolist()
return ax_coords, ax_labels
def _to_ijv(
ss,
row_levels: tuple[int] | list[int] = (0,),
column_levels: tuple[int] | list[int] = (1,),
sort_labels: bool = False,
) -> tuple[
np.ndarray,
npt.NDArray[np.intp],
npt.NDArray[np.intp],
list[IndexLabel],
list[IndexLabel],
]:
"""
For an arbitrary MultiIndexed sparse Series return (v, i, j, ilabels,
jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo
constructor, and ilabels and jlabels are the row and column labels
respectively.
Parameters
----------
ss : Series
row_levels : tuple/list
column_levels : tuple/list
sort_labels : bool, default False
Sort the row and column labels before forming the sparse matrix.
When `row_levels` and/or `column_levels` refer to a single level,
set to `True` for a faster execution.
Returns
-------
values : numpy.ndarray
Valid values to populate a sparse matrix, extracted from
ss.
i_coords : numpy.ndarray (row coordinates of the values)
j_coords : numpy.ndarray (column coordinates of the values)
i_labels : list (row labels)
j_labels : list (column labels)
"""
# index and column levels must be a partition of the index
_check_is_partition([row_levels, column_levels], range(ss.index.nlevels))
# From the sparse Series, get the integer indices and data for valid sparse
# entries.
sp_vals = ss.array.sp_values
na_mask = notna(sp_vals)
values = sp_vals[na_mask]
valid_ilocs = ss.array.sp_index.indices[na_mask]
i_coords, i_labels = _levels_to_axis(
ss, row_levels, valid_ilocs, sort_labels=sort_labels
)
j_coords, j_labels = _levels_to_axis(
ss, column_levels, valid_ilocs, sort_labels=sort_labels
)
return values, i_coords, j_coords, i_labels, j_labels
def sparse_series_to_coo(
ss: Series,
row_levels: Iterable[int] = (0,),
column_levels: Iterable[int] = (1,),
sort_labels: bool = False,
) -> tuple[scipy.sparse.coo_matrix, list[IndexLabel], list[IndexLabel]]:
"""
Convert a sparse Series to a scipy.sparse.coo_matrix using index
levels row_levels, column_levels as the row and column
labels respectively. Returns the sparse_matrix, row and column labels.
"""
import scipy.sparse
if ss.index.nlevels < 2:
raise ValueError("to_coo requires MultiIndex with nlevels >= 2.")
if not ss.index.is_unique:
raise ValueError(
"Duplicate index entries are not allowed in to_coo transformation."
)
# to keep things simple, only rely on integer indexing (not labels)
row_levels = [ss.index._get_level_number(x) for x in row_levels]
column_levels = [ss.index._get_level_number(x) for x in column_levels]
v, i, j, rows, columns = _to_ijv(
ss, row_levels=row_levels, column_levels=column_levels, sort_labels=sort_labels
)
sparse_matrix = scipy.sparse.coo_matrix(
(v, (i, j)), shape=(len(rows), len(columns))
)
return sparse_matrix, rows, columns
def coo_to_sparse_series(
A: scipy.sparse.coo_matrix, dense_index: bool = False
) -> Series:
"""
Convert a scipy.sparse.coo_matrix to a SparseSeries.
Parameters
----------
A : scipy.sparse.coo_matrix
dense_index : bool, default False
Returns
-------
Series
Raises
------
TypeError if A is not a coo_matrix
"""
from pandas import SparseDtype
try:
ser = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
except AttributeError as err:
raise TypeError(
f"Expected coo_matrix. Got {type(A).__name__} instead."
) from err
ser = ser.sort_index()
ser = ser.astype(SparseDtype(ser.dtype))
if dense_index:
# is there a better constructor method to use here?
i = range(A.shape[0])
j = range(A.shape[1])
ind = MultiIndex.from_product([i, j])
ser = ser.reindex(ind)
return ser