(feat): Support for scipy.sparse.{csr,csc}_array (#1028)
* Using faster random matrix generation in gen_adata

* Add csr_arrays to gen_adata

* Initial implementation

* Get concat working better with sparse_arrays

* Add sparray cases to helpers

* Add dask.tokenize for sparray classes, move to conftest

* Add sparray case to test_write_dispatched_chunks

* Add filled_with method for sparray for testing

* (chore): add test cases

* (feat): add compat wrappers for needed sparse array classes

* (feat): add `X` test cases

* (fix): merge test

* (feat): add dtype concatenation test

* (chore): add docs

* (feat): support backed sparse

* (fix): array and matrix typing in `append`

* (feat): add in mechanism for io, always false

* (fix): condition on memory/backed

* (chore): remove comment

* (fix): mock class repr

* (fix): subsetting

* (chore): subsetting tests

* (fix): writing

* (fix): concatenation matrix/array class usage

* (fix): spec reading

* (fix): condition for `coo_array` in setting

* (fix): index type

* (fix): `sparray` usage

* (fix): writing of arrays

* (fix): more writing

* (fix): raw tests

* (chore): add overflow check class

* (chore): add sparray tests

* (fix): only test `sparray` if possible in `test_creation`

* (fix): remove unnecessary compat class

* (chore): fix xfail for sparse array in dask

* (chore): type hints for subsetting sparse

* (fix): base class check in `test_concatenate_roundtrip`

* (fix): indexing

* (fix): xfail cases

* (chore): remove unnecessary `CAN_USE_SPARSE_ARRAY`

* (fix): `h5` indexing check

* (fix): more xfail expectations

* (fix): skip `csr_array` for `test_append_overflow_check`

* (fix): `isinstance` -> `issubclass`

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* (refactor): `maybe_add_sparse_array` for `gen_adata` instead of conditionals

* (fix): import `contextmanager` in helpers

* preferred fix for dealing with backed arrays

* (refactor): use `new_array_type.__name__` for type check

* (refactor): gpu fixes, using request id and skipping bad dtype

* (fix): add coverage for setting by sparse

* Fix typo

* Ignore benchmark files

* Remove dask sparray stuff

* Apply suggestions from code review

Revert changes to backed sparse class hierarchy

* Remove test case specializing on dask sparray

* simplify tests setting X

* normalize conditionals

---------

Co-authored-by: Ilan Gold <[email protected]>
Co-authored-by: Philipp A <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
4 people authored Apr 23, 2024
1 parent 0a768fc commit b7ca381
Showing 22 changed files with 481 additions and 127 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -34,3 +34,4 @@ __pycache__/
.asv
benchmark/benchmarks/data
benchmarks/benchmarks/data
benchmarks/pkgs
1 change: 1 addition & 0 deletions docs/release-notes/0.11.0.md
@@ -4,6 +4,7 @@
```
* Add `settings` object with methods for altering internally-used options, like checking for uniqueness on `obs`' index {pr}`1270` {user}`ilan-gold`
* Add `remove_unused_categories` option to `anndata.settings` to override current behavior. Default is `True` (i.e., previous behavior). Please refer to the [documentation](https://anndata.readthedocs.io/en/latest/generated/anndata.settings.html) for usage. {pr}`1340` {user}`ilan-gold`
* `scipy.sparse.csr_array` and `scipy.sparse.csc_array` are now supported when constructing `AnnData` objects {pr}`1028` {user}`ilan-gold` {user}`isaac-virshup`

```{rubric} Bugfix
```
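
A quick sketch of the headline change described in the release note above (hedged: illustration only, using the constructor behavior this PR adds; `csr_array`/`csc_array` require scipy >= 1.8):

```python
import scipy.sparse as sparse
from anndata import AnnData

# csr_array is the sparse-*array* counterpart of csr_matrix.
X = sparse.csr_array(sparse.random(100, 50, density=0.1, format="csr"))

adata = AnnData(X)  # accepted as a storage type after this PR
assert isinstance(adata.X, sparse.csr_array)  # the container type is kept
```
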
19 changes: 14 additions & 5 deletions src/anndata/_core/anndata.py
@@ -34,6 +34,7 @@
CupyArray,
CupySparseMatrix,
DaskArray,
SpArray,
ZappyArray,
ZarrArray,
_move_adj_mtx,
@@ -76,6 +77,7 @@ class StorageType(Enum):
CupyArray = CupyArray
CupySparseMatrix = CupySparseMatrix
BackedSparseMatrix = BaseCompressedSparseDataset
SparseArray = SpArray

@classmethod
def classes(cls):
@@ -511,7 +513,7 @@ def cs_to_bytes(X) -> int:
return int(np.array(X.shape).prod() * X.dtype.itemsize)
elif isinstance(X, BaseCompressedSparseDataset) and with_disk:
return cs_to_bytes(X._to_backed())
elif isinstance(X, (sparse.csr_matrix, sparse.csc_matrix)):
elif issparse(X):
return cs_to_bytes(X)
else:
return X.__sizeof__()
@@ -574,7 +576,7 @@ def shape(self) -> tuple[int, int]:
return self.n_obs, self.n_vars

@property
def X(self) -> np.ndarray | sparse.spmatrix | ArrayView | None:
def X(self) -> np.ndarray | sparse.spmatrix | SpArray | ArrayView | None:
"""Data matrix of shape :attr:`n_obs` × :attr:`n_vars`."""
if self.isbacked:
if not self.file.is_open:
@@ -605,7 +607,7 @@ def X(self) -> np.ndarray | sparse.spmatrix | ArrayView | None:
# return X

@X.setter
def X(self, value: np.ndarray | sparse.spmatrix | None):
def X(self, value: np.ndarray | sparse.spmatrix | SpArray | None):
if value is None:
if self.isbacked:
raise NotImplementedError(
@@ -655,7 +657,11 @@ def X(self, value: np.ndarray | sparse.spmatrix | None):
if sparse.issparse(self._adata_ref._X) and isinstance(
value, np.ndarray
):
value = sparse.coo_matrix(value)
if isinstance(self._adata_ref.X, SpArray):
memory_class = sparse.coo_array
else:
memory_class = sparse.coo_matrix
value = memory_class(value)
self._adata_ref._X[oidx, vidx] = value
else:
self._X = value
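
The setter hunk above also keeps views consistent: assigning a dense block to a view whose parent `X` is a sparse array now round-trips through `coo_array` rather than `coo_matrix`. A hedged sketch (warning and sparse-efficiency details aside):

```python
import numpy as np
import scipy.sparse as sparse
from anndata import AnnData

adata = AnnData(sparse.csr_array(sparse.random(10, 4, density=0.5, format="csr")))
view = adata[:3, :2]

# The dense value is converted with sparse.coo_array (not coo_matrix)
# before being written back, so the parent keeps its container type.
view.X = np.ones((3, 2))
assert isinstance(adata.X, sparse.csr_array)
```
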
@@ -1773,8 +1779,11 @@ def concatenate(

# Backwards compat (some of this could be more efficient)
# obs used to always be an outer join
sparse_class = sparse.csr_matrix
if any(isinstance(a.X, SpArray) for a in all_adatas):
sparse_class = sparse.csr_array
out.obs = concat(
[AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
[AnnData(sparse_class(a.shape), obs=a.obs) for a in all_adatas],
axis=0,
join="outer",
label=batch_key,
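
Behaviorally, the `concatenate` hunk means mixed matrix/array inputs promote the legacy path's placeholders to the array API. A hedged sketch (the exact class of the result is an implementation detail):

```python
import scipy.sparse as sparse
import anndata as ad

a = ad.AnnData(sparse.csr_array(sparse.random(5, 3, density=0.3, format="csr")))
b = ad.AnnData(sparse.csr_matrix(sparse.random(4, 3, density=0.3, format="csr")))

# At least one X is a sparse array, so the deprecated-but-supported
# concatenate() builds its shape placeholders with sparse.csr_array.
merged = a.concatenate(b)
print(type(merged.X))  # a csr container with 9 obs x 3 vars
```
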
15 changes: 9 additions & 6 deletions src/anndata/_core/index.py
@@ -8,9 +8,9 @@
import h5py
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix, issparse, spmatrix
from scipy.sparse import issparse, spmatrix

from ..compat import AwkArray, DaskArray, Index, Index1D
from ..compat import AwkArray, DaskArray, Index, Index1D, SpArray


def _normalize_indices(
@@ -71,12 +71,14 @@ def name_idx(i):
return indexer
elif isinstance(indexer, str):
return index.get_loc(indexer) # int
elif isinstance(indexer, (Sequence, np.ndarray, pd.Index, spmatrix, np.matrix)):
elif isinstance(
indexer, (Sequence, np.ndarray, pd.Index, spmatrix, np.matrix, SpArray)
):
if hasattr(indexer, "shape") and (
(indexer.shape == (index.shape[0], 1))
or (indexer.shape == (1, index.shape[0]))
):
if isinstance(indexer, spmatrix):
if isinstance(indexer, (spmatrix, SpArray)):
indexer = indexer.toarray()
indexer = np.ravel(indexer)
if not isinstance(indexer, (np.ndarray, pd.Index)):
@@ -148,14 +150,15 @@ def _subset(a: np.ndarray | pd.DataFrame, subset_idx: Index):
@_subset.register(DaskArray)
def _subset_dask(a: DaskArray, subset_idx: Index):
if len(subset_idx) > 1 and all(isinstance(x, cabc.Iterable) for x in subset_idx):
if isinstance(a._meta, csc_matrix):
if issparse(a._meta) and a._meta.format == "csc":
return a[:, subset_idx[1]][subset_idx[0], :]
return a[subset_idx[0], :][:, subset_idx[1]]
return a[subset_idx]


@_subset.register(spmatrix)
def _subset_spmatrix(a: spmatrix, subset_idx: Index):
@_subset.register(SpArray)
def _subset_sparse(a: spmatrix | SpArray, subset_idx: Index):
# Correcting for indexing behaviour of sparse.spmatrix
if len(subset_idx) > 1 and all(isinstance(x, cabc.Iterable) for x in subset_idx):
first_idx = subset_idx[0]
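
For context on the `_subset_sparse` registration above: like `spmatrix`, the sparse-array classes pair row and column indexers element-wise (NumPy coordinate style), whereas AnnData subsetting needs the outer product, hence the two-step `a[first_idx][:, second_idx]`. A standalone sketch:

```python
import numpy as np
import scipy.sparse as sparse

a = sparse.csr_array(np.arange(16).reshape(4, 4))
rows, cols = np.array([0, 2]), np.array([1, 3])

# Coordinate-style indexing selects just the elements (0, 1) and (2, 3)...
paired = a[rows, cols]

# ...while subsetting wants the full 2x2 block of rows x cols:
block = a[rows][:, cols]
assert block.shape == (2, 2)
```
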
53 changes: 37 additions & 16 deletions src/anndata/_core/merge.py
@@ -29,11 +29,13 @@
from anndata._warnings import ExperimentalFeatureWarning

from ..compat import (
CAN_USE_SPARSE_ARRAY,
AwkArray,
CupyArray,
CupyCSRMatrix,
CupySparseMatrix,
DaskArray,
SpArray,
_map_cat_to_str,
)
from ..utils import asarray, dim_len, warn_once
@@ -164,14 +166,15 @@ def equal_series(a, b) -> bool:


@equal.register(sparse.spmatrix)
@equal.register(SpArray)
@equal.register(CupySparseMatrix)
def equal_sparse(a, b) -> bool:
# It's a weird api, don't blame me
import array_api_compat

xp = array_api_compat.array_namespace(a.data)

if isinstance(b, (CupySparseMatrix, sparse.spmatrix)):
if isinstance(b, (CupySparseMatrix, sparse.spmatrix, SpArray)):
if isinstance(a, CupySparseMatrix):
# Comparison broken for CSC matrices
# https://github.com/cupy/cupy/issues/7757
@@ -202,8 +205,10 @@ def equal_awkward(a, b) -> bool:
return ak.almost_equal(a, b)


def as_sparse(x):
if not isinstance(x, sparse.spmatrix):
def as_sparse(x, use_sparse_array=False):
if not isinstance(x, (sparse.spmatrix, SpArray)):
if CAN_USE_SPARSE_ARRAY and use_sparse_array:
return sparse.csr_array(x)
return sparse.csr_matrix(x)
else:
return x
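
Aside: the crux of much of this diff is that the new classes do not subclass `spmatrix`, which is why `as_sparse` needs an explicit flag and why `SpArray` is added to so many `isinstance` checks. A minimal sketch:

```python
import numpy as np
import scipy.sparse as sparse

dense = np.eye(3)
m = sparse.csr_matrix(dense)  # use_sparse_array=False (default) path
a = sparse.csr_array(dense)   # use_sparse_array=True path

# csr_array is *not* an spmatrix, so spmatrix-only isinstance checks miss it;
# sparse.issparse() covers both families.
assert isinstance(m, sparse.spmatrix) and not isinstance(a, sparse.spmatrix)
assert sparse.issparse(a)
```
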
@@ -531,16 +536,14 @@ def apply(self, el, *, axis, fill_value=None):
return el
if isinstance(el, pd.DataFrame):
return self._apply_to_df(el, axis=axis, fill_value=fill_value)
elif isinstance(el, sparse.spmatrix):
elif isinstance(el, (sparse.spmatrix, SpArray, CupySparseMatrix)):
return self._apply_to_sparse(el, axis=axis, fill_value=fill_value)
elif isinstance(el, AwkArray):
return self._apply_to_awkward(el, axis=axis, fill_value=fill_value)
elif isinstance(el, DaskArray):
return self._apply_to_dask_array(el, axis=axis, fill_value=fill_value)
elif isinstance(el, CupyArray):
return self._apply_to_cupy_array(el, axis=axis, fill_value=fill_value)
elif isinstance(el, CupySparseMatrix):
return self._apply_to_sparse(el, axis=axis, fill_value=fill_value)
else:
return self._apply_to_array(el, axis=axis, fill_value=fill_value)

@@ -610,7 +613,9 @@ def _apply_to_array(self, el, *, axis, fill_value=None):
el, indexer, axis=axis, allow_fill=True, fill_value=fill_value
)

def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix:
def _apply_to_sparse(
self, el: sparse.spmatrix | SpArray, *, axis, fill_value=None
) -> spmatrix:
if isinstance(el, CupySparseMatrix):
from cupyx.scipy import sparse
else:
@@ -632,7 +637,11 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix:
shape[axis] = len(self.new_idx)
shape = tuple(shape)
if fill_value == 0:
return sparse.csr_matrix(shape)
if isinstance(el, SpArray):
memory_class = sparse.csr_array
else:
memory_class = sparse.csr_matrix
return memory_class(shape)
else:
return type(el)(xp.broadcast_to(xp.asarray(fill_value), shape))

@@ -642,9 +651,12 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix:
idxmtx_dtype = xp.promote_types(el.dtype, xp.array(fill_value).dtype)
else:
idxmtx_dtype = bool

if isinstance(el, SpArray):
memory_class = sparse.coo_array
else:
memory_class = sparse.coo_matrix
if axis == 1:
idxmtx = sparse.coo_matrix(
idxmtx = memory_class(
(
xp.ones(len(self.new_pos), dtype=idxmtx_dtype),
(xp.asarray(self.old_pos), xp.asarray(self.new_pos)),
@@ -658,7 +670,7 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix:
out = out.tocsc()
fill_idxer = (slice(None), to_fill)
elif axis == 0:
idxmtx = sparse.coo_matrix(
idxmtx = memory_class(
(
xp.ones(len(self.new_pos), dtype=idxmtx_dtype),
(xp.asarray(self.new_pos), xp.asarray(self.old_pos)),
@@ -710,7 +722,7 @@ def default_fill_value(els):
This is largely due to backwards compat, and might not be the ideal solution.
"""
if any(isinstance(el, sparse.spmatrix) for el in els):
if any(isinstance(el, (sparse.spmatrix, SpArray)) for el in els):
return 0
else:
return np.nan
@@ -808,11 +820,16 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None):
],
axis=axis,
)
elif any(isinstance(a, sparse.spmatrix) for a in arrays):
elif any(isinstance(a, (sparse.spmatrix, SpArray)) for a in arrays):
sparse_stack = (sparse.vstack, sparse.hstack)[axis]
use_sparse_array = any(issubclass(type(a), SpArray) for a in arrays)
return sparse_stack(
[
f(as_sparse(a), axis=1 - axis, fill_value=fill_value)
f(
as_sparse(a, use_sparse_array=use_sparse_array),
axis=1 - axis,
fill_value=fill_value,
)
for f, a in zip(reindexers, arrays)
],
format="csr",
@@ -953,10 +970,14 @@ def concat_pairwise_mapping(
mappings: Collection[Mapping], shapes: Collection[int], join_keys=intersect_keys
):
result = {}
if any(any(isinstance(v, SpArray) for v in m.values()) for m in mappings):
sparse_class = sparse.csr_array
else:
sparse_class = sparse.csr_matrix

for k in join_keys(mappings):
els = [
m.get(k, sparse.csr_matrix((s, s), dtype=bool))
for m, s in zip(mappings, shapes)
m.get(k, sparse_class((s, s), dtype=bool)) for m, s in zip(mappings, shapes)
]
if all(isinstance(el, (CupySparseMatrix, CupyArray)) for el in els):
result[k] = _cp_block_diag(els, format="csr")
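
The `_apply_to_sparse` reindexer in this file works by multiplying with a sparse 0/1 indexing matrix; with sparse-array inputs it now builds that matrix as `coo_array` so the product stays in the array family. A hedged, standalone sketch of the idea (the real code also promotes dtypes and handles fill values):

```python
import numpy as np
import scipy.sparse as sparse

el = sparse.csr_array(np.arange(6).reshape(2, 3))
old_pos, new_pos = np.array([0, 2]), np.array([0, 1])  # keep columns 0 and 2

# 0/1 indexing matrix of shape (n_old_cols, n_new_cols), ones at (old, new).
idxmtx = sparse.coo_array(
    (np.ones(len(new_pos), dtype=el.dtype), (old_pos, new_pos)),
    shape=(el.shape[1], len(new_pos)),
)
out = el @ idxmtx  # reindexed along axis 1, still a sparse array
assert out.shape == (2, 2)
```
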
31 changes: 15 additions & 16 deletions src/anndata/_core/sparse_dataset.py
@@ -29,7 +29,7 @@
from anndata._core.index import _fix_slice_bounds
from anndata.compat import H5Group, ZarrArray, ZarrGroup

from ..compat import _read_attr
from ..compat import SpArray, _read_attr

try:
# Not really important, just for IDEs to be more helpful
@@ -302,24 +302,23 @@ def mean_slice_length(slices):
return [], [], [0]


def get_format(data: ss.spmatrix) -> str:
for fmt, _, memory_class in FORMATS:
if isinstance(data, memory_class):
return fmt
raise ValueError(f"Data type {type(data)} is not supported.")


def get_memory_class(format: str) -> type[ss.spmatrix]:
def get_memory_class(format: str, use_sparray_in_io=False) -> type[ss.spmatrix]:
for fmt, _, memory_class in FORMATS:
if format == fmt:
return memory_class
if use_sparray_in_io and issubclass(memory_class, SpArray):
return memory_class
elif not use_sparray_in_io and issubclass(memory_class, ss.spmatrix):
return memory_class
raise ValueError(f"Format string {format} is not supported.")


def get_backed_class(format: str) -> type[BackedSparseMatrix]:
def get_backed_class(format: str, use_sparray_in_io=False) -> type[BackedSparseMatrix]:
for fmt, backed_class, _ in FORMATS:
if format == fmt:
return backed_class
if use_sparray_in_io and issubclass(backed_class, SpArray):
return backed_class
elif not use_sparray_in_io and issubclass(backed_class, ss.spmatrix):
return backed_class
raise ValueError(f"Format string {format} is not supported.")


@@ -471,14 +470,14 @@ def __setitem__(self, index: Index | tuple[()], value):
mock_matrix[row, col] = value

# TODO: split to other classes?
def append(self, sparse_matrix: ss.spmatrix):
def append(self, sparse_matrix: ss.spmatrix | SpArray):
# Prep variables
shape = self.shape
if isinstance(sparse_matrix, BaseCompressedSparseDataset):
sparse_matrix = sparse_matrix._to_backed()

# Check input
if not ss.isspmatrix(sparse_matrix):
if not ss.issparse(sparse_matrix):
raise NotImplementedError(
"Currently, only sparse matrices of equivalent format can be "
"appended to a SparseDataset."
@@ -487,10 +486,10 @@ def append(self, sparse_matrix: ss.spmatrix):
raise NotImplementedError(
f"The append method for format {self.format} " f"is not implemented."
)
if self.format != get_format(sparse_matrix):
if self.format != sparse_matrix.format:
raise ValueError(
f"Matrices must have same format. Currently are "
f"{self.format!r} and {get_format(sparse_matrix)!r}"
f"{self.format!r} and {sparse_matrix.format!r}"
)
indptr_offset = len(self.group["indices"])
if self.group["indptr"].dtype == np.int32:
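
The `append` hunk above accepts both matrix and array inputs and compares formats via the object's own `.format` attribute instead of the removed `get_format` helper. A hedged sketch of appending a `csr_array` to an on-disk dataset (assuming the `anndata.experimental` import paths of this era, and that `write_elem` creates the resizable datasets `append` needs, as the PR's tests exercise):

```python
import h5py
import scipy.sparse as sparse
from anndata.experimental import sparse_dataset, write_elem

new_rows = sparse.csr_array(sparse.random(3, 5, density=0.2, format="csr"))

with h5py.File("demo.h5ad", "w") as f:
    write_elem(f, "X", sparse.csr_array(sparse.random(10, 5, density=0.2, format="csr")))
    dset = sparse_dataset(f["X"])
    # The format check now reads sparse_matrix.format directly, so arrays
    # and matrices of the same format are both appendable.
    dset.append(new_rows)
    assert dset.shape == (13, 5)
```
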
[Diffs for the remaining changed files not shown.]
