(feat): Support for scipy.sparse.{csr,csc}_array (#1028)
* Using faster random matrix generation in gen_adata

* Add csr_arrays to gen_adata

* Initial implementation

* Get concat working better with sparse_arrays

* Add sparray cases to helpers

* Add dask.tokenize for sparray classes, move to conftest

* Add sparray case to test_write_dispatched_chunks

* Add filled_with method for sparray for testing

* (chore): add test cases

* (feat): add compat wrappers for needed sparse array classes

* (feat): add `X` test cases

* (fix): merge test

* (feat): add dtype concatenation test

* (chore): add docs

* (feat): support backed sparse

* (fix): array and matrix typing in `append`

* (feat): add in mechanism for io, always false

* (fix): condition on memory/backed

* (chore): remove comment

* (fix): mock class repr

* (fix): subsetting

* (chore): subsetting tests

* (fix): writing

* (fix): concatenation matrix/array class usage

* (fix): spec reading

* (fix): condition for `coo_array` in setting

* (fix): index type

* (fix): `sparray` usage

* (fix): writing of arrays

* (fix): more writing

* (fix): raw tests

* (chore): add overflow check class

* (chore): add sparray tests

* (fix): only test `sparray` if possible in `test_creation`

* (fix): remove unnecessary compat class

* (chore): fix xfail for sparse array in dask

* (chore): type hints for subsetting sparse

* (fix): base class check in `test_concatenate_roundtrip`

* (fix): indexing

* (fix): xfail cases

* (chore): remove unnecessary `CAN_USE_SPARSE_ARRAY`

* (fix): `h5` indexing check

* (fix): more xfail expectations

* (fix): skip `csr_array` for `test_append_overflow_check`

* (fix): `isinstance` -> `issubclass`

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* (refactor): `maybe_add_sparse_array` for `gen_adata` instead of conditionals

* (fix): import `contextmanager` in helpers

* preferred fix for dealing with backed arrays

* (refactor): use `new_array_type.__name__` for type check

* (refactor): gpu fixes, using request id and skipping bad dtype

* (fix): add coverage for setting by sparse

* Fix typo

* Ignore benchmark files

* Remove dask sparray stuff

* Apply suggestions from code review

Revert changes to backed sparse class hierarchy

* Remove test case specializing on dask sparray

* simplify tests setting X

* normalize conditionals

---------

Co-authored-by: Ilan Gold <[email protected]>
Co-authored-by: Philipp A <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
4 people authored Apr 23, 2024
1 parent 0a768fc commit b7ca381
Showing 22 changed files with 481 additions and 127 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -34,3 +34,4 @@ __pycache__/
.asv
benchmark/benchmarks/data
benchmarks/benchmarks/data
benchmarks/pkgs
1 change: 1 addition & 0 deletions docs/release-notes/0.11.0.md
@@ -4,6 +4,7 @@
```
* Add `settings` object with methods for altering internally-used options, like checking for uniqueness on `obs`' index {pr}`1270` {user}`ilan-gold`
* Add `remove_unused_categories` option to `anndata.settings` to override current behavior. Default is `True` (i.e., previous behavior). Please refer to the [documentation](https://anndata.readthedocs.io/en/latest/generated/anndata.settings.html) for usage. {pr}`1340` {user}`ilan-gold`
* `scipy.sparse.csr_array` and `scipy.sparse.csc_array` are now supported when constructing `AnnData` objects {pr}`1028` {user}`ilan-gold` {user}`isaac-virshup`

```{rubric} Bugfix
```
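
A quick sketch of the headline change described in the release note above (hedged: illustration only, using the constructor behavior this PR adds; `csr_array`/`csc_array` require scipy >= 1.8):

```python
import scipy.sparse as sparse
from anndata import AnnData

# csr_array is the sparse-*array* counterpart of csr_matrix.
X = sparse.csr_array(sparse.random(100, 50, density=0.1, format="csr"))

adata = AnnData(X)  # accepted as a storage type after this PR
assert isinstance(adata.X, sparse.csr_array)  # the container type is kept
```
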
19 changes: 14 additions & 5 deletions src/anndata/_core/anndata.py
@@ -34,6 +34,7 @@
CupyArray,
CupySparseMatrix,
DaskArray,
SpArray,
ZappyArray,
ZarrArray,
_move_adj_mtx,
@@ -76,6 +77,7 @@ class StorageType(Enum):
CupyArray = CupyArray
CupySparseMatrix = CupySparseMatrix
BackedSparseMatrix = BaseCompressedSparseDataset
SparseArray = SpArray

@classmethod
def classes(cls):
@@ -511,7 +513,7 @@ def cs_to_bytes(X) -> int:
return int(np.array(X.shape).prod() * X.dtype.itemsize)
elif isinstance(X, BaseCompressedSparseDataset) and with_disk:
return cs_to_bytes(X._to_backed())
elif isinstance(X, (sparse.csr_matrix, sparse.csc_matrix)):
elif issparse(X):
return cs_to_bytes(X)
else:
return X.__sizeof__()
@@ -574,7 +576,7 @@ def shape(self) -> tuple[int, int]:
return self.n_obs, self.n_vars

@property
def X(self) -> np.ndarray | sparse.spmatrix | ArrayView | None:
def X(self) -> np.ndarray | sparse.spmatrix | SpArray | ArrayView | None:
"""Data matrix of shape :attr:`n_obs` × :attr:`n_vars`."""
if self.isbacked:
if not self.file.is_open:
@@ -605,7 +607,7 @@ def X(self) -> np.ndarray | sparse.spmatrix | ArrayView | None:
# return X

@X.setter
def X(self, value: np.ndarray | sparse.spmatrix | None):
def X(self, value: np.ndarray | sparse.spmatrix | SpArray | None):
if value is None:
if self.isbacked:
raise NotImplementedError(
@@ -655,7 +657,11 @@ def X(self, value: np.ndarray | sparse.spmatrix | None):
if sparse.issparse(self._adata_ref._X) and isinstance(
value, np.ndarray
):
value = sparse.coo_matrix(value)
if isinstance(self._adata_ref.X, SpArray):
memory_class = sparse.coo_array
else:
memory_class = sparse.coo_matrix
value = memory_class(value)
self._adata_ref._X[oidx, vidx] = value
else:
self._X = value
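
The setter hunk above also keeps views consistent: assigning a dense block to a view whose parent `X` is a sparse array now round-trips through `coo_array` rather than `coo_matrix`. A hedged sketch (warning and sparse-efficiency details aside):

```python
import numpy as np
import scipy.sparse as sparse
from anndata import AnnData

adata = AnnData(sparse.csr_array(sparse.random(10, 4, density=0.5, format="csr")))
view = adata[:3, :2]

# The dense value is converted with sparse.coo_array (not coo_matrix)
# before being written back, so the parent keeps its container type.
view.X = np.ones((3, 2))
assert isinstance(adata.X, sparse.csr_array)
```
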
@@ -1773,8 +1779,11 @@ def concatenate(

# Backwards compat (some of this could be more efficient)
# obs used to always be an outer join
sparse_class = sparse.csr_matrix
if any(isinstance(a.X, SpArray) for a in all_adatas):
sparse_class = sparse.csr_array
out.obs = concat(
[AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
[AnnData(sparse_class(a.shape), obs=a.obs) for a in all_adatas],
axis=0,
join="outer",
label=batch_key,
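
Behaviorally, the `concatenate` hunk means mixed matrix/array inputs promote the legacy path's placeholders to the array API. A hedged sketch (the exact class of the result is an implementation detail):

```python
import scipy.sparse as sparse
import anndata as ad

a = ad.AnnData(sparse.csr_array(sparse.random(5, 3, density=0.3, format="csr")))
b = ad.AnnData(sparse.csr_matrix(sparse.random(4, 3, density=0.3, format="csr")))

# At least one X is a sparse array, so the deprecated-but-supported
# concatenate() builds its shape placeholders with sparse.csr_array.
merged = a.concatenate(b)
print(type(merged.X))  # a csr container with 9 obs x 3 vars
```
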
15 changes: 9 additions & 6 deletions src/anndata/_core/index.py
@@ -8,9 +8,9 @@
import h5py
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix, issparse, spmatrix
from scipy.sparse import issparse, spmatrix

from ..compat import AwkArray, DaskArray, Index, Index1D
from ..compat import AwkArray, DaskArray, Index, Index1D, SpArray


def _normalize_indices(
@@ -71,12 +71,14 @@ def name_idx(i):
return indexer
elif isinstance(indexer, str):
return index.get_loc(indexer) # int
elif isinstance(indexer, (Sequence, np.ndarray, pd.Index, spmatrix, np.matrix)):
elif isinstance(
indexer, (Sequence, np.ndarray, pd.Index, spmatrix, np.matrix, SpArray)
):
if hasattr(indexer, "shape") and (
(indexer.shape == (index.shape[0], 1))
or (indexer.shape == (1, index.shape[0]))
):
if isinstance(indexer, spmatrix):
if isinstance(indexer, (spmatrix, SpArray)):
indexer = indexer.toarray()
indexer = np.ravel(indexer)
if not isinstance(indexer, (np.ndarray, pd.Index)):
@@ -148,14 +150,15 @@ def _subset(a: np.ndarray | pd.DataFrame, subset_idx: Index):
@_subset.register(DaskArray)
def _subset_dask(a: DaskArray, subset_idx: Index):
if len(subset_idx) > 1 and all(isinstance(x, cabc.Iterable) for x in subset_idx):
if isinstance(a._meta, csc_matrix):
if issparse(a._meta) and a._meta.format == "csc":
return a[:, subset_idx[1]][subset_idx[0], :]
return a[subset_idx[0], :][:, subset_idx[1]]
return a[subset_idx]


@_subset.register(spmatrix)
def _subset_spmatrix(a: spmatrix, subset_idx: Index):
@_subset.register(SpArray)
def _subset_sparse(a: spmatrix | SpArray, subset_idx: Index):
# Correcting for indexing behaviour of sparse.spmatrix
if len(subset_idx) > 1 and all(isinstance(x, cabc.Iterable) for x in subset_idx):
first_idx = subset_idx[0]
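
For context on the `_subset_sparse` registration above: like `spmatrix`, the sparse-array classes pair row and column indexers element-wise (NumPy coordinate style), whereas AnnData subsetting needs the outer product, hence the two-step `a[first_idx][:, second_idx]`. A standalone sketch:

```python
import numpy as np
import scipy.sparse as sparse

a = sparse.csr_array(np.arange(16).reshape(4, 4))
rows, cols = np.array([0, 2]), np.array([1, 3])

# Coordinate-style indexing selects just the elements (0, 1) and (2, 3)...
paired = a[rows, cols]

# ...while subsetting wants the full 2x2 block of rows x cols:
block = a[rows][:, cols]
assert block.shape == (2, 2)
```
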
53 changes: 37 additions & 16 deletions src/anndata/_core/merge.py
@@ -29,11 +29,13 @@
from anndata._warnings import ExperimentalFeatureWarning

from ..compat import (
CAN_USE_SPARSE_ARRAY,
AwkArray,
CupyArray,
CupyCSRMatrix,
CupySparseMatrix,
DaskArray,
SpArray,
_map_cat_to_str,
)
from ..utils import asarray, dim_len, warn_once
@@ -164,14 +166,15 @@ def equal_series(a, b) -> bool:


@equal.register(sparse.spmatrix)
@equal.register(SpArray)
@equal.register(CupySparseMatrix)
def equal_sparse(a, b) -> bool:
# It's a weird api, don't blame me
import array_api_compat

xp = array_api_compat.array_namespace(a.data)

if isinstance(b, (CupySparseMatrix, sparse.spmatrix)):
if isinstance(b, (CupySparseMatrix, sparse.spmatrix, SpArray)):
if isinstance(a, CupySparseMatrix):
# Comparison broken for CSC matrices
# https://github.com/cupy/cupy/issues/7757
@@ -202,8 +205,10 @@ def equal_awkward(a, b) -> bool:
return ak.almost_equal(a, b)


def as_sparse(x):
if not isinstance(x, sparse.spmatrix):
def as_sparse(x, use_sparse_array=False):
if not isinstance(x, (sparse.spmatrix, SpArray)):
if CAN_USE_SPARSE_ARRAY and use_sparse_array:
return sparse.csr_array(x)
return sparse.csr_matrix(x)
else:
return x
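
Aside: the crux of much of this diff is that the new classes do not subclass `spmatrix`, which is why `as_sparse` needs an explicit flag and why `SpArray` is added to so many `isinstance` checks. A minimal sketch:

```python
import numpy as np
import scipy.sparse as sparse

dense = np.eye(3)
m = sparse.csr_matrix(dense)  # use_sparse_array=False (default) path
a = sparse.csr_array(dense)   # use_sparse_array=True path

# csr_array is *not* an spmatrix, so spmatrix-only isinstance checks miss it;
# sparse.issparse() covers both families.
assert isinstance(m, sparse.spmatrix) and not isinstance(a, sparse.spmatrix)
assert sparse.issparse(a)
```
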
@@ -531,16 +536,14 @@ def apply(self, el, *, axis, fill_value=None):
return el
if isinstance(el, pd.DataFrame):
return self._apply_to_df(el, axis=axis, fill_value=fill_value)
elif isinstance(el, sparse.spmatrix):
elif isinstance(el, (sparse.spmatrix, SpArray, CupySparseMatrix)):
return self._apply_to_sparse(el, axis=axis, fill_value=fill_value)
elif isinstance(el, AwkArray):
return self._apply_to_awkward(el, axis=axis, fill_value=fill_value)
elif isinstance(el, DaskArray):
return self._apply_to_dask_array(el, axis=axis, fill_value=fill_value)
elif isinstance(el, CupyArray):
return self._apply_to_cupy_array(el, axis=axis, fill_value=fill_value)
elif isinstance(el, CupySparseMatrix):
return self._apply_to_sparse(el, axis=axis, fill_value=fill_value)
else:
return self._apply_to_array(el, axis=axis, fill_value=fill_value)

@@ -610,7 +613,9 @@ def _apply_to_array(self, el, *, axis, fill_value=None):
el, indexer, axis=axis, allow_fill=True, fill_value=fill_value
)

def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix:
def _apply_to_sparse(
self, el: sparse.spmatrix | SpArray, *, axis, fill_value=None
) -> spmatrix:
if isinstance(el, CupySparseMatrix):
from cupyx.scipy import sparse
else:
@@ -632,7 +637,11 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix:
shape[axis] = len(self.new_idx)
shape = tuple(shape)
if fill_value == 0:
return sparse.csr_matrix(shape)
if isinstance(el, SpArray):
memory_class = sparse.csr_array
else:
memory_class = sparse.csr_matrix
return memory_class(shape)
else:
return type(el)(xp.broadcast_to(xp.asarray(fill_value), shape))

@@ -642,9 +651,12 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix:
idxmtx_dtype = xp.promote_types(el.dtype, xp.array(fill_value).dtype)
else:
idxmtx_dtype = bool

if isinstance(el, SpArray):
memory_class = sparse.coo_array
else:
memory_class = sparse.coo_matrix
if axis == 1:
idxmtx = sparse.coo_matrix(
idxmtx = memory_class(
(
xp.ones(len(self.new_pos), dtype=idxmtx_dtype),
(xp.asarray(self.old_pos), xp.asarray(self.new_pos)),
@@ -658,7 +670,7 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix:
out = out.tocsc()
fill_idxer = (slice(None), to_fill)
elif axis == 0:
idxmtx = sparse.coo_matrix(
idxmtx = memory_class(
(
xp.ones(len(self.new_pos), dtype=idxmtx_dtype),
(xp.asarray(self.new_pos), xp.asarray(self.old_pos)),
@@ -710,7 +722,7 @@ def default_fill_value(els):
This is largely due to backwards compat, and might not be the ideal solution.
"""
if any(isinstance(el, sparse.spmatrix) for el in els):
if any(isinstance(el, (sparse.spmatrix, SpArray)) for el in els):
return 0
else:
return np.nan
@@ -808,11 +820,16 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None):
],
axis=axis,
)
elif any(isinstance(a, sparse.spmatrix) for a in arrays):
elif any(isinstance(a, (sparse.spmatrix, SpArray)) for a in arrays):
sparse_stack = (sparse.vstack, sparse.hstack)[axis]
use_sparse_array = any(issubclass(type(a), SpArray) for a in arrays)
return sparse_stack(
[
f(as_sparse(a), axis=1 - axis, fill_value=fill_value)
f(
as_sparse(a, use_sparse_array=use_sparse_array),
axis=1 - axis,
fill_value=fill_value,
)
for f, a in zip(reindexers, arrays)
],
format="csr",
@@ -953,10 +970,14 @@ def concat_pairwise_mapping(
mappings: Collection[Mapping], shapes: Collection[int], join_keys=intersect_keys
):
result = {}
if any(any(isinstance(v, SpArray) for v in m.values()) for m in mappings):
sparse_class = sparse.csr_array
else:
sparse_class = sparse.csr_matrix

for k in join_keys(mappings):
els = [
m.get(k, sparse.csr_matrix((s, s), dtype=bool))
for m, s in zip(mappings, shapes)
m.get(k, sparse_class((s, s), dtype=bool)) for m, s in zip(mappings, shapes)
]
if all(isinstance(el, (CupySparseMatrix, CupyArray)) for el in els):
result[k] = _cp_block_diag(els, format="csr")
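
The `_apply_to_sparse` reindexer in this file works by multiplying with a sparse 0/1 indexing matrix; with sparse-array inputs it now builds that matrix as `coo_array` so the product stays in the array family. A hedged, standalone sketch of the idea (the real code also promotes dtypes and handles fill values):

```python
import numpy as np
import scipy.sparse as sparse

el = sparse.csr_array(np.arange(6).reshape(2, 3))
old_pos, new_pos = np.array([0, 2]), np.array([0, 1])  # keep columns 0 and 2

# 0/1 indexing matrix of shape (n_old_cols, n_new_cols), ones at (old, new).
idxmtx = sparse.coo_array(
    (np.ones(len(new_pos), dtype=el.dtype), (old_pos, new_pos)),
    shape=(el.shape[1], len(new_pos)),
)
out = el @ idxmtx  # reindexed along axis 1, still a sparse array
assert out.shape == (2, 2)
```
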
31 changes: 15 additions & 16 deletions src/anndata/_core/sparse_dataset.py
@@ -29,7 +29,7 @@
from anndata._core.index import _fix_slice_bounds
from anndata.compat import H5Group, ZarrArray, ZarrGroup

from ..compat import _read_attr
from ..compat import SpArray, _read_attr

try:
# Not really important, just for IDEs to be more helpful
@@ -302,24 +302,23 @@ def mean_slice_length(slices):
return [], [], [0]


def get_format(data: ss.spmatrix) -> str:
for fmt, _, memory_class in FORMATS:
if isinstance(data, memory_class):
return fmt
raise ValueError(f"Data type {type(data)} is not supported.")


def get_memory_class(format: str) -> type[ss.spmatrix]:
def get_memory_class(format: str, use_sparray_in_io=False) -> type[ss.spmatrix]:
for fmt, _, memory_class in FORMATS:
if format == fmt:
return memory_class
if use_sparray_in_io and issubclass(memory_class, SpArray):
return memory_class
elif not use_sparray_in_io and issubclass(memory_class, ss.spmatrix):
return memory_class
raise ValueError(f"Format string {format} is not supported.")


def get_backed_class(format: str) -> type[BackedSparseMatrix]:
def get_backed_class(format: str, use_sparray_in_io=False) -> type[BackedSparseMatrix]:
for fmt, backed_class, _ in FORMATS:
if format == fmt:
return backed_class
if use_sparray_in_io and issubclass(backed_class, SpArray):
return backed_class
elif not use_sparray_in_io and issubclass(backed_class, ss.spmatrix):
return backed_class
raise ValueError(f"Format string {format} is not supported.")


@@ -471,14 +470,14 @@ def __setitem__(self, index: Index | tuple[()], value):
mock_matrix[row, col] = value

# TODO: split to other classes?
def append(self, sparse_matrix: ss.spmatrix):
def append(self, sparse_matrix: ss.spmatrix | SpArray):
# Prep variables
shape = self.shape
if isinstance(sparse_matrix, BaseCompressedSparseDataset):
sparse_matrix = sparse_matrix._to_backed()

# Check input
if not ss.isspmatrix(sparse_matrix):
if not ss.issparse(sparse_matrix):
raise NotImplementedError(
"Currently, only sparse matrices of equivalent format can be "
"appended to a SparseDataset."
@@ -487,10 +486,10 @@ def append(self, sparse_matrix: ss.spmatrix):
raise NotImplementedError(
f"The append method for format {self.format} " f"is not implemented."
)
if self.format != get_format(sparse_matrix):
if self.format != sparse_matrix.format:
raise ValueError(
f"Matrices must have same format. Currently are "
f"{self.format!r} and {get_format(sparse_matrix)!r}"
f"{self.format!r} and {sparse_matrix.format!r}"
)
indptr_offset = len(self.group["indices"])
if self.group["indptr"].dtype == np.int32:
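
The `append` hunk above accepts both matrix and array inputs and compares formats via the object's own `.format` attribute instead of the removed `get_format` helper. A hedged sketch of appending a `csr_array` to an on-disk dataset (assuming the `anndata.experimental` import paths of this era, and that `write_elem` creates the resizable datasets `append` needs, as the PR's tests exercise):

```python
import h5py
import scipy.sparse as sparse
from anndata.experimental import sparse_dataset, write_elem

new_rows = sparse.csr_array(sparse.random(3, 5, density=0.2, format="csr"))

with h5py.File("demo.h5ad", "w") as f:
    write_elem(f, "X", sparse.csr_array(sparse.random(10, 5, density=0.2, format="csr")))
    dset = sparse_dataset(f["X"])
    # The format check now reads sparse_matrix.format directly, so arrays
    # and matrices of the same format are both appendable.
    dset.append(new_rows)
    assert dset.shape == (13, 5)
```
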
[Diffs for the remaining changed files not shown.]
