From b7ca381bc4a6b9c0c3b29159065afc52b982d6c2 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 24 Apr 2024 00:21:48 +1000 Subject: [PATCH] (feat): Support for scipy.sparse.{csr,csc}_array (#1028) * Using faster random matrix generation in gen_adata * Add csr_arrays to gen_adata * Initial implementation * Get concat working better with sparse_arrays * Add sparray cases to helpers * Add dask.tokenize for sparray classes, move to conftest * Add sparray case to test_write_dispatched_chunks * Add filled_with method for sparray for testing * (chore): add test cases * (feat): add compat wrappers for needed sparse array classes * (feat): add `X` test cases * (fix): merge test * (feat): add dtype concatenation test * (chore): add docs * (feat): support backed sparse * (fix): array and matrix typing in `append` * (feat): add in mechanism for io, always false * (fix): condition on memory/backed * (chore): remove comment * (fix): mock class repr * (fix): subsetting * (chore): subsetting tests * (fix): writing * (fix): concatenation matrix/array class usage * (fix): spec reading * (fix): condition for `coo_array` in setting * (fix): index type * (fix): `sparray` usage * (fix): writing of arrays * (fix): more writing * (fix): raw tests * (chore): add overflow check class * (chore): add sparray tests * (fix): only test `sparray` if possible in `test_creation` * (fix): remove unnecessary compat class * (chore): fix xfail for sparse array in dask * (chore): type hints for subsetting sparse * (fix): base class check in `test_concatenate_roundtrip` * (fix): indexing * (fix): xfail cases * (chore): remove unnecessary `CAN_USE_SPARSE_ARRAY` * (fix): `h5` indexing check * (fix): more xfail expectations * (fix): skip `csr_array` for `test_append_overflow_check` * (fix): `isinstance` -> `issubclass` * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * (refactor): `maybe_add_sparse_array` for `gen_adata` instead of conditionals * 
(fix): import `contextmanager` in helpers * preferred fix for dealing with backed arrays * (refactor): use `new_array_type.__name__` for type check * (refactor): gpu fixes, using request id and skipping bad dtype * (fix): add coverage for setting by sparse * Fix typo * Ignore benchmark files * Remove dask sparray stuff * Apply suggestions from code review Revert changes to backed sparse class hierarchy * Remove tests case specializing on dask sparray * simplify tests setting X * normalize conditionals --------- Co-authored-by: Ilan Gold Co-authored-by: Philipp A Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .gitignore | 1 + docs/release-notes/0.11.0.md | 1 + src/anndata/_core/anndata.py | 19 ++- src/anndata/_core/index.py | 15 +- src/anndata/_core/merge.py | 53 ++++--- src/anndata/_core/sparse_dataset.py | 31 ++--- src/anndata/_core/views.py | 36 ++++- src/anndata/_io/specs/methods.py | 11 +- src/anndata/compat/__init__.py | 27 +++- src/anndata/tests/conftest.py | 53 ++++++- src/anndata/tests/helpers.py | 152 ++++++++++++++++++--- src/anndata/tests/test_backed_sparse.py | 12 +- src/anndata/tests/test_base.py | 3 + src/anndata/tests/test_concatenate.py | 51 ++++++- src/anndata/tests/test_concatenate_disk.py | 6 +- src/anndata/tests/test_hdf5_backing.py | 6 +- src/anndata/tests/test_inplace_subset.py | 18 ++- src/anndata/tests/test_io_dispatched.py | 3 +- src/anndata/tests/test_readwrite.py | 24 ++-- src/anndata/tests/test_views.py | 62 ++++++--- src/anndata/tests/test_x.py | 21 ++- src/anndata/utils.py | 3 +- 22 files changed, 481 insertions(+), 127 deletions(-) diff --git a/.gitignore b/.gitignore index e5a22ec71..51533016e 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,4 @@ __pycache__/ .asv benchmark/benchmarks/data benchmarks/benchmarks/data +benchmarks/pkgs diff --git a/docs/release-notes/0.11.0.md b/docs/release-notes/0.11.0.md index 220e176c4..84ec3851b 100644 --- a/docs/release-notes/0.11.0.md +++ 
b/docs/release-notes/0.11.0.md @@ -4,6 +4,7 @@ ``` * Add `settings` object with methods for altering internally-used options, like checking for uniqueness on `obs`' index {pr}`1270` {user}`ilan-gold` * Add `remove_unused_categories` option to `anndata.settings` to override current behavior. Default is `True` (i.e., previous behavior). Please refer to the [documentation](https://anndata.readthedocs.io/en/latest/generated/anndata.settings.html) for usage. {pr}`1340` {user}`ilan-gold` +* `scipy.sparse.csr_array` and `scipy.sparse.csc_array` are now supported when constructing `AnnData` objects {pr}`1028` {user}`ilan-gold` {user}`isaac-virshup` ```{rubric} Bugfix ``` diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 2a02475bc..9184fa9c9 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -34,6 +34,7 @@ CupyArray, CupySparseMatrix, DaskArray, + SpArray, ZappyArray, ZarrArray, _move_adj_mtx, @@ -76,6 +77,7 @@ class StorageType(Enum): CupyArray = CupyArray CupySparseMatrix = CupySparseMatrix BackedSparseMatrix = BaseCompressedSparseDataset + SparseArray = SpArray @classmethod def classes(cls): @@ -511,7 +513,7 @@ def cs_to_bytes(X) -> int: return int(np.array(X.shape).prod() * X.dtype.itemsize) elif isinstance(X, BaseCompressedSparseDataset) and with_disk: return cs_to_bytes(X._to_backed()) - elif isinstance(X, (sparse.csr_matrix, sparse.csc_matrix)): + elif issparse(X): return cs_to_bytes(X) else: return X.__sizeof__() @@ -574,7 +576,7 @@ def shape(self) -> tuple[int, int]: return self.n_obs, self.n_vars @property - def X(self) -> np.ndarray | sparse.spmatrix | ArrayView | None: + def X(self) -> np.ndarray | sparse.spmatrix | SpArray | ArrayView | None: """Data matrix of shape :attr:`n_obs` × :attr:`n_vars`.""" if self.isbacked: if not self.file.is_open: @@ -605,7 +607,7 @@ def X(self) -> np.ndarray | sparse.spmatrix | ArrayView | None: # return X @X.setter - def X(self, value: np.ndarray | sparse.spmatrix | None): 
+ def X(self, value: np.ndarray | sparse.spmatrix | SpArray | None): if value is None: if self.isbacked: raise NotImplementedError( @@ -655,7 +657,11 @@ def X(self, value: np.ndarray | sparse.spmatrix | None): if sparse.issparse(self._adata_ref._X) and isinstance( value, np.ndarray ): - value = sparse.coo_matrix(value) + if isinstance(self._adata_ref.X, SpArray): + memory_class = sparse.coo_array + else: + memory_class = sparse.coo_matrix + value = memory_class(value) self._adata_ref._X[oidx, vidx] = value else: self._X = value @@ -1773,8 +1779,11 @@ def concatenate( # Backwards compat (some of this could be more efficient) # obs used to always be an outer join + sparse_class = sparse.csr_matrix + if any(isinstance(a.X, SpArray) for a in all_adatas): + sparse_class = sparse.csr_array out.obs = concat( - [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas], + [AnnData(sparse_class(a.shape), obs=a.obs) for a in all_adatas], axis=0, join="outer", label=batch_key, diff --git a/src/anndata/_core/index.py b/src/anndata/_core/index.py index 38c2cdede..38cbf4788 100644 --- a/src/anndata/_core/index.py +++ b/src/anndata/_core/index.py @@ -8,9 +8,9 @@ import h5py import numpy as np import pandas as pd -from scipy.sparse import csc_matrix, issparse, spmatrix +from scipy.sparse import issparse, spmatrix -from ..compat import AwkArray, DaskArray, Index, Index1D +from ..compat import AwkArray, DaskArray, Index, Index1D, SpArray def _normalize_indices( @@ -71,12 +71,14 @@ def name_idx(i): return indexer elif isinstance(indexer, str): return index.get_loc(indexer) # int - elif isinstance(indexer, (Sequence, np.ndarray, pd.Index, spmatrix, np.matrix)): + elif isinstance( + indexer, (Sequence, np.ndarray, pd.Index, spmatrix, np.matrix, SpArray) + ): if hasattr(indexer, "shape") and ( (indexer.shape == (index.shape[0], 1)) or (indexer.shape == (1, index.shape[0])) ): - if isinstance(indexer, spmatrix): + if isinstance(indexer, (spmatrix, SpArray)): indexer = 
indexer.toarray() indexer = np.ravel(indexer) if not isinstance(indexer, (np.ndarray, pd.Index)): @@ -148,14 +150,15 @@ def _subset(a: np.ndarray | pd.DataFrame, subset_idx: Index): @_subset.register(DaskArray) def _subset_dask(a: DaskArray, subset_idx: Index): if len(subset_idx) > 1 and all(isinstance(x, cabc.Iterable) for x in subset_idx): - if isinstance(a._meta, csc_matrix): + if issparse(a._meta) and a._meta.format == "csc": return a[:, subset_idx[1]][subset_idx[0], :] return a[subset_idx[0], :][:, subset_idx[1]] return a[subset_idx] @_subset.register(spmatrix) -def _subset_spmatrix(a: spmatrix, subset_idx: Index): +@_subset.register(SpArray) +def _subset_sparse(a: spmatrix | SpArray, subset_idx: Index): # Correcting for indexing behaviour of sparse.spmatrix if len(subset_idx) > 1 and all(isinstance(x, cabc.Iterable) for x in subset_idx): first_idx = subset_idx[0] diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index d326d0ac2..df6265c2d 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -29,11 +29,13 @@ from anndata._warnings import ExperimentalFeatureWarning from ..compat import ( + CAN_USE_SPARSE_ARRAY, AwkArray, CupyArray, CupyCSRMatrix, CupySparseMatrix, DaskArray, + SpArray, _map_cat_to_str, ) from ..utils import asarray, dim_len, warn_once @@ -164,6 +166,7 @@ def equal_series(a, b) -> bool: @equal.register(sparse.spmatrix) +@equal.register(SpArray) @equal.register(CupySparseMatrix) def equal_sparse(a, b) -> bool: # It's a weird api, don't blame me @@ -171,7 +174,7 @@ def equal_sparse(a, b) -> bool: xp = array_api_compat.array_namespace(a.data) - if isinstance(b, (CupySparseMatrix, sparse.spmatrix)): + if isinstance(b, (CupySparseMatrix, sparse.spmatrix, SpArray)): if isinstance(a, CupySparseMatrix): # Comparison broken for CSC matrices # https://github.com/cupy/cupy/issues/7757 @@ -202,8 +205,10 @@ def equal_awkward(a, b) -> bool: return ak.almost_equal(a, b) -def as_sparse(x): - if not isinstance(x, 
sparse.spmatrix): +def as_sparse(x, use_sparse_array=False): + if not isinstance(x, (sparse.spmatrix, SpArray)): + if CAN_USE_SPARSE_ARRAY and use_sparse_array: + return sparse.csr_array(x) return sparse.csr_matrix(x) else: return x @@ -531,7 +536,7 @@ def apply(self, el, *, axis, fill_value=None): return el if isinstance(el, pd.DataFrame): return self._apply_to_df(el, axis=axis, fill_value=fill_value) - elif isinstance(el, sparse.spmatrix): + elif isinstance(el, (sparse.spmatrix, SpArray, CupySparseMatrix)): return self._apply_to_sparse(el, axis=axis, fill_value=fill_value) elif isinstance(el, AwkArray): return self._apply_to_awkward(el, axis=axis, fill_value=fill_value) @@ -539,8 +544,6 @@ def apply(self, el, *, axis, fill_value=None): return self._apply_to_dask_array(el, axis=axis, fill_value=fill_value) elif isinstance(el, CupyArray): return self._apply_to_cupy_array(el, axis=axis, fill_value=fill_value) - elif isinstance(el, CupySparseMatrix): - return self._apply_to_sparse(el, axis=axis, fill_value=fill_value) else: return self._apply_to_array(el, axis=axis, fill_value=fill_value) @@ -610,7 +613,9 @@ def _apply_to_array(self, el, *, axis, fill_value=None): el, indexer, axis=axis, allow_fill=True, fill_value=fill_value ) - def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix: + def _apply_to_sparse( + self, el: sparse.spmatrix | SpArray, *, axis, fill_value=None + ) -> spmatrix: if isinstance(el, CupySparseMatrix): from cupyx.scipy import sparse else: @@ -632,7 +637,11 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix: shape[axis] = len(self.new_idx) shape = tuple(shape) if fill_value == 0: - return sparse.csr_matrix(shape) + if isinstance(el, SpArray): + memory_class = sparse.csr_array + else: + memory_class = sparse.csr_matrix + return memory_class(shape) else: return type(el)(xp.broadcast_to(xp.asarray(fill_value), shape)) @@ -642,9 +651,12 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, 
fill_value=None) -> spmatrix: idxmtx_dtype = xp.promote_types(el.dtype, xp.array(fill_value).dtype) else: idxmtx_dtype = bool - + if isinstance(el, SpArray): + memory_class = sparse.coo_array + else: + memory_class = sparse.coo_matrix if axis == 1: - idxmtx = sparse.coo_matrix( + idxmtx = memory_class( ( xp.ones(len(self.new_pos), dtype=idxmtx_dtype), (xp.asarray(self.old_pos), xp.asarray(self.new_pos)), @@ -658,7 +670,7 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix: out = out.tocsc() fill_idxer = (slice(None), to_fill) elif axis == 0: - idxmtx = sparse.coo_matrix( + idxmtx = memory_class( ( xp.ones(len(self.new_pos), dtype=idxmtx_dtype), (xp.asarray(self.new_pos), xp.asarray(self.old_pos)), @@ -710,7 +722,7 @@ def default_fill_value(els): This is largely due to backwards compat, and might not be the ideal solution. """ - if any(isinstance(el, sparse.spmatrix) for el in els): + if any(isinstance(el, (sparse.spmatrix, SpArray)) for el in els): return 0 else: return np.nan @@ -808,11 +820,16 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): ], axis=axis, ) - elif any(isinstance(a, sparse.spmatrix) for a in arrays): + elif any(isinstance(a, (sparse.spmatrix, SpArray)) for a in arrays): sparse_stack = (sparse.vstack, sparse.hstack)[axis] + use_sparse_array = any(issubclass(type(a), SpArray) for a in arrays) return sparse_stack( [ - f(as_sparse(a), axis=1 - axis, fill_value=fill_value) + f( + as_sparse(a, use_sparse_array=use_sparse_array), + axis=1 - axis, + fill_value=fill_value, + ) for f, a in zip(reindexers, arrays) ], format="csr", @@ -953,10 +970,14 @@ def concat_pairwise_mapping( mappings: Collection[Mapping], shapes: Collection[int], join_keys=intersect_keys ): result = {} + if any(any(isinstance(v, SpArray) for v in m.values()) for m in mappings): + sparse_class = sparse.csr_array + else: + sparse_class = sparse.csr_matrix + for k in join_keys(mappings): els = [ - m.get(k, sparse.csr_matrix((s, 
s), dtype=bool)) - for m, s in zip(mappings, shapes) + m.get(k, sparse_class((s, s), dtype=bool)) for m, s in zip(mappings, shapes) ] if all(isinstance(el, (CupySparseMatrix, CupyArray)) for el in els): result[k] = _cp_block_diag(els, format="csr") diff --git a/src/anndata/_core/sparse_dataset.py b/src/anndata/_core/sparse_dataset.py index 465a0a238..f77d478e6 100644 --- a/src/anndata/_core/sparse_dataset.py +++ b/src/anndata/_core/sparse_dataset.py @@ -29,7 +29,7 @@ from anndata._core.index import _fix_slice_bounds from anndata.compat import H5Group, ZarrArray, ZarrGroup -from ..compat import _read_attr +from ..compat import SpArray, _read_attr try: # Not really important, just for IDEs to be more helpful @@ -302,24 +302,23 @@ def mean_slice_length(slices): return [], [], [0] -def get_format(data: ss.spmatrix) -> str: - for fmt, _, memory_class in FORMATS: - if isinstance(data, memory_class): - return fmt - raise ValueError(f"Data type {type(data)} is not supported.") - - -def get_memory_class(format: str) -> type[ss.spmatrix]: +def get_memory_class(format: str, use_sparray_in_io=False) -> type[ss.spmatrix]: for fmt, _, memory_class in FORMATS: if format == fmt: - return memory_class + if use_sparray_in_io and issubclass(memory_class, SpArray): + return memory_class + elif not use_sparray_in_io and issubclass(memory_class, ss.spmatrix): + return memory_class raise ValueError(f"Format string {format} is not supported.") -def get_backed_class(format: str) -> type[BackedSparseMatrix]: +def get_backed_class(format: str, use_sparray_in_io=False) -> type[BackedSparseMatrix]: for fmt, backed_class, _ in FORMATS: if format == fmt: - return backed_class + if use_sparray_in_io and issubclass(backed_class, SpArray): + return backed_class + elif not use_sparray_in_io and issubclass(backed_class, ss.spmatrix): + return backed_class raise ValueError(f"Format string {format} is not supported.") @@ -471,14 +470,14 @@ def __setitem__(self, index: Index | tuple[()], value): 
mock_matrix[row, col] = value # TODO: split to other classes? - def append(self, sparse_matrix: ss.spmatrix): + def append(self, sparse_matrix: ss.spmatrix | SpArray): # Prep variables shape = self.shape if isinstance(sparse_matrix, BaseCompressedSparseDataset): sparse_matrix = sparse_matrix._to_backed() # Check input - if not ss.isspmatrix(sparse_matrix): + if not ss.issparse(sparse_matrix): raise NotImplementedError( "Currently, only sparse matrices of equivalent format can be " "appended to a SparseDataset." @@ -487,10 +486,10 @@ def append(self, sparse_matrix: ss.spmatrix): raise NotImplementedError( f"The append method for format {self.format} " f"is not implemented." ) - if self.format != get_format(sparse_matrix): + if self.format != sparse_matrix.format: raise ValueError( f"Matrices must have same format. Currently are " - f"{self.format!r} and {get_format(sparse_matrix)!r}" + f"{self.format!r} and {sparse_matrix.format!r}" ) indptr_offset = len(self.group["indices"]) if self.group["indptr"].dtype == np.int32: diff --git a/src/anndata/_core/views.py b/src/anndata/_core/views.py index ce86a27ee..cb9201e03 100644 --- a/src/anndata/_core/views.py +++ b/src/anndata/_core/views.py @@ -202,20 +202,32 @@ def keys(self) -> KeysView[str]: return self.dtype.names -# Unlike array views, SparseCSRView and SparseCSCView +# Unlike array views, SparseCSRMatrixView and SparseCSCMatrixView # do not propagate through subsetting -class SparseCSRView(_ViewMixin, sparse.csr_matrix): +class SparseCSRMatrixView(_ViewMixin, sparse.csr_matrix): # https://github.com/scverse/anndata/issues/656 def copy(self) -> sparse.csr_matrix: return sparse.csr_matrix(self).copy() -class SparseCSCView(_ViewMixin, sparse.csc_matrix): +class SparseCSCMatrixView(_ViewMixin, sparse.csc_matrix): # https://github.com/scverse/anndata/issues/656 def copy(self) -> sparse.csc_matrix: return sparse.csc_matrix(self).copy() +class SparseCSRArrayView(_ViewMixin, sparse.csr_array): + # 
https://github.com/scverse/anndata/issues/656 + def copy(self) -> sparse.csr_array: + return sparse.csr_array(self).copy() + + +class SparseCSCArrayView(_ViewMixin, sparse.csc_array): + # https://github.com/scverse/anndata/issues/656 + def copy(self) -> sparse.csc_array: + return sparse.csc_array(self).copy() + + class CupySparseCSRView(_ViewMixin, CupyCSRMatrix): def copy(self) -> CupyCSRMatrix: return CupyCSRMatrix(self).copy() @@ -283,13 +295,23 @@ def as_view_df(df, view_args): @as_view.register(sparse.csr_matrix) -def as_view_csr(mtx, view_args): - return SparseCSRView(mtx, view_args=view_args) +def as_view_csr_matrix(mtx, view_args): + return SparseCSRMatrixView(mtx, view_args=view_args) @as_view.register(sparse.csc_matrix) -def as_view_csc(mtx, view_args): - return SparseCSCView(mtx, view_args=view_args) +def as_view_csc_matrix(mtx, view_args): + return SparseCSCMatrixView(mtx, view_args=view_args) + + +@as_view.register(sparse.csr_array) +def as_view_csr_array(mtx, view_args): + return SparseCSRArrayView(mtx, view_args=view_args) + + +@as_view.register(sparse.csc_array) +def as_view_csc_array(mtx, view_args): + return SparseCSCArrayView(mtx, view_args=view_args) @as_view.register(dict) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 5452f7bb6..2822154da 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -514,10 +514,17 @@ def write_sparse_compressed( for store_type, (cls, spec, func) in product( (H5Group, ZarrGroup), [ + # spmatrix (sparse.csr_matrix, IOSpec("csr_matrix", "0.1.0"), write_csr), - (views.SparseCSRView, IOSpec("csr_matrix", "0.1.0"), write_csr), + (views.SparseCSRMatrixView, IOSpec("csr_matrix", "0.1.0"), write_csr), (sparse.csc_matrix, IOSpec("csc_matrix", "0.1.0"), write_csc), - (views.SparseCSCView, IOSpec("csc_matrix", "0.1.0"), write_csc), + (views.SparseCSCMatrixView, IOSpec("csc_matrix", "0.1.0"), write_csc), + # sparray + (sparse.csr_array, IOSpec("csr_matrix", 
"0.1.0"), write_csr), + (views.SparseCSRArrayView, IOSpec("csr_matrix", "0.1.0"), write_csr), + (sparse.csc_array, IOSpec("csc_matrix", "0.1.0"), write_csc), + (views.SparseCSCArrayView, IOSpec("csc_matrix", "0.1.0"), write_csc), + # cupy spmatrix (CupyCSRMatrix, IOSpec("csr_matrix", "0.1.0"), _to_cpu_mem_wrapper(write_csr)), ( views.CupySparseCSRView, diff --git a/src/anndata/compat/__init__.py b/src/anndata/compat/__init__.py index 39323d73a..56501a7b3 100644 --- a/src/anndata/compat/__init__.py +++ b/src/anndata/compat/__init__.py @@ -15,18 +15,35 @@ import h5py import numpy as np import pandas as pd +import scipy +import scipy.sparse from packaging.version import Version -from scipy.sparse import issparse, spmatrix from .exceptiongroups import add_note # noqa: F401 +############################# +# scipy sparse array compat # +############################# + + +CAN_USE_SPARSE_ARRAY = Version(scipy.__version__) >= Version("1.11") + +if not CAN_USE_SPARSE_ARRAY: + + class SpArray: + @staticmethod + def __repr__(): + return "mock scipy.sparse.sparray" +else: + SpArray = scipy.sparse.sparray + class Empty: pass Index1D = Union[slice, int, str, np.int64, np.ndarray] -Index = Union[Index1D, tuple[Index1D, Index1D], spmatrix] +Index = Union[Index1D, tuple[Index1D, Index1D], scipy.sparse.spmatrix, SpArray] H5Group = h5py.Group H5Array = h5py.Dataset @@ -305,7 +322,7 @@ def _move_adj_mtx(d): for k in ("distances", "connectivities"): if ( (k in n) - and isinstance(n[k], (spmatrix, np.ndarray)) + and isinstance(n[k], (scipy.sparse.spmatrix, np.ndarray)) and len(n[k].shape) == 2 ): warn( @@ -321,7 +338,7 @@ def _find_sparse_matrices(d: Mapping, n: int, keys: tuple, paths: list): for k, v in d.items(): if isinstance(v, Mapping): _find_sparse_matrices(v, n, (*keys, k), paths) - elif isinstance(v, spmatrix) and v.shape == (n, n): + elif scipy.sparse.issparse(v) and v.shape == (n, n): paths.append((*keys, k)) return paths @@ -401,7 +418,7 @@ def _safe_transpose(x): This is a 
workaround for: https://github.com/scipy/scipy/issues/19161 """ - if isinstance(x, DaskArray) and issparse(x._meta): + if isinstance(x, DaskArray) and scipy.sparse.issparse(x._meta): return _transpose_by_block(x) else: return x.T diff --git a/src/anndata/tests/conftest.py b/src/anndata/tests/conftest.py index e45bf708b..59cab29b6 100644 --- a/src/anndata/tests/conftest.py +++ b/src/anndata/tests/conftest.py @@ -1,14 +1,18 @@ from __future__ import annotations import warnings +from functools import partial +import joblib import pytest +from dask.base import normalize_seq, normalize_token, tokenize +from scipy import sparse -import anndata +import anndata as ad from anndata.tests.helpers import subset_func # noqa: F401 # TODO: Should be done in pyproject.toml, see anndata/conftest.py -warnings.filterwarnings("ignore", category=anndata.OldFormatWarning) +warnings.filterwarnings("ignore", category=ad.OldFormatWarning) # TODO: remove once we extricated test utils and tests collect_ignore = ["helpers.py"] @@ -17,3 +21,48 @@ @pytest.fixture def backing_h5ad(tmp_path): return tmp_path / "test.h5ad" + + +##################### +# Dask tokenization # +##################### +# TODO: Should we be exporting this? 
+ + +# sparray classes don't have tokenize defined yet, see: https://github.com/dask/dask/issues/10375 +def normalize_sparse_matrix(x, attrs): + return ( + type(x).__name__, + normalize_seq(normalize_token(getattr(x, key)) for key in attrs), + ) + + +for cls, attrs in [ + (sparse.dia_array, ("data", "offsets", "shape")), + (sparse.bsr_array, ("data", "indices", "indptr", "blocksize", "shape")), + (sparse.coo_array, ("data", "row", "col", "shape")), + (sparse.csr_array, ("data", "indices", "indptr", "shape")), + (sparse.csc_array, ("data", "indices", "indptr", "shape")), + (sparse.lil_array, ("data", "rows", "shape")), +]: + normalize_token.register(cls, partial(normalize_sparse_matrix, attrs=attrs)) + + +@normalize_token.register(sparse.dok_array) +def normalize_dok_matrix(x): + return type(x).__name__, normalize_token(sorted(x.items())) + + +@normalize_token.register(ad.AnnData) +def tokenize_anndata(adata: ad.AnnData): + res = [] + if adata.X is not None: + res.append(tokenize(adata.X)) + res.extend([tokenize(adata.obs), tokenize(adata.var)]) + for attr in ["obsm", "varm", "obsp", "varp", "layers"]: + elem = getattr(adata, attr) + res.append(tokenize(list(elem.items()))) + res.append(joblib.hash(adata.uns)) + if adata.raw is not None: + res.append(tokenize(adata.raw.to_adata())) + return tuple(res) diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index 2402ee3fa..e771bd849 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -4,8 +4,10 @@ import re import warnings from collections.abc import Collection, Mapping +from contextlib import contextmanager from functools import partial, singledispatch, wraps from string import ascii_letters +from typing import Literal import h5py import numpy as np @@ -19,12 +21,14 @@ from anndata._core.sparse_dataset import BaseCompressedSparseDataset from anndata._core.views import ArrayView from anndata.compat import ( + CAN_USE_SPARSE_ARRAY, AwkArray, CupyArray, CupyCSCMatrix, 
CupyCSRMatrix, CupySparseMatrix, DaskArray, + SpArray, ) from anndata.utils import asarray @@ -36,9 +40,29 @@ pd.DataFrame, DaskArray, ), - varm_types=(sparse.csr_matrix, np.ndarray, pd.DataFrame, DaskArray), - layers_types=(sparse.csr_matrix, np.ndarray, pd.DataFrame, DaskArray), + varm_types=( + sparse.csr_matrix, + np.ndarray, + pd.DataFrame, + DaskArray, + ), + layers_types=( + sparse.csr_matrix, + np.ndarray, + pd.DataFrame, + DaskArray, + ), ) +if CAN_USE_SPARSE_ARRAY: + GEN_ADATA_DASK_ARGS["obsm_types"] = GEN_ADATA_DASK_ARGS["obsm_types"] + ( + sparse.csr_array, + ) + GEN_ADATA_DASK_ARGS["varm_types"] = GEN_ADATA_DASK_ARGS["varm_types"] + ( + sparse.csr_array, + ) + GEN_ADATA_DASK_ARGS["layers_types"] = GEN_ADATA_DASK_ARGS["layers_types"] + ( + sparse.csr_array, + ) def gen_vstr_recarray(m, n, dtype=None): @@ -152,6 +176,30 @@ def gen_typed_df_t2_size(m, n, index=None, columns=None) -> pd.DataFrame: return df +default_key_types = ( + sparse.csr_matrix, + np.ndarray, + pd.DataFrame, +) +if CAN_USE_SPARSE_ARRAY: + default_key_types = default_key_types + (sparse.csr_array,) + + +def maybe_add_sparse_array( + mapping: Mapping, + types: Collection[type], + format: Literal["csr", "csc"], + random_state: int, + shape: tuple[int, int], +): + if CAN_USE_SPARSE_ARRAY: + if sparse.csr_array in types or sparse.csr_matrix in types: + mapping["sparse_array"] = sparse.csr_array( + sparse.random(*shape, format=format, random_state=random_state) + ) + return mapping + + # TODO: Use hypothesis for this? 
def gen_adata( shape: tuple[int, int], @@ -159,19 +207,10 @@ def gen_adata( X_dtype=np.float32, # obs_dtypes, # var_dtypes, - obsm_types: Collection[type] = ( - sparse.csr_matrix, - np.ndarray, - pd.DataFrame, - AwkArray, - ), - varm_types: Collection[type] = ( - sparse.csr_matrix, - np.ndarray, - pd.DataFrame, - AwkArray, - ), - layers_types: Collection[type] = (sparse.csr_matrix, np.ndarray, pd.DataFrame), + obsm_types: Collection[type] = default_key_types + (AwkArray,), + varm_types: Collection[type] = default_key_types + (AwkArray,), + layers_types: Collection[type] = default_key_types, + random_state=None, sparse_fmt: str = "csr", ) -> AnnData: """\ @@ -202,6 +241,9 @@ def gen_adata( """ import dask.array as da + if random_state is None: + random_state = np.random.default_rng() + M, N = shape obs_names = pd.Index(f"cell{i}" for i in range(shape[0])) var_names = pd.Index(f"gene{i}" for i in range(shape[1])) @@ -214,35 +256,67 @@ def gen_adata( if X_type is None: X = None else: - X = X_type(np.random.binomial(100, 0.005, (M, N)).astype(X_dtype)) + X = X_type(random_state.binomial(100, 0.005, (M, N)).astype(X_dtype)) + obsm = dict( array=np.random.random((M, 50)), - sparse=sparse.random(M, 100, format=sparse_fmt), + sparse=sparse.random(M, 100, format=sparse_fmt, random_state=random_state), df=gen_typed_df(M, obs_names), awk_2d_ragged=gen_awkward((M, None)), da=da.random.random((M, 50)), ) obsm = {k: v for k, v in obsm.items() if type(v) in obsm_types} + obsm = maybe_add_sparse_array( + mapping=obsm, + types=obsm_types, + format=sparse_fmt, + random_state=random_state, + shape=(M, 100), + ) varm = dict( array=np.random.random((N, 50)), - sparse=sparse.random(N, 100, format=sparse_fmt), + sparse=sparse.random(N, 100, format=sparse_fmt, random_state=random_state), df=gen_typed_df(N, var_names), awk_2d_ragged=gen_awkward((N, None)), da=da.random.random((N, 50)), ) varm = {k: v for k, v in varm.items() if type(v) in varm_types} + varm = maybe_add_sparse_array( + 
mapping=varm, + types=varm_types, + format=sparse_fmt, + random_state=random_state, + shape=(N, 100), + ) layers = dict( array=np.random.random((M, N)), - sparse=sparse.random(M, N, format=sparse_fmt), + sparse=sparse.random(M, N, format=sparse_fmt, random_state=random_state), da=da.random.random((M, N)), ) + layers = maybe_add_sparse_array( + mapping=layers, + types=layers_types, + format=sparse_fmt, + random_state=random_state, + shape=(M, N), + ) layers = {k: v for k, v in layers.items() if type(v) in layers_types} obsp = dict( - array=np.random.random((M, M)), sparse=sparse.random(M, M, format=sparse_fmt) + array=np.random.random((M, M)), + sparse=sparse.random(M, M, format=sparse_fmt, random_state=random_state), ) + if CAN_USE_SPARSE_ARRAY: + obsp["sparse_array"] = sparse.csr_array( + sparse.random(M, M, format=sparse_fmt, random_state=random_state) + ) varp = dict( - array=np.random.random((N, N)), sparse=sparse.random(N, N, format=sparse_fmt) + array=np.random.random((N, N)), + sparse=sparse.random(N, N, format=sparse_fmt, random_state=random_state), ) + if CAN_USE_SPARSE_ARRAY: + varp["sparse_array"] = sparse.csr_array( + sparse.random(N, N, format=sparse_fmt, random_state=random_state) + ) uns = dict( O_recarray=gen_vstr_recarray(N, 5), nested=dict( @@ -301,6 +375,12 @@ def spmatrix_bool_subset(index, min_size=2): ) +def sparray_bool_subset(index, min_size=2): + return sparse.csr_array( + array_bool_subset(index, min_size=min_size).reshape(len(index), 1) + ) + + def array_subset(index, min_size=2): if len(index) < min_size: raise ValueError( @@ -351,6 +431,7 @@ def single_subset(index): list_bool_subset, matrix_bool_subset, spmatrix_bool_subset, + sparray_bool_subset, ] ) def subset_func(request): @@ -442,6 +523,11 @@ def assert_equal_sparse(a, b, exact=False, elem_name=None): assert_equal(b, a, exact, elem_name=elem_name) +@assert_equal.register(SpArray) +def assert_equal_sparse_array(a, b, exact=False, elem_name=None): + return assert_equal_sparse(a, b, 
exact, elem_name) + + @assert_equal.register(CupySparseMatrix) def assert_equal_cupy_sparse(a, b, exact=False, elem_name=None): a = a.toarray() @@ -635,11 +721,33 @@ def _(a): return da.from_array(a, _half_chunk_size(a.shape)) +@as_sparse_dask_array.register(SpArray) +def _(a): + import dask.array as da + + return da.from_array(sparse.csr_matrix(a), _half_chunk_size(a.shape)) + + @as_sparse_dask_array.register(DaskArray) def _(a): return a.map_blocks(sparse.csr_matrix) +@contextmanager +def pytest_8_raises(exc_cls, *, match: str | re.Pattern = None): + """Error handling using pytest 8's support for __notes__. + + See: https://github.com/pytest-dev/pytest/pull/11227 + + Remove once pytest 8 is out! + """ + + with pytest.raises(exc_cls) as exc_info: + yield exc_info + + check_error_or_notes_match(exc_info, match) + + def check_error_or_notes_match(e: pytest.ExceptionInfo, pattern: str | re.Pattern): """ Checks whether the printed error message or the notes contains the given pattern. @@ -719,6 +827,8 @@ def shares_memory_sparse(x, y): pytest.param(asarray, id="np_array"), pytest.param(sparse.csr_matrix, id="scipy_csr"), pytest.param(sparse.csc_matrix, id="scipy_csc"), + pytest.param(sparse.csr_array, id="scipy_csr_array"), + pytest.param(sparse.csc_array, id="scipy_csc_array"), ] DASK_MATRIX_PARAMS = [ diff --git a/src/anndata/tests/test_backed_sparse.py b/src/anndata/tests/test_backed_sparse.py index 8ef07788c..0f7a23291 100644 --- a/src/anndata/tests/test_backed_sparse.py +++ b/src/anndata/tests/test_backed_sparse.py @@ -12,6 +12,7 @@ import anndata as ad from anndata._core.anndata import AnnData from anndata._core.sparse_dataset import sparse_dataset +from anndata.compat import CAN_USE_SPARSE_ARRAY, SpArray from anndata.experimental import read_dispatched, write_elem from anndata.tests.helpers import AccessTrackingStore, assert_equal, subset_func @@ -234,6 +235,8 @@ def test_consecutive_bool( [ pytest.param(sparse.csr_matrix, sparse.vstack), 
pytest.param(sparse.csc_matrix, sparse.hstack), + pytest.param(sparse.csr_array, sparse.vstack), + pytest.param(sparse.csc_array, sparse.hstack), ], ) def test_dataset_append_memory( @@ -477,12 +480,15 @@ def test_backed_sizeof( pytest.param(lambda p: h5py.File(p / "test.h5", mode="a"), id="h5py"), ], ) -def test_append_overflow_check(group_fn, tmpdir): +@pytest.mark.parametrize("sparse_class", [sparse.csr_matrix, sparse.csr_array]) +def test_append_overflow_check(group_fn, sparse_class, tmpdir): + if CAN_USE_SPARSE_ARRAY and issubclass(sparse_class, SpArray): + pytest.skip("scipy bug causes view to be allocated") group = group_fn(tmpdir) typemax_int32 = np.iinfo(np.int32).max - orig_mtx = sparse.csr_matrix(np.ones((1, 1), dtype=bool)) + orig_mtx = sparse_class(np.ones((1, 1), dtype=bool)) # Minimally allocating new matrix - new_mtx = sparse.csr_matrix( + new_mtx = sparse_class( ( np.broadcast_to(True, typemax_int32 - 1), np.broadcast_to(np.int32(1), typemax_int32 - 1), diff --git a/src/anndata/tests/test_base.py b/src/anndata/tests/test_base.py index 5f817bb37..2db3d9d99 100644 --- a/src/anndata/tests/test_base.py +++ b/src/anndata/tests/test_base.py @@ -13,6 +13,7 @@ from anndata import AnnData from anndata._settings import settings +from anndata.compat import CAN_USE_SPARSE_ARRAY from anndata.tests.helpers import assert_equal, gen_adata # some test objects that we use below @@ -30,6 +31,8 @@ def test_creation(): AnnData(np.array([[1, 2], [3, 4]]), {}, {}) AnnData(ma.array([[1, 2], [3, 4]]), uns=dict(mask=[0, 1, 1, 0])) AnnData(sp.eye(2)) + if CAN_USE_SPARSE_ARRAY: + AnnData(sp.eye_array(2)) X = np.array([[1, 2, 3], [4, 5, 6]]) adata = AnnData( X=X, diff --git a/src/anndata/tests/test_concatenate.py b/src/anndata/tests/test_concatenate.py index 80714d44a..28af38ad3 100644 --- a/src/anndata/tests/test_concatenate.py +++ b/src/anndata/tests/test_concatenate.py @@ -19,7 +19,12 @@ from anndata import AnnData, Raw, concat from anndata._core import merge from 
anndata._core.index import _subset -from anndata.compat import AwkArray, DaskArray +from anndata.compat import ( + AwkArray, + CupySparseMatrix, + DaskArray, + SpArray, +) from anndata.tests import helpers from anndata.tests.helpers import ( BASE_MATRIX_PARAMS, @@ -63,6 +68,11 @@ def _filled_sparse(a, fill_value=None): return sparse.csr_matrix(np.broadcast_to(fill_value, a.shape)) +@filled_like.register(SpArray) +def _filled_sparse_array(a, fill_value=None): + return sparse.csr_array(filled_like(sparse.csr_matrix(a))) + + @filled_like.register(pd.DataFrame) def _filled_df(a, fill_value=np.nan): # dtype from pd.concat can be unintuitive, this returns something close enough @@ -89,6 +99,11 @@ def array_type(request): return request.param +@pytest.fixture(params=BASE_MATRIX_PARAMS + DASK_MATRIX_PARAMS) +def cpu_array_type(request): + return request.param + + @pytest.fixture(params=["inner", "outer"]) def join_type(request): return request.param @@ -179,6 +194,15 @@ def test_concatenate_roundtrip(join_type, array_type, concat_func, backwards_com ) assert_equal(result[orig.obs_names].copy(), orig) + base_type = type(orig.X) + if sparse.issparse(orig.X): + if isinstance(orig.X, SpArray): + base_type = SpArray + else: + base_type = sparse.spmatrix + if isinstance(orig.X, CupySparseMatrix): + base_type = CupySparseMatrix + assert isinstance(result.X, base_type) @mark_legacy_concatenate @@ -1137,7 +1161,7 @@ def test_transposed_concat(array_type, axis, join_type, merge_strategy, fill_val assert_equal(a, b) -def test_batch_key(axis, array_type): +def test_batch_key(axis): """Test that concat only adds a label if the key is provided""" def get_annot(adata): @@ -1439,15 +1463,32 @@ def test_concat_null_X(): # https://github.com/scverse/ehrapy/issues/151#issuecomment-1016753744 -def test_concat_X_dtype(): - adatas_orig = {k: AnnData(np.ones((20, 10), dtype=np.int8)) for k in list("abc")} +@pytest.mark.parametrize("sparse_indexer_type", [np.int64, np.int32]) +def 
test_concat_X_dtype(cpu_array_type, sparse_indexer_type): + adatas_orig = { + k: AnnData(cpu_array_type(np.ones((20, 10), dtype=np.int8))) + for k in list("abc") + } for adata in adatas_orig.values(): - adata.raw = AnnData(np.ones((20, 30), dtype=np.float64)) + adata.raw = AnnData(cpu_array_type(np.ones((20, 30), dtype=np.float64))) + if sparse.issparse(adata.X): + adata.X.indptr = adata.X.indptr.astype(sparse_indexer_type) + adata.X.indices = adata.X.indices.astype(sparse_indexer_type) result = concat(adatas_orig, index_unique="-") assert result.X.dtype == np.int8 assert result.raw.X.dtype == np.float64 + if sparse.issparse(result.X): + # See https://github.com/scipy/scipy/issues/20389 for why this doesn't work with csc + if sparse_indexer_type == np.int64 and ( + issubclass(cpu_array_type, sparse.spmatrix) or adata.X.format == "csc" + ): + pytest.xfail( + "Data type int64 is not maintained for sparse matrices or csc array" + ) + assert result.X.indptr.dtype == sparse_indexer_type + assert result.X.indices.dtype == sparse_indexer_type # Leaving out for now. 
See definition of these values for explanation diff --git a/src/anndata/tests/test_concatenate_disk.py b/src/anndata/tests/test_concatenate_disk.py index 659fb98cf..1ccecc7b5 100644 --- a/src/anndata/tests/test_concatenate_disk.py +++ b/src/anndata/tests/test_concatenate_disk.py @@ -33,7 +33,7 @@ def axis(request): @pytest.fixture( - params=["array", "sparse"], + params=["array", "sparse", "sparse_array"], ) def array_type(request): return request.param @@ -96,6 +96,10 @@ def get_array_type(array_type, axis): if axis == 0: return sparse.csr_matrix return sparse.csc_matrix + if array_type == "sparse_array": + if axis == 0: + return sparse.csr_array + return sparse.csc_array if array_type == "array": return asarray else: diff --git a/src/anndata/tests/test_hdf5_backing.py b/src/anndata/tests/test_hdf5_backing.py index f7791de62..328185e97 100644 --- a/src/anndata/tests/test_hdf5_backing.py +++ b/src/anndata/tests/test_hdf5_backing.py @@ -8,6 +8,7 @@ from scipy import sparse import anndata as ad +from anndata.compat import SpArray from anndata.tests.helpers import ( GEN_ADATA_DASK_ARGS, as_dense_dask_array, @@ -182,6 +183,7 @@ def test_backed_raw(tmp_path): [ pytest.param(asarray, id="dense_array"), pytest.param(sparse.csr_matrix, id="csr_matrix"), + pytest.param(sparse.csr_array, id="csr_array"), ], ) def test_backed_raw_subset(tmp_path, array_type, subset_func, subset_func2): @@ -193,8 +195,8 @@ def test_backed_raw_subset(tmp_path, array_type, subset_func, subset_func2): var_idx = subset_func2(mem_adata.var_names) if ( array_type is asarray - and isinstance(obs_idx, (list, np.ndarray, sparse.spmatrix)) - and isinstance(var_idx, (list, np.ndarray, sparse.spmatrix)) + and isinstance(obs_idx, (list, np.ndarray, sparse.spmatrix, SpArray)) + and isinstance(var_idx, (list, np.ndarray, sparse.spmatrix, SpArray)) ): pytest.xfail( "Fancy indexing does not work with multiple arrays on a h5py.Dataset" diff --git a/src/anndata/tests/test_inplace_subset.py 
b/src/anndata/tests/test_inplace_subset.py index 110d2574a..5534ff9c7 100644 --- a/src/anndata/tests/test_inplace_subset.py +++ b/src/anndata/tests/test_inplace_subset.py @@ -13,8 +13,22 @@ @pytest.fixture( - params=[np.array, sparse.csr_matrix, sparse.csc_matrix, as_dense_dask_array], - ids=["np_array", "scipy_csr", "scipy_csc", "dask_array"], + params=[ + np.array, + sparse.csr_matrix, + sparse.csc_matrix, + sparse.csr_array, + sparse.csc_array, + as_dense_dask_array, + ], + ids=[ + "np_array", + "scipy_csr", + "scipy_csc", + "scipy_csr_array", + "scipy_csc_array", + "dask_array", + ], ) def matrix_type(request): return request.param diff --git a/src/anndata/tests/test_io_dispatched.py b/src/anndata/tests/test_io_dispatched.py index 511179d4c..564a2e788 100644 --- a/src/anndata/tests/test_io_dispatched.py +++ b/src/anndata/tests/test_io_dispatched.py @@ -7,6 +7,7 @@ from scipy import sparse import anndata as ad +from anndata.compat import SpArray from anndata.experimental import ( read_dispatched, read_elem, @@ -100,7 +101,7 @@ def set_copy(d, **kwargs): # TODO: Should the passed path be absolute? 
path = "/" + store.path + "/" + k if hasattr(elem, "shape") and not isinstance( - elem, (sparse.spmatrix, ad.AnnData) + elem, (sparse.spmatrix, SpArray, ad.AnnData) ): if re.match(r"^/((X)|(layers)).*", path): chunks = (M, N) diff --git a/src/anndata/tests/test_readwrite.py b/src/anndata/tests/test_readwrite.py index 0c87eae6e..1c2630f98 100644 --- a/src/anndata/tests/test_readwrite.py +++ b/src/anndata/tests/test_readwrite.py @@ -14,11 +14,12 @@ import pytest import zarr from numba.core.errors import NumbaDeprecationWarning -from scipy.sparse import csc_matrix, csr_matrix +from scipy import sparse +from scipy.sparse import csc_array, csc_matrix, csr_array, csr_matrix import anndata as ad from anndata._io.specs.registry import IORegistryError -from anndata.compat import DaskArray, _read_attr +from anndata.compat import DaskArray, SpArray, _read_attr from anndata.tests.helpers import ( as_dense_dask_array, assert_equal, @@ -105,7 +106,7 @@ def dtype(request): # ------------------------------------------------------------------------------ -@pytest.mark.parametrize("typ", [np.array, csr_matrix, as_dense_dask_array]) +@pytest.mark.parametrize("typ", [np.array, csr_matrix, csr_array, as_dense_dask_array]) def test_readwrite_roundtrip(typ, tmp_path, diskfmt, diskfmt2): tmpdir = Path(tmp_path) pth1 = tmpdir / f"first.{diskfmt}" @@ -130,7 +131,7 @@ def test_readwrite_roundtrip(typ, tmp_path, diskfmt, diskfmt2): @pytest.mark.parametrize("storage", ["h5ad", pytest.param("zarr", marks=[needs_zarr])]) -@pytest.mark.parametrize("typ", [np.array, csr_matrix, as_dense_dask_array]) +@pytest.mark.parametrize("typ", [np.array, csr_matrix, csr_array, as_dense_dask_array]) def test_readwrite_kitchensink(tmp_path, storage, typ, backing_h5ad, dataset_kwargs): X = typ(X_list) adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) @@ -160,7 +161,10 @@ def test_readwrite_kitchensink(tmp_path, storage, typ, backing_h5ad, dataset_kwa # either load as same type or load the 
convert DaskArray to array # since we tested if assigned types and loaded types are DaskArray # this would also work if they work - assert isinstance(adata_src.raw.X, (type(adata.raw.X), DaskArray)) + if isinstance(adata_src.raw.X, SpArray): + assert isinstance(adata.raw.X, sparse.spmatrix) + else: + assert isinstance(adata_src.raw.X, (type(adata.raw.X), DaskArray)) assert isinstance( adata_src.uns["uns4"]["c"], (type(adata.uns["uns4"]["c"]), DaskArray) ) @@ -173,7 +177,7 @@ def test_readwrite_kitchensink(tmp_path, storage, typ, backing_h5ad, dataset_kwa assert_equal(adata, adata_src) -@pytest.mark.parametrize("typ", [np.array, csr_matrix, as_dense_dask_array]) +@pytest.mark.parametrize("typ", [np.array, csr_matrix, csr_array, as_dense_dask_array]) def test_readwrite_maintain_X_dtype(typ, backing_h5ad): X = typ(X_list).astype("int8") adata_src = ad.AnnData(X) @@ -206,7 +210,7 @@ def test_maintain_layers(rw): assert not np.any((orig.layers["sparse"] != curr.layers["sparse"]).toarray()) -@pytest.mark.parametrize("typ", [np.array, csr_matrix, as_dense_dask_array]) +@pytest.mark.parametrize("typ", [np.array, csr_matrix, csr_array, as_dense_dask_array]) def test_readwrite_h5ad_one_dimension(typ, backing_h5ad): X = typ(X_list) adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) @@ -217,7 +221,7 @@ def test_readwrite_h5ad_one_dimension(typ, backing_h5ad): assert_equal(adata, adata_one) -@pytest.mark.parametrize("typ", [np.array, csr_matrix, as_dense_dask_array]) +@pytest.mark.parametrize("typ", [np.array, csr_matrix, csr_array, as_dense_dask_array]) def test_readwrite_backed(typ, backing_h5ad): X = typ(X_list) adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) @@ -232,7 +236,9 @@ def test_readwrite_backed(typ, backing_h5ad): assert_equal(adata, adata_src) -@pytest.mark.parametrize("typ", [np.array, csr_matrix, csc_matrix]) +@pytest.mark.parametrize( + "typ", [np.array, csr_matrix, csc_matrix, csr_array, csc_array] +) def 
test_readwrite_equivalent_h5ad_zarr(tmp_path, typ): h5ad_pth = tmp_path / "adata.h5ad" zarr_pth = tmp_path / "adata.zarr" diff --git a/src/anndata/tests/test_views.py b/src/anndata/tests/test_views.py index 7ac4cfefc..00e4d3425 100644 --- a/src/anndata/tests/test_views.py +++ b/src/anndata/tests/test_views.py @@ -7,13 +7,19 @@ import numpy as np import pandas as pd import pytest -from dask.base import normalize_token, tokenize +from dask.base import tokenize from scipy import sparse import anndata as ad from anndata._core.index import _normalize_index -from anndata._core.views import ArrayView, SparseCSCView, SparseCSRView -from anndata.compat import CupyCSCMatrix, DaskArray +from anndata._core.views import ( + ArrayView, + SparseCSCArrayView, + SparseCSCMatrixView, + SparseCSRArrayView, + SparseCSRMatrixView, +) +from anndata.compat import CAN_USE_SPARSE_ARRAY, CupyCSCMatrix, DaskArray from anndata.tests.helpers import ( BASE_MATRIX_PARAMS, CUPY_MATRIX_PARAMS, @@ -122,7 +128,7 @@ def test_view_subset_shapes(): assert {k: v.shape[0] for k, v in view.varm.items()} == {k: 5 for k in view.varm} -def test_modify_view_component(matrix_type, mapping_name): +def test_modify_view_component(matrix_type, mapping_name, request): adata = ad.AnnData( np.zeros((10, 10)), **{mapping_name: dict(m=matrix_type(asarray(sparse.random(10, 10))))}, @@ -146,6 +152,9 @@ def test_modify_view_component(matrix_type, mapping_name): assert init_hash == hash_func(adata) + if "sparse_array_dask_array" in request.node.callspec.id and CAN_USE_SPARSE_ARRAY: + assert False # sparse arrays in dask are general expected to fail but in this case they do not + @pytest.mark.parametrize("attr", ["obsm", "varm"]) def test_set_obsm_key(adata, attr): @@ -296,21 +305,7 @@ def test_not_set_subset_X(matrix_type_base, subset_func): assert not np.any(asarray(adata.X != orig_X_val)) assert init_hash == joblib.hash(adata) - - -@normalize_token.register(ad.AnnData) -def tokenize_anndata(adata: ad.AnnData): - res = [] 
- if adata.X is not None: - res.append(tokenize(adata.X)) - res.extend([tokenize(adata.obs), tokenize(adata.var)]) - for attr in ["obsm", "varm", "obsp", "varp", "layers"]: - elem = getattr(adata, attr) - res.append(tokenize(list(elem.items()))) - res.append(joblib.hash(adata.uns)) - if adata.raw is not None: - res.append(tokenize(adata.raw.to_adata())) - return tuple(res) + assert isinstance(subset.X, type(adata.X)) # TODO: Determine if this is the intended behavior, @@ -338,6 +333,7 @@ def test_not_set_subset_X_dask(matrix_type_no_gpu, subset_func): assert not np.any(asarray(adata.X != orig_X_val)) assert init_hash == tokenize(adata) + assert isinstance(subset.X, type(adata.X)) @IGNORE_SPARSE_EFFICIENCY_WARNING @@ -510,19 +506,24 @@ def test_view_of_view(matrix_type, subset_func, subset_func2): pytest.xfail("Other subset generating functions have trouble with this") var_s1 = subset_func(adata.var_names, min_size=4) var_view1 = adata[:, var_s1] + adata[:, var_s1].X var_s2 = subset_func2(var_view1.var_names) var_view2 = var_view1[:, var_s2] assert var_view2._adata_ref is adata + assert isinstance(var_view2.X, type(adata.X)) obs_s1 = subset_func(adata.obs_names, min_size=4) obs_view1 = adata[obs_s1, :] obs_s2 = subset_func2(obs_view1.obs_names) assert adata[obs_s1, :][:, var_s1][obs_s2, :]._adata_ref is adata + assert isinstance(obs_view1.X, type(adata.X)) view_of_actual_copy = adata[:, var_s1].copy()[obs_s1, :].copy()[:, var_s2].copy() view_of_view_copy = adata[:, var_s1][obs_s1, :][:, var_s2].copy() assert_equal(view_of_actual_copy, view_of_view_copy, exact=True) + assert isinstance(view_of_actual_copy.X, type(adata.X)) + assert isinstance(view_of_view_copy.X, type(adata.X)) def test_view_of_view_modification(): @@ -644,7 +645,12 @@ def test_viewness_propagation_allclose(adata): assert np.allclose(a.varm["o"], b.varm["o"].copy(), equal_nan=True) -@pytest.mark.parametrize("spmat", [sparse.csr_matrix, sparse.csc_matrix]) +spmat = [sparse.csr_matrix, 
sparse.csc_matrix] +if CAN_USE_SPARSE_ARRAY: + spmat += [sparse.csr_array, sparse.csc_array] + + +@pytest.mark.parametrize("spmat", spmat) def test_deepcopy_subset(adata, spmat: type): adata.obsp["arr"] = np.zeros((adata.n_obs, adata.n_obs)) adata.obsp["spmat"] = spmat((adata.n_obs, adata.n_obs)) @@ -656,15 +662,27 @@ def test_deepcopy_subset(adata, spmat: type): np.testing.assert_array_equal(adata.obsp["arr"].shape, (10, 10)) assert isinstance(adata.obsp["spmat"], spmat) + view_type = ( + SparseCSRMatrixView if spmat is sparse.csr_matrix else SparseCSCMatrixView + ) + if CAN_USE_SPARSE_ARRAY: + view_type = ( + SparseCSRArrayView if spmat is sparse.csr_array else SparseCSCArrayView + ) assert not isinstance( adata.obsp["spmat"], - SparseCSRView if spmat is sparse.csr_matrix else SparseCSCView, + view_type, ) np.testing.assert_array_equal(adata.obsp["spmat"].shape, (10, 10)) +array_type = [asarray, sparse.csr_matrix, sparse.csc_matrix] +if CAN_USE_SPARSE_ARRAY: + array_type += [sparse.csr_array, sparse.csc_array] + + # https://github.com/scverse/anndata/issues/680 -@pytest.mark.parametrize("array_type", [asarray, sparse.csr_matrix, sparse.csc_matrix]) +@pytest.mark.parametrize("array_type", array_type) @pytest.mark.parametrize("attr", ["X", "layers", "obsm", "varm", "obsp", "varp"]) def test_view_mixin_copies_data(adata, array_type: type, attr): N = 100 diff --git a/src/anndata/tests/test_x.py b/src/anndata/tests/test_x.py index fd8e2f613..2d41aaee2 100644 --- a/src/anndata/tests/test_x.py +++ b/src/anndata/tests/test_x.py @@ -15,6 +15,8 @@ UNLABELLED_ARRAY_TYPES = [ pytest.param(sparse.csr_matrix, id="csr"), pytest.param(sparse.csc_matrix, id="csc"), + pytest.param(sparse.csr_array, id="csr_array"), + pytest.param(sparse.csc_array, id="csc_array"), pytest.param(asarray, id="ndarray"), ] SINGULAR_SHAPES = [ @@ -33,8 +35,25 @@ def diskfmt(request): def test_setter_singular_dim(shape, orig_array_type, new_array_type): # https://github.com/scverse/anndata/issues/500 
adata = gen_adata(shape, X_type=orig_array_type) - adata.X = new_array_type(np.ones(shape)) + to_assign = new_array_type(np.ones(shape)) + adata.X = to_assign np.testing.assert_equal(asarray(adata.X), 1) + assert isinstance(adata.X, type(to_assign)) + + +@pytest.mark.parametrize("orig_array_type", UNLABELLED_ARRAY_TYPES) +@pytest.mark.parametrize("new_array_type", UNLABELLED_ARRAY_TYPES) +def test_setter_view(orig_array_type, new_array_type): + adata = gen_adata((10, 10), X_type=orig_array_type) + orig_X = adata.X + to_assign = new_array_type(np.ones((9, 9))) + if isinstance(orig_X, np.ndarray) and sparse.issparse(to_assign): + # https://github.com/scverse/anndata/issues/500 + pytest.xfail("Cannot set a dense array with a sparse array") + view = adata[:9, :9] + view.X = to_assign + np.testing.assert_equal(asarray(view.X), np.ones((9, 9))) + assert isinstance(view.X, type(orig_X)) ############################### diff --git a/src/anndata/utils.py b/src/anndata/utils.py index 9c700e28b..a26affd6d 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -11,7 +11,7 @@ from scipy import sparse from ._core.sparse_dataset import BaseCompressedSparseDataset -from .compat import CupyArray, CupySparseMatrix, DaskArray +from .compat import CupyArray, CupySparseMatrix, DaskArray, SpArray from .logging import get_logger if TYPE_CHECKING: @@ -44,6 +44,7 @@ def asarray(x): return np.asarray(x) +@asarray.register(SpArray) @asarray.register(sparse.spmatrix) def asarray_sparse(x): return x.toarray()