From 672390f15322133442854fdfb3ae3a931623ecf8 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 20 Nov 2023 18:37:25 +0100 Subject: [PATCH 001/348] (feat): first steps, gets up to in-memory in notebook --- .gitignore | 1 + anndata/_core/anndata.py | 41 ++++- anndata/experimental/__init__.py | 2 + anndata/experimental/backed/__init__.py | 0 anndata/experimental/backed/_io.py | 124 ++++++++++++++ anndata/experimental/backed/_lazy_arrays.py | 176 ++++++++++++++++++++ anndata/experimental/backed/_xarray.py | 48 ++++++ 7 files changed, 384 insertions(+), 8 deletions(-) create mode 100644 anndata/experimental/backed/__init__.py create mode 100644 anndata/experimental/backed/_io.py create mode 100644 anndata/experimental/backed/_lazy_arrays.py create mode 100644 anndata/experimental/backed/_xarray.py diff --git a/.gitignore b/.gitignore index dded609a6..3f66dbd99 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ __pycache__/ /*cache/ /data/ +/venv/ # Distribution / packaging /dist/ diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index b37e33177..af6d582da 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -55,7 +55,6 @@ from .sparse_dataset import BaseCompressedSparseDataset, sparse_dataset from .views import ( ArrayView, - DataFrameView, DictView, _resolve_idxs, as_view, @@ -354,6 +353,20 @@ def __init__( oidx: Index1D = None, vidx: Index1D = None, ): + if "Dataset2D" in str(type(obs)): + from ..experimental.backed._xarray import Dataset2D + + @_gen_dataframe.register(Dataset2D) + def _gen_dataframe_xr( + anno: Dataset2D, + index_names: Iterable[str], + *, + source: Literal["X", "shape"], + attr: Literal["obs", "var"], + length: int | None = None, + ): + return anno + if asview: if not isinstance(X, AnnData): raise ValueError("`X` has to be an AnnData object.") @@ -413,11 +426,11 @@ def _init_as_view(self, adata_ref: AnnData, oidx: Index, vidx: Index): self._varp = adata_ref.varp._view(self, vidx) # fix categories uns = copy(adata_ref._uns) - self._remove_unused_categories(adata_ref.obs, obs_sub, uns) - self._remove_unused_categories(adata_ref.var, var_sub, uns) + # self._remove_unused_categories(adata_ref.obs, obs_sub, uns) + # self._remove_unused_categories(adata_ref.var, var_sub, uns) # set attributes - self._obs = DataFrameView(obs_sub, view_args=(self, "obs")) - self._var = DataFrameView(var_sub, view_args=(self, "var")) + self._obs = as_view(obs_sub, view_args=(self, "obs")) + self._var = as_view(var_sub, view_args=(self, "var")) self._uns = uns # set data @@ -571,7 +584,7 @@ def _init_as_actual( _move_adj_mtx({"uns": self._uns, "obsp": self._obsp}) self._check_dimensions() - self._check_uniqueness() + # self._check_uniqueness() if self.filename: assert not isinstance( @@ -926,7 +939,13 @@ def obs(self): @property def obs_names(self) -> pd.Index: """Names of observations (alias for `.obs.index`).""" - return self.obs.index + if hasattr(self.obs, "index"): + return self.obs.index + return pd.Index( + self.obs["obs_names"].data.compute() + if isinstance(self.obs["obs_names"].data, DaskArray) + else self.obs["obs_names"].data + ) @obs_names.setter def obs_names(self, names: Sequence[str]): @@ -949,7 +968,13 @@ def var(self): @property def var_names(self) -> pd.Index: """Names of variables (alias for `.var.index`).""" - return self.var.index + if hasattr(self.var, "index"): + return self.var.index + return pd.Index( + self.var["var_names"].data.compute() + if isinstance(self.var["var_names"].data, DaskArray) + else self.var["var_names"].data + ) 
@var_names.setter def var_names(self, names: Sequence[str]): diff --git a/anndata/experimental/__init__.py b/anndata/experimental/__init__.py index 486f14e8d..f189196da 100644 --- a/anndata/experimental/__init__.py +++ b/anndata/experimental/__init__.py @@ -4,6 +4,7 @@ from anndata._io.specs import IOSpec, read_elem, write_elem from ._dispatch_io import read_dispatched, write_dispatched +from .backed import read_backed from .merge import concat_on_disk from .multi_files import AnnCollection from .pytorch import AnnLoader @@ -20,4 +21,5 @@ "sparse_dataset", "CSRDataset", "CSCDataset", + "read_backed", ] diff --git a/anndata/experimental/backed/__init__.py b/anndata/experimental/backed/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/anndata/experimental/backed/_io.py b/anndata/experimental/backed/_io.py new file mode 100644 index 000000000..99420a6a9 --- /dev/null +++ b/anndata/experimental/backed/_io.py @@ -0,0 +1,124 @@ +from __future__ import annotations + +from pathlib import Path +from typing import ( + TYPE_CHECKING, +) + +if TYPE_CHECKING: + from collections.abc import MutableMapping + +import dask.array as da +import h5py +import xarray as xr +import zarr + +from ..._core.anndata import AnnData +from ..._core.sparse_dataset import sparse_dataset +from ...compat import DaskArray +from .. import read_dispatched +from ._lazy_arrays import LazyCategoricalArray, LazyMaskedArray +from ._xarray import Dataset2D + + +def read_backed( + store: str | Path | MutableMapping | zarr.Group | h5py.Dataset, +) -> AnnData: + """Lazily read in on-disk/in-cloud AnnData stores. A new, but familiar, AnnData object will be returned. + No array data should need to be read into memory, with exception of non-obs/var dataframes and Awkward Arrays. + + Args: + store (Union[str, Path, MutableMapping, zarr.Group, h5py.Dataset]): A store-like object to be read in. If `zarr`, it is best + for it to be consolidated. + + Returns: + AnnData: A lazily read-in AnnData object. 
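+
+    Example:
+        A hypothetical sketch; the store path and column name below are
+        placeholders rather than part of the API:
+
+        >>> adata = read_backed("path/to/adata.zarr")  # nothing is read into memory yet
+        >>> mask = (adata.obs["cell_type"] == "T cell").data  # `.data` unwraps the xarray column
+        >>> subset = adata[mask, :]  # subsetting is also lazy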
+ """ + is_h5 = False + if isinstance(store, Path) or isinstance(store, str): + store = str(store) + if store.endswith("h5ad"): + is_h5 = True + + has_keys = True # true if consolidated or h5ad + if not is_h5: + try: + f = zarr.open_consolidated(store, mode="r") + except KeyError: + has_keys = False + f = zarr.open(store, mode="r") + else: + f = h5py.File(store, mode="r") + + def callback(func, elem_name: str, elem, iospec): + if iospec.encoding_type == "anndata" or elem_name.endswith("/"): + cols = ["obs", "var", "obsm", "varm", "obsp", "varp", "layers", "X", "raw"] + iter_object = ( + elem.items() if has_keys else [(k, elem[k]) for k in cols if k in elem] + ) + return AnnData(**{k: read_dispatched(v, callback) for k, v in iter_object}) + elif elem_name.startswith("/raw"): + return None + elif iospec.encoding_type in {"dataframe"}: + iter_object = [(k, elem[k]) for k in elem.attrs["column-order"]] + [ + (elem.attrs["_index"], elem[elem.attrs["_index"]]) + ] + d = {k: read_dispatched(v, callback) for k, v in iter_object} + d_with_xr = {} + index_label = f'{elem_name.replace("/", "")}_names' + for k in d: + v = d[k] + if type(v) == DaskArray and k != elem.attrs["_index"]: + d_with_xr[k] = xr.DataArray( + v, coords=[d[elem.attrs["_index"]]], dims=[index_label], name=k + ) + elif ( + type(v) == LazyCategoricalArray or type(v) == LazyMaskedArray + ) and k != elem.attrs["_index"]: + d_with_xr[k] = xr.DataArray( + xr.core.indexing.LazilyIndexedArray(v), + coords=[d[elem.attrs["_index"]]], + dims=[index_label], + name=k, + ) + elif k == elem.attrs["_index"]: + d_with_xr[index_label] = xr.DataArray( + v, coords=[v], dims=[index_label], name=index_label + ) + else: + d_with_xr[k] = v + return Dataset2D(d_with_xr) + elif iospec.encoding_type == "categorical": + drop_unused_cats = not ( + elem_name.startswith("/obsm") or elem_name.startswith("/varm") + ) + return LazyCategoricalArray( + elem["codes"], elem["categories"], elem.attrs, drop_unused_cats + ) + elif "nullable" in iospec.encoding_type: + return LazyMaskedArray( + elem["values"], + elem["mask"] if "mask" in elem else None, + iospec.encoding_type, + ) + elif iospec.encoding_type in {"array", "string-array"}: + if is_h5: + if iospec.encoding_type == "string-array": + if ( + "read_dataset" not in dir() + ): # avoid circular dependency, not sure what caused this all of a sudden after merging https://github.com/scverse/anndata/pull/949/commits/dc9f12fcbca977841e967c8414b9f1032e069250 + from ..._io.h5ad import read_dataset + elem = read_dataset(elem) + if not hasattr(elem, "chunks") or elem.chunks is None: + return da.from_array(elem, chunks=(1000,) * len(elem.shape)) + return da.from_array(elem) + return da.from_zarr(elem) + elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: + return sparse_dataset(elem) + elif iospec.encoding_type in {"awkward-array"}: + return read_dispatched(elem, None) + return func(elem) + + adata = read_dispatched(f, callback=callback) + + return adata diff --git a/anndata/experimental/backed/_lazy_arrays.py b/anndata/experimental/backed/_lazy_arrays.py new file mode 100644 index 000000000..59236fd75 --- /dev/null +++ b/anndata/experimental/backed/_lazy_arrays.py @@ -0,0 +1,176 @@ +from __future__ import annotations + +import numpy as np +import pandas as pd +import xarray as xr +from xarray.core.indexing import ( + BasicIndexer, + ExplicitlyIndexedNDArrayMixin, + OuterIndexer, +) + +from anndata._core.index import Index, _subset +from anndata._core.views import as_view +from anndata.compat import ZarrArray + + 
+class MaskedArrayMixIn(ExplicitlyIndexedNDArrayMixin):
+    def __eq__(self, __o) -> np.ndarray:
+        return self[...] == __o
+
+    def __ne__(self, __o) -> np.ndarray:
+        return ~(self == __o)
+
+    @property
+    def shape(self) -> tuple[int, ...]:
+        """Shape of this array.
+
+        Returns:
+            Tuple[int, ...]: A shape that looks like a 1-d shape i.e., (#, )
+        """
+        return self.values.shape
+
+
+class LazyCategoricalArray(MaskedArrayMixIn):
+    __slots__ = (
+        "values",
+        "attrs",
+        "_categories",
+        "_categories_cache",
+        "group",
+        "_drop_unused_cats",
+    )
+
+    def __init__(self, codes, categories, attrs, _drop_unused_cats, *args, **kwargs):
+        """Class for lazily reading categorical data from formatted zarr group. Used as base for `LazilyIndexedArray`.
+
+        Args:
+            codes (Union[zarr.Array, h5py.Dataset]): values (integers) of the array, one for each element
+            categories (Union[zarr.Array, h5py.Dataset]): mappings from values to strings
+            attrs (Union[zarr.Array, h5py.Dataset]): attrs containing boolean "ordered"
+            _drop_unused_cats (bool): Whether or not to drop unused categories.
+        """
+        self.values = codes
+        self._categories = categories
+        self._categories_cache = None
+        self.attrs = dict(attrs)
+        self._drop_unused_cats = _drop_unused_cats  # obsm/varm do not drop, but obs and var do. TODO: Should fix in normal AnnData?
+
+    @property
+    def categories(self):  # __slots__ and cached_property are incompatible
+        if self._categories_cache is None:
+            if isinstance(self._categories, ZarrArray):
+                self._categories_cache = self._categories[...]
+            else:
+                if (
+                    "read_dataset" not in dir()
+                ):  # avoid circular dependency, not sure what caused this all of a sudden after merging https://github.com/scverse/anndata/pull/949/commits/dc9f12fcbca977841e967c8414b9f1032e069250
+                    from ..._io.h5ad import read_dataset
+                self._categories_cache = read_dataset(self._categories)
+        return self._categories_cache
+
+    @property
+    def dtype(self) -> pd.CategoricalDtype:
+        return pd.CategoricalDtype(self.categories, self.ordered)
+
+    @property
+    def ordered(self):
+        return bool(self.attrs["ordered"])
+
+    def __getitem__(self, selection) -> pd.Categorical:
+        idx = selection
+        if isinstance(selection, BasicIndexer) or isinstance(selection, OuterIndexer):
+            idx = selection.tuple[0]  # need to better understand this
+        if isinstance(self.values, ZarrArray):
+            codes = self.values.oindex[idx]
+        else:
+            codes = self.values[idx]
+        if codes.shape == ():  # handle 0d case
+            codes = np.array([codes])
+        res = pd.Categorical.from_codes(
+            codes=codes,
+            categories=self.categories,
+            ordered=self.ordered,
+        )
+        if self._drop_unused_cats:
+            return res.remove_unused_categories()
+        return res
+
+    def __repr__(self) -> str:
+        return f"LazyCategoricalArray(codes=..., categories={self.categories}, ordered={self.ordered})"
+
+    def copy(self) -> LazyCategoricalArray:
+        """Returns a copy of this array which can then be safely edited
+
+        Returns:
+            LazyCategoricalArray: copied LazyCategoricalArray
+        """
+        arr = LazyCategoricalArray(
+            self.values, self._categories, self.attrs, self._drop_unused_cats
+        )  # self.categories reads in data
+        return arr
+
+
+class LazyMaskedArray(MaskedArrayMixIn):
+    __slots__ = ("mask", "values", "_dtype_str")
+
+    def __init__(self, values, mask, dtype_str, *args, **kwargs):
+        """Class for lazily reading masked (nullable integer/boolean) data from formatted zarr group. Used as base for `LazilyIndexedArray`.
+
+        Args:
+            values (Union[zarr.Array, h5py.Dataset]): Integer/Boolean array of values
+            mask (Union[zarr.Array, h5py.Dataset]): mask indicating which values are null (True marks a missing entry, matching pandas' masked-array convention)
+            dtype_str (str): one of `nullable-integer` or `nullable-boolean`
+        """
+        self.values = values
+        self.mask = mask
+        self._dtype_str = dtype_str
+
+    @property
+    def dtype(self):  # returns the pandas masked-array class (or the pd.array factory), not a dtype instance
+        if self.mask is not None:
+            if self._dtype_str == "nullable-integer":
+                return pd.arrays.IntegerArray
+            elif self._dtype_str == "nullable-boolean":
+                return pd.arrays.BooleanArray
+        return pd.array
+
+    def __getitem__(self, selection) -> pd.api.extensions.ExtensionArray:
+        idx = selection
+        if isinstance(selection, BasicIndexer) or isinstance(selection, OuterIndexer):
+            idx = selection.tuple[0]  # need to understand this better
+        if isinstance(idx, int):
+            idx = slice(idx, idx + 1)
+        values = np.array(self.values[idx])
+        if self.mask is not None:
+            mask = np.array(self.mask[idx])
+            if self._dtype_str == "nullable-integer":
+                return pd.arrays.IntegerArray(values, mask=mask)
+            elif self._dtype_str == "nullable-boolean":
+                return pd.arrays.BooleanArray(values, mask=mask)
+        return pd.array(values)
+
+    def __repr__(self) -> str:
+        if self._dtype_str == "nullable-integer":
+            return "LazyNullableIntegerArray"
+        elif self._dtype_str == "nullable-boolean":
+            return "LazyNullableBooleanArray"
+
+    def copy(self) -> LazyMaskedArray:
+        """Returns a copy of this array which can then be safely edited
+
+        Returns:
+            LazyMaskedArray: copied LazyMaskedArray
+        """
+        arr = LazyMaskedArray(self.values, self.mask, self._dtype_str)
+        return arr
+
+
+@_subset.register(xr.DataArray)
+def _subset_masked(a: xr.DataArray, subset_idx: Index):
+    return a[subset_idx]
+
+
+@as_view.register(xr.DataArray)
+def _view_pd_boolean_array(a: xr.DataArray, view_args):
+    return a
diff --git a/anndata/experimental/backed/_xarray.py b/anndata/experimental/backed/_xarray.py
new file mode 100644
index 000000000..84bdc0223
--- /dev/null
+++ b/anndata/experimental/backed/_xarray.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+import xarray as xr
+
+from anndata._core.index import Index, _subset
+from anndata._core.views import as_view
+
+
+def get_index_dim(ds):
+    assert (
+        len(ds.dims) == 1
+    ), f"xarray Dataset should have exactly 1 dim, found {len(ds.dims)}"
+    return list(ds.dims.keys())[0]
+
+
+class Dataset2D(xr.Dataset):
+    @property
+    def shape(
+        self,
+    ):  # aligned mapping classes look for this for DataFrames so this ensures usability with e.g., obsm
+        return [self.dims[get_index_dim(self)], len(self)]
+
+    @property
+    def iloc(self):
+        class IlocGetter:
+            def __init__(self, ds):
+                self._ds = ds
+
+            def __getitem__(self, idx):
+                coords = list(self._ds.coords.keys())[0]
+                return self._ds.isel(**{coords: idx})
+
+        return IlocGetter(self)
+
+
+@_subset.register(Dataset2D)
+def _(a: xr.DataArray, subset_idx: Index):
+    key = get_index_dim(a)
+    if (
+        isinstance(subset_idx, tuple) and len(subset_idx) == 1
+    ):  # xarray seems to have some code looking for a second entry in tuples
+        return a.isel(**{key: subset_idx[0]})
+    return a.isel(**{key: subset_idx})
+
+
+@as_view.register(Dataset2D)
+def _(a: Dataset2D, view_args):
+    return a
From 73489977f0c2d0855a4a6512a649fdfa6902e86c Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 21 Nov 2023 12:39:10 +0100
Subject: [PATCH 002/348] (feat): `backed_to_memory` function

---
 anndata/experimental/__init__.py        |  2 +
 anndata/experimental/backed/__init__.py |  5 ++
 anndata/experimental/backed/_io.py      | 78 ++++++++++++++++++++++++-
 3 
files changed, 84 insertions(+), 1 deletion(-) diff --git a/anndata/experimental/__init__.py b/anndata/experimental/__init__.py index f189196da..d3355744b 100644 --- a/anndata/experimental/__init__.py +++ b/anndata/experimental/__init__.py @@ -5,6 +5,7 @@ from ._dispatch_io import read_dispatched, write_dispatched from .backed import read_backed +from .backed import to_memory as backed_to_memory from .merge import concat_on_disk from .multi_files import AnnCollection from .pytorch import AnnLoader @@ -22,4 +23,5 @@ "CSRDataset", "CSCDataset", "read_backed", + "backed_to_memory", ] diff --git a/anndata/experimental/backed/__init__.py b/anndata/experimental/backed/__init__.py index e69de29bb..32aa48b77 100644 --- a/anndata/experimental/backed/__init__.py +++ b/anndata/experimental/backed/__init__.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +from ._io import read_backed + +__all__ = ["read_backed", "backed_to_memory"] diff --git a/anndata/experimental/backed/_io.py b/anndata/experimental/backed/_io.py index 99420a6a9..ea2613977 100644 --- a/anndata/experimental/backed/_io.py +++ b/anndata/experimental/backed/_io.py @@ -5,6 +5,8 @@ TYPE_CHECKING, ) +import pandas as pd + if TYPE_CHECKING: from collections.abc import MutableMapping @@ -14,13 +16,87 @@ import zarr from ..._core.anndata import AnnData -from ..._core.sparse_dataset import sparse_dataset +from ..._core.sparse_dataset import BaseCompressedSparseDataset, sparse_dataset from ...compat import DaskArray +from ...utils import convert_to_dict from .. import read_dispatched from ._lazy_arrays import LazyCategoricalArray, LazyMaskedArray from ._xarray import Dataset2D +def to_memory(adata, exclude=[]): + # nullable and categoricals need special handling because xarray will convert them to numpy arrays first with dtype object + def get_nullable_and_categorical_cols(ds): + cols = [] + for c in ds: + dtype = ds[c].dtype + if ( + isinstance(dtype, pd.CategoricalDtype) + or dtype == pd.arrays.BooleanArray + or dtype == pd.arrays.IntegerArray + ): + cols += [c] + return cols + + def to_df(ds, exclude_vars=[]): + nullable_and_categorical_df_cols = get_nullable_and_categorical_cols(ds) + drop_vars = [ + k for k in set(exclude_vars + nullable_and_categorical_df_cols) if k in ds + ] + df = ds.drop_vars(drop_vars).to_dataframe() + for c in nullable_and_categorical_df_cols: + if c not in exclude_vars: + df[c] = ds[c].data[()] + df.index.name = None # matches old AnnData object + if len(exclude_vars) == 0: + df = df[list(ds.keys())] + return df + + # handling for AxisArrays + def backed_dict_to_memory(d, prefix): + res = {} + for k, v in d.items(): + full_key = prefix + "/" + k + if any([full_key == exclude_key for exclude_key in exclude]): + continue + if isinstance(v, DaskArray): + res[k] = v.compute() + elif isinstance(v, BaseCompressedSparseDataset): + res[k] = v.to_memory() + elif isinstance(v, Dataset2D): + res[k] = to_df(v) + else: + res[k] = v + return res + + exclude_obs = [key.replace("obs/", "") for key in exclude if key.startswith("obs/")] + obs = to_df(adata.obs, exclude_obs) + exclude_var = [key.replace("var/", "") for key in exclude if key.startswith("var/")] + var = to_df(adata.var, exclude_var) + obsm = backed_dict_to_memory(convert_to_dict(adata.obsm), "obsm") + varm = backed_dict_to_memory(convert_to_dict(adata.varm), "varm") + varp = backed_dict_to_memory(convert_to_dict(adata.varp), "varp") + obsp = backed_dict_to_memory(convert_to_dict(adata.obsp), "obsp") + layers = backed_dict_to_memory(dict(adata.layers), "layers") 
+ X = None + if "X" not in exclude: + if isinstance(adata.X, BaseCompressedSparseDataset): + X = adata.X.to_memory() + else: + X = adata.X.compute() + return AnnData( + X=X, + obs=obs, + var=var, + obsm=obsm, + varm=varm, + obsp=obsp, + varp=varp, + layers=layers, + uns=adata.uns, + ) + + def read_backed( store: str | Path | MutableMapping | zarr.Group | h5py.Dataset, ) -> AnnData: From b9712c86e006498cde09f4f03fcee72258883d23 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 21 Nov 2023 13:39:37 +0100 Subject: [PATCH 003/348] (chore): add comment about dropping cats --- anndata/_core/anndata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index af6d582da..9df84636d 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -426,7 +426,7 @@ def _init_as_view(self, adata_ref: AnnData, oidx: Index, vidx: Index): self._varp = adata_ref.varp._view(self, vidx) # fix categories uns = copy(adata_ref._uns) - # self._remove_unused_categories(adata_ref.obs, obs_sub, uns) + # self._remove_unused_categories(adata_ref.obs, obs_sub, uns) # not going to work with xarray # self._remove_unused_categories(adata_ref.var, var_sub, uns) # set attributes self._obs = as_view(obs_sub, view_args=(self, "obs")) From 12e5c6827d1bc10edda0cf4e4ba5ae91e3b59885 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 21 Nov 2023 13:40:08 +0100 Subject: [PATCH 004/348] (fix): add copy mechanism in `sparse_dataset` for `zarr` --- anndata/_core/sparse_dataset.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index 360872bff..e0c70faad 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -16,6 +16,7 @@ import warnings from abc import ABC from itertools import accumulate, chain +from pathlib import Path from typing import TYPE_CHECKING, Literal, NamedTuple import h5py @@ -24,9 +25,8 @@ from scipy.sparse import _sparsetools from anndata._core.index import _fix_slice_bounds -from anndata.compat import H5Group, ZarrGroup -from ..compat import _read_attr +from ..compat import H5Group, ZarrArray, ZarrGroup, _read_attr try: # Not really important, just for IDEs to be more helpful @@ -57,6 +57,14 @@ class BackedSparseMatrix(_cs_matrix): def copy(self) -> ss.spmatrix: if isinstance(self.data, h5py.Dataset): return sparse_dataset(self.data.parent).to_memory() + if isinstance(self.data, ZarrArray): + import zarr + + return sparse_dataset( + zarr.open( + store=self.data.store, path=Path(self.data.path).parent, mode="r" + ) + ).to_memory() else: return super().copy() From 97aa5a7652fa154f2d7d21f4074cb598c9c8bd52 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 21 Nov 2023 13:40:20 +0100 Subject: [PATCH 005/348] (feat): always consolidate metadata --- anndata/_io/zarr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anndata/_io/zarr.py b/anndata/_io/zarr.py index 00f9766f0..0c4dd673b 100644 --- a/anndata/_io/zarr.py +++ b/anndata/_io/zarr.py @@ -46,6 +46,7 @@ def callback(func, s, k, elem, dataset_kwargs, iospec): func(s, k, elem, dataset_kwargs=dataset_kwargs) write_dispatched(f, "/", adata, callback=callback, dataset_kwargs=ds_kwargs) + zarr.consolidate_metadata(f.store) def read_zarr(store: str | Path | MutableMapping | zarr.Group) -> AnnData: From 93dec659dc241e716218358c0f613fde0b9028b1 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 21 Nov 2023 13:52:31 +0100 Subject: [PATCH 006/348] (feat): first pass at 
tests --- anndata/experimental/backed/__init__.py | 4 +- anndata/experimental/backed/_io.py | 8 +- anndata/experimental/backed/_lazy_arrays.py | 4 +- .../tests/test_read_backed_experimental.py | 429 ++++++++++++++++++ 4 files changed, 437 insertions(+), 8 deletions(-) create mode 100644 anndata/tests/test_read_backed_experimental.py diff --git a/anndata/experimental/backed/__init__.py b/anndata/experimental/backed/__init__.py index 32aa48b77..2e9b6519d 100644 --- a/anndata/experimental/backed/__init__.py +++ b/anndata/experimental/backed/__init__.py @@ -1,5 +1,5 @@ from __future__ import annotations -from ._io import read_backed +from ._io import read_backed, to_memory -__all__ = ["read_backed", "backed_to_memory"] +__all__ = ["read_backed", "to_memory"] diff --git a/anndata/experimental/backed/_io.py b/anndata/experimental/backed/_io.py index ea2613977..5446d98d1 100644 --- a/anndata/experimental/backed/_io.py +++ b/anndata/experimental/backed/_io.py @@ -82,8 +82,10 @@ def backed_dict_to_memory(d, prefix): if "X" not in exclude: if isinstance(adata.X, BaseCompressedSparseDataset): X = adata.X.to_memory() - else: + elif isinstance(adata.X, DaskArray): X = adata.X.compute() + else: + X = adata.X return AnnData( X=X, obs=obs, @@ -165,9 +167,7 @@ def callback(func, elem_name: str, elem, iospec): d_with_xr[k] = v return Dataset2D(d_with_xr) elif iospec.encoding_type == "categorical": - drop_unused_cats = not ( - elem_name.startswith("/obsm") or elem_name.startswith("/varm") - ) + drop_unused_cats = False # always don't because the `AnnData` object cannot drop them for us, so getting tests to pass means we need to leave this. return LazyCategoricalArray( elem["codes"], elem["categories"], elem.attrs, drop_unused_cats ) diff --git a/anndata/experimental/backed/_lazy_arrays.py b/anndata/experimental/backed/_lazy_arrays.py index 59236fd75..7ee6adf44 100644 --- a/anndata/experimental/backed/_lazy_arrays.py +++ b/anndata/experimental/backed/_lazy_arrays.py @@ -41,7 +41,7 @@ class LazyCategoricalArray(MaskedArrayMixIn): "_drop_unused_cats", ) - def __init__(self, codes, categories, attrs, _drop_unused_cats, *args, **kwargs): + def __init__(self, codes, categories, attrs, drop_unused_cats, *args, **kwargs): """Class for lazily reading categorical data from formatted zarr group. Used as base for `LazilyIndexedArray`. Args: @@ -54,7 +54,7 @@ def __init__(self, codes, categories, attrs, _drop_unused_cats, *args, **kwargs) self._categories = categories self._categories_cache = None self.attrs = dict(attrs) - self._drop_unused_cats = _drop_unused_cats # obsm/varm do not drop, but obs and var do. TODO: Should fix in normal AnnData? + self._drop_unused_cats = drop_unused_cats # obsm/varm do not drop, but obs and var do. TODO: Should fix in normal AnnData? 
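+        # (in-memory AnnData only prunes unused categories on obs/var when creating
+        # a view, never on obsm/varm, so this flag mirrors that behavior here)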
@property def categories(self): # __slots__ and cached_property are incompatible diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py new file mode 100644 index 000000000..46b60f6e2 --- /dev/null +++ b/anndata/tests/test_read_backed_experimental.py @@ -0,0 +1,429 @@ +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest +import zarr +from scipy import sparse +from zarr import DirectoryStore + +from anndata._core.anndata import AnnData +from anndata.experimental import backed_to_memory, read_backed +from anndata.experimental.backed._lazy_arrays import ( + LazyCategoricalArray, + LazyMaskedArray, +) +from anndata.tests.helpers import ( + as_dense_dask_array, + assert_equal, + gen_adata, + gen_typed_df, +) + + +class AccessTrackingStore(DirectoryStore): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._access_count = {} + self._accessed_keys = set() + + def __getitem__(self, key): + for tracked in self._access_count: + if tracked in key: + # import traceback + # traceback.print_stack() + self._access_count[tracked] += 1 + self._accessed_keys.add(key) + return super().__getitem__(key) + + def get_access_count(self, key): + return self._access_count[key] + + def get_subkeys_accessed(self, key): + return [k for k in self._accessed_keys if key in k] + + def set_key_trackers(self, keys_to_track): + for k in keys_to_track: + self._access_count[k] = 0 + + +@pytest.fixture( + params=[sparse.csr_matrix, sparse.csc_matrix, np.array, as_dense_dask_array], + ids=["scipy-csr", "scipy-csc", "np-array", "dask_array"], +) +def mtx_format(request): + return request.param + + +@pytest.fixture(params=[sparse.csr_matrix, sparse.csc_matrix]) +def sparse_format(request): + return request.param + + +@pytest.fixture(params=["zarr", "h5ad"]) +def dskfmt(request): + return request.param + + +@pytest.fixture() +def categorical_lazy_arr(tmp_path_factory): + base_path = tmp_path_factory.getbasetemp() + z = zarr.open_group(base_path, mode="w") + z["codes"] = np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2]) + z["categories"] = np.array(["foo", "bar", "jazz"]) + z.attrs["ordered"] = False + z = zarr.open(base_path) + return LazyCategoricalArray(z["codes"], z["categories"], z.attrs, True) + + +@pytest.fixture() +def nullable_boolean_lazy_arr(tmp_path_factory): + base_path = tmp_path_factory.getbasetemp() + z = zarr.open_group(base_path, mode="w") + z["values"] = np.array( + [ + True, + False, + True, + False, + False, + True, + False, + False, + True, + True, + False, + False, + False, + True, + False, + True, + ] + ) + z["mask"] = np.array( + [ + True, + True, + True, + True, + True, + False, + False, + True, + False, + True, + True, + True, + True, + False, + True, + False, + ] + ) + z = zarr.open(base_path) + return LazyMaskedArray(z["values"], z["mask"], "nullable-boolean") + + +@pytest.fixture() +def nullable_boolean_lazy_arr_no_mask(tmp_path_factory): + base_path = tmp_path_factory.getbasetemp() + z = zarr.open_group(base_path, mode="w") + z["values"] = np.array( + [ + True, + False, + True, + False, + False, + True, + False, + False, + True, + True, + False, + False, + False, + True, + False, + True, + ] + ) + z = zarr.open(base_path) + return LazyMaskedArray(z["values"], None, "nullable-boolean") + + +@pytest.fixture() +def nullable_integer_lazy_arr(tmp_path_factory): + base_path = tmp_path_factory.getbasetemp() + z = zarr.open_group(base_path, mode="w") 
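+    # layout mirrors anndata's on-disk "nullable-integer" encoding: an integer
+    # "values" array plus a boolean "mask" array where True marks a missing entry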
+ z["values"] = np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2]) + z["mask"] = np.array( + [ + True, + True, + True, + True, + True, + False, + False, + True, + False, + True, + True, + True, + True, + False, + True, + False, + ] + ) + z = zarr.open(base_path) + return LazyMaskedArray(z["values"], z["mask"], "nullable-integer") + + +@pytest.fixture() +def nullable_integer_lazy_arr_no_mask(tmp_path_factory): + base_path = tmp_path_factory.getbasetemp() + z = zarr.open_group(base_path, mode="w") + z["values"] = np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2]) + z = zarr.open(base_path) + return LazyMaskedArray(z["values"], None, "nullable-integer") + + +def test_access_count_obs_var(tmp_path, mtx_format): + base_pth = Path(tmp_path) + orig_pth = base_pth / "orig.zarr" + M = 1000000 # forces zarr to chunk `obs` columns multiple ways - that way 1 access to `int64` below is actually only one access + N = 5 + obs_names = pd.Index(f"cell{i}" for i in range(M)) + var_names = pd.Index(f"gene{i}" for i in range(N)) + obs = gen_typed_df(M, obs_names) + var = gen_typed_df(N, var_names) + orig = AnnData( + obs=obs, + var=var, + X=mtx_format(np.random.binomial(100, 0.005, (M, N)).astype(np.float32)), + ) + orig.write_zarr(orig_pth) + store = AccessTrackingStore(orig_pth) + store.set_key_trackers(["obs/int64", "var/int64", "obs/cat/codes", "X"]) + remote = read_backed(store) + # a series of methods that should __not__ read in any data + remote.X # the initial (non-subset) access to `X` should not read in data + remote.shape + remote.var + remote.obs + remote.obs["int64"] + remote.var["int64"] + # only the `cat` should be read in + subset = remote[ + (remote.obs["cat"] == "a").data, : + ] # `.data` for xarray, but should we handle internally? + subset.obs["int64"] + sub_subset = subset[0:10, :] + sub_subset.obs["int64"] + assert store.get_access_count("X") == 0, store.get_subkeys_accessed("X") + assert store.get_access_count("obs/int64") == 0, store.get_subkeys_accessed( + "obs/int64" + ) + assert store.get_access_count("var/int64") == 0, store.get_subkeys_accessed( + "var/int64" + ) + # all codes read in for subset + assert store.get_access_count("obs/cat/codes") == 4, store.get_subkeys_accessed( + "obs/cat/codes" + ) + remote[0:10, :].obs["int64"][0:10].compute() + assert store.get_access_count("obs/int64") == 1, store.get_subkeys_accessed( + "obs/int64" + ) + # one for 0, .zmetadata handles .zarray + assert store.get_access_count("var/int64") == 0, store.get_subkeys_accessed( + "var/int64" + ) # never accessed + + +def test_to_memory(tmp_path, mtx_format, dskfmt): + adata = gen_adata((1000, 1000), mtx_format) + base_pth = Path(tmp_path) + orig_pth = base_pth / f"orig.{dskfmt}" + write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth) + write(adata) + remote = read_backed(orig_pth) + remote_to_memory = backed_to_memory(remote) + assert_equal(remote_to_memory, adata) + + +def test_to_memory_exclude(tmp_path, mtx_format, dskfmt): + adata = gen_adata((1000, 1000), mtx_format) + base_pth = Path(tmp_path) + orig_pth = base_pth / f"orig.{dskfmt}" + write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth) + write(adata) + remote = read_backed(orig_pth) + remote_to_memory = backed_to_memory( + remote, exclude=["obs/nullable-bool", "obsm/sparse"] + ) + assert "nullable-bool" not in remote_to_memory.obs + assert "sparse" not in remote_to_memory.obsm + + +def test_view_to_memory(tmp_path, mtx_format, dskfmt): + adata = gen_adata((1000, 1000), mtx_format) + base_pth = Path(tmp_path) + 
    orig_pth = base_pth / f"orig.{dskfmt}"
+    write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth)
+    write(adata)
+    remote = read_backed(orig_pth)
+    subset_obs = adata.obs["obs_cat"] == "a"
+    assert_equal(adata[subset_obs, :], backed_to_memory(remote[subset_obs, :]))
+
+    subset_var = adata.var["var_cat"] == "a"
+    assert_equal(adata[:, subset_var], backed_to_memory(remote[:, subset_var]))
+
+
+def test_view_of_view_to_memory(tmp_path, mtx_format, dskfmt):
+    adata = gen_adata((1000, 1000), mtx_format)
+    base_pth = Path(tmp_path)
+    orig_pth = base_pth / f"orig.{dskfmt}"
+    write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth)
+    write(adata)
+    remote = read_backed(orig_pth)
+    subset_obs = (adata.obs["obs_cat"] == "a") | (adata.obs["obs_cat"] == "b")
+    subsetted_adata = adata[subset_obs, :]
+    subset_subset_obs = subsetted_adata.obs["obs_cat"] == "b"
+    subsetted_subsetted_adata = subsetted_adata[subset_subset_obs, :]
+    assert_equal(
+        subsetted_subsetted_adata,
+        backed_to_memory(remote[subset_obs, :][subset_subset_obs, :]),
+    )
+
+    subset_var = (adata.var["var_cat"] == "a") | (adata.var["var_cat"] == "b")
+    subsetted_adata = adata[:, subset_var]
+    subset_subset_var = subsetted_adata.var["var_cat"] == "b"
+    subsetted_subsetted_adata = subsetted_adata[:, subset_subset_var]
+    assert_equal(
+        subsetted_subsetted_adata,
+        backed_to_memory(remote[:, subset_var][:, subset_subset_var]),
+    )
+
+
+def test_lazy_categorical_array_properties(categorical_lazy_arr):
+    assert len(categorical_lazy_arr[0:3]) == 3
+    assert type(categorical_lazy_arr[0:3]) == pd.Categorical
+    assert len(categorical_lazy_arr[...]) == len(categorical_lazy_arr)
+    assert type(categorical_lazy_arr[...]) == pd.Categorical
+
+
+def test_lazy_categorical_array_equality(categorical_lazy_arr):
+    assert (categorical_lazy_arr[0] == "foo").all()
+    assert (categorical_lazy_arr[3:5] == "bar").all()
+    assert (categorical_lazy_arr == "foo").any()
+
+
+def test_lazy_categorical_array_subset_subset(categorical_lazy_arr):
+    subset_subset = categorical_lazy_arr[0:10][5:10]
+    assert len(subset_subset) == 5
+    assert type(subset_subset) == pd.Categorical
+    assert (
+        subset_subset[...]
+        == pd.Categorical.from_codes(
+            codes=[2, 2, 1, 2, 0],
+            categories=["foo", "bar", "jazz"],
+            ordered=False,
+        ).remove_unused_categories()
+    ).all()
+
+
+def test_nullable_boolean_array_properties(nullable_boolean_lazy_arr):
+    assert len(nullable_boolean_lazy_arr[0:3]) == 3
+    assert type(nullable_boolean_lazy_arr[0:3]) == pd.arrays.BooleanArray
+    assert len(nullable_boolean_lazy_arr[...]) == len(nullable_boolean_lazy_arr)
+    assert type(nullable_boolean_lazy_arr[...]) == pd.arrays.BooleanArray
+
+
+def test_nullable_boolean_array_equality(nullable_boolean_lazy_arr):
+    assert (nullable_boolean_lazy_arr[0] == pd.NA).all()
+    assert (nullable_boolean_lazy_arr[3:5] == pd.NA).all()
+    assert (nullable_boolean_lazy_arr[5:7] == np.array([True, False])).all()
+
+
+def test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr):
+    subset_subset = nullable_boolean_lazy_arr[0:10][5:10]
+    assert len(subset_subset) == 5
+    assert type(subset_subset) == pd.arrays.BooleanArray
+    assert (
+        subset_subset[...]
+        == pd.arrays.BooleanArray(
+            values=np.array([True, False, False, True, True]),
+            mask=np.array([False, False, True, False, True]),
+        )
+    ).all()
+
+
+def test_nullable_boolean_array_no_mask_equality(nullable_boolean_lazy_arr_no_mask):
+    assert nullable_boolean_lazy_arr_no_mask[0] == True  # noqa
+    assert (nullable_boolean_lazy_arr_no_mask[3:5] == False).all()  # noqa
+    assert (nullable_boolean_lazy_arr_no_mask[5:7] == np.array([True, False])).all()
+
+
+def test_nullable_boolean_array_no_mask_subset_subset(
+    nullable_boolean_lazy_arr_no_mask,
+):
+    subset_subset = nullable_boolean_lazy_arr_no_mask[0:10][5:10]
+    assert len(subset_subset) == 5
+    assert type(subset_subset) == pd.arrays.BooleanArray
+    assert (
+        subset_subset[...]
+        == pd.array(
+            np.array([True, False, False, True, True]),
+        )
+    ).all()
+
+
+def test_nullable_integer_array_properties(nullable_integer_lazy_arr):
+    assert len(nullable_integer_lazy_arr[0:3]) == 3
+    assert type(nullable_integer_lazy_arr[0:3]) == pd.arrays.IntegerArray
+    assert len(nullable_integer_lazy_arr[...]) == len(nullable_integer_lazy_arr)
+    assert type(nullable_integer_lazy_arr[...]) == pd.arrays.IntegerArray
+
+
+def test_nullable_integer_array_equality(nullable_integer_lazy_arr):
+    assert (nullable_integer_lazy_arr[0] == pd.NA).all()
+    assert (nullable_integer_lazy_arr[3:5] == pd.NA).all()
+    assert (nullable_integer_lazy_arr[5:7] == np.array([2, 2])).all()
+
+
+def test_nullable_integer_array_subset_subset(nullable_integer_lazy_arr):
+    subset_subset = nullable_integer_lazy_arr[0:10][5:10]
+    assert len(subset_subset) == 5
+    assert type(subset_subset) == pd.arrays.IntegerArray
+    assert (
+        subset_subset[...]
+        == pd.arrays.IntegerArray(
+            values=np.array([2, 2, 1, 2, 0]),
+            mask=np.array([False, False, True, False, True]),
+        )
+    ).all()
+
+
+def test_nullable_integer_array_no_mask_equality(nullable_integer_lazy_arr_no_mask):
+    assert (nullable_integer_lazy_arr_no_mask[0] == pd.NA).all()
+    assert (nullable_integer_lazy_arr_no_mask[3:5] == 1).all()
+    assert (nullable_integer_lazy_arr_no_mask[5:7] == np.array([2, 2])).all()
+
+
+def test_nullable_integer_array_no_mask_subset_subset(
+    nullable_integer_lazy_arr_no_mask,
+):
+    subset_subset = nullable_integer_lazy_arr_no_mask[0:10][5:10]
+    assert len(subset_subset) == 5
+    assert type(subset_subset) == pd.arrays.IntegerArray
+    assert (
+        subset_subset[...]
+ == pd.array( + np.array([2, 2, 1, 2, 0]), + ) + ).all() From a702225b8759e74924244f2162a864874ea44769 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 22 Nov 2023 12:02:31 +0100 Subject: [PATCH 007/348] (fix): add `uns` and remove `consolidated` so tests pass --- anndata/_io/zarr.py | 1 - anndata/experimental/backed/_io.py | 16 ++++++++++++++-- anndata/tests/test_read_backed_experimental.py | 6 +++++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/anndata/_io/zarr.py b/anndata/_io/zarr.py index 0c4dd673b..00f9766f0 100644 --- a/anndata/_io/zarr.py +++ b/anndata/_io/zarr.py @@ -46,7 +46,6 @@ def callback(func, s, k, elem, dataset_kwargs, iospec): func(s, k, elem, dataset_kwargs=dataset_kwargs) write_dispatched(f, "/", adata, callback=callback, dataset_kwargs=ds_kwargs) - zarr.consolidate_metadata(f.store) def read_zarr(store: str | Path | MutableMapping | zarr.Group) -> AnnData: diff --git a/anndata/experimental/backed/_io.py b/anndata/experimental/backed/_io.py index 5446d98d1..8c2a178b8 100644 --- a/anndata/experimental/backed/_io.py +++ b/anndata/experimental/backed/_io.py @@ -78,6 +78,7 @@ def backed_dict_to_memory(d, prefix): varp = backed_dict_to_memory(convert_to_dict(adata.varp), "varp") obsp = backed_dict_to_memory(convert_to_dict(adata.obsp), "obsp") layers = backed_dict_to_memory(dict(adata.layers), "layers") + uns = backed_dict_to_memory(convert_to_dict(adata.uns), "uns") X = None if "X" not in exclude: if isinstance(adata.X, BaseCompressedSparseDataset): @@ -95,7 +96,7 @@ def backed_dict_to_memory(d, prefix): obsp=obsp, varp=varp, layers=layers, - uns=adata.uns, + uns=uns, ) @@ -130,7 +131,18 @@ def read_backed( def callback(func, elem_name: str, elem, iospec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): - cols = ["obs", "var", "obsm", "varm", "obsp", "varp", "layers", "X", "raw"] + cols = [ + "obs", + "var", + "obsm", + "varm", + "obsp", + "varp", + "layers", + "X", + "raw", + "uns", + ] iter_object = ( elem.items() if has_keys else [(k, elem[k]) for k in cols if k in elem] ) diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 46b60f6e2..abcaa6014 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -22,6 +22,8 @@ gen_typed_df, ) +EXEMPT_STANDARD_ZARR_KEYS = {".zarray", ".zgroup", ".zattrs"} + class AccessTrackingStore(DirectoryStore): def __init__(self, *args, **kwargs): @@ -31,7 +33,9 @@ def __init__(self, *args, **kwargs): def __getitem__(self, key): for tracked in self._access_count: - if tracked in key: + if tracked in key and not any( + zarr_key in key for zarr_key in EXEMPT_STANDARD_ZARR_KEYS + ): # import traceback # traceback.print_stack() self._access_count[tracked] += 1 From bb0c63d1d11ac73fd9411f63b49ee19241cf3de6 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 23 Nov 2023 14:52:01 +0100 Subject: [PATCH 008/348] (refactor): single dispatch avoids reference to experimental --- anndata/_core/anndata.py | 88 +++++++++++--------------- anndata/experimental/backed/_xarray.py | 40 ++++++++++-- 2 files changed, 72 insertions(+), 56 deletions(-) diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 9df84636d..aa6378e24 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -199,6 +199,32 @@ def _gen_dataframe_1d( raise ValueError(f"Cannot convert {type(anno)} to {attr} DataFrame") +@singledispatch +def _remove_unused_categories( + df_full: pd.DataFrame, df_sub: 
pd.DataFrame, uns: dict[str, Any] +): + for k in df_full: + if not isinstance(df_full[k].dtype, pd.CategoricalDtype): + continue + all_categories = df_full[k].cat.categories + with pd.option_context("mode.chained_assignment", None): + df_sub[k] = df_sub[k].cat.remove_unused_categories() + # also correct the colors... + color_key = f"{k}_colors" + if color_key not in uns: + continue + color_vec = uns[color_key] + if np.array(color_vec).ndim == 0: + # Make 0D arrays into 1D ones + uns[color_key] = np.array(color_vec)[(None,)] + elif len(color_vec) != len(all_categories): + # Reset colors + del uns[color_key] + else: + idx = np.where(np.in1d(all_categories, df_sub[k].cat.categories))[0] + uns[color_key] = np.array(color_vec)[(idx,)] + + class AnnData(metaclass=utils.DeprecationMixinMeta): """\ An annotated data matrix. @@ -353,20 +379,6 @@ def __init__( oidx: Index1D = None, vidx: Index1D = None, ): - if "Dataset2D" in str(type(obs)): - from ..experimental.backed._xarray import Dataset2D - - @_gen_dataframe.register(Dataset2D) - def _gen_dataframe_xr( - anno: Dataset2D, - index_names: Iterable[str], - *, - source: Literal["X", "shape"], - attr: Literal["obs", "var"], - length: int | None = None, - ): - return anno - if asview: if not isinstance(X, AnnData): raise ValueError("`X` has to be an AnnData object.") @@ -426,8 +438,8 @@ def _init_as_view(self, adata_ref: AnnData, oidx: Index, vidx: Index): self._varp = adata_ref.varp._view(self, vidx) # fix categories uns = copy(adata_ref._uns) - # self._remove_unused_categories(adata_ref.obs, obs_sub, uns) # not going to work with xarray - # self._remove_unused_categories(adata_ref.var, var_sub, uns) + self._remove_unused_categories(adata_ref.obs, obs_sub, uns) + self._remove_unused_categories(adata_ref.var, var_sub, uns) # set attributes self._obs = as_view(obs_sub, view_args=(self, "obs")) self._var = as_view(var_sub, view_args=(self, "var")) @@ -584,7 +596,7 @@ def _init_as_actual( _move_adj_mtx({"uns": self._uns, "obsp": self._obsp}) self._check_dimensions() - # self._check_uniqueness() + self._check_uniqueness() if self.filename: assert not isinstance( @@ -939,13 +951,7 @@ def obs(self): @property def obs_names(self) -> pd.Index: """Names of observations (alias for `.obs.index`).""" - if hasattr(self.obs, "index"): - return self.obs.index - return pd.Index( - self.obs["obs_names"].data.compute() - if isinstance(self.obs["obs_names"].data, DaskArray) - else self.obs["obs_names"].data - ) + return self.obs.index @obs_names.setter def obs_names(self, names: Sequence[str]): @@ -968,13 +974,7 @@ def var(self): @property def var_names(self) -> pd.Index: """Names of variables (alias for `.var.index`).""" - if hasattr(self.var, "index"): - return self.var.index - return pd.Index( - self.var["var_names"].data.compute() - if isinstance(self.var["var_names"].data, DaskArray) - else self.var["var_names"].data - ) + return self.var.index @var_names.setter def var_names(self, names: Sequence[str]): @@ -1203,28 +1203,12 @@ def __getitem__(self, index: Index) -> AnnData: return AnnData(self, oidx=oidx, vidx=vidx, asview=True) def _remove_unused_categories( - self, df_full: pd.DataFrame, df_sub: pd.DataFrame, uns: dict[str, Any] + self, + df_full: pd.DataFrame, + df_sub: pd.DataFrame, + uns: dict[str, Any], # types are wrong now... 
): - for k in df_full: - if not isinstance(df_full[k].dtype, pd.CategoricalDtype): - continue - all_categories = df_full[k].cat.categories - with pd.option_context("mode.chained_assignment", None): - df_sub[k] = df_sub[k].cat.remove_unused_categories() - # also correct the colors... - color_key = f"{k}_colors" - if color_key not in uns: - continue - color_vec = uns[color_key] - if np.array(color_vec).ndim == 0: - # Make 0D arrays into 1D ones - uns[color_key] = np.array(color_vec)[(None,)] - elif len(color_vec) != len(all_categories): - # Reset colors - del uns[color_key] - else: - idx = np.where(np.in1d(all_categories, df_sub[k].cat.categories))[0] - uns[color_key] = np.array(color_vec)[(idx,)] + _remove_unused_categories(df_full, df_sub, uns) def rename_categories(self, key: str, categories: Sequence[Any]): """\ diff --git a/anndata/experimental/backed/_xarray.py b/anndata/experimental/backed/_xarray.py index 84bdc0223..7ca971e79 100644 --- a/anndata/experimental/backed/_xarray.py +++ b/anndata/experimental/backed/_xarray.py @@ -1,9 +1,17 @@ from __future__ import annotations +from typing import TYPE_CHECKING, Any, Literal + +if TYPE_CHECKING: + from collections.abc import Iterable + + +import pandas as pd import xarray as xr -from anndata._core.index import Index, _subset -from anndata._core.views import as_view +from ..._core.anndata import _gen_dataframe, _remove_unused_categories +from ..._core.index import Index, _subset +from ..._core.views import as_view def get_index_dim(ds): @@ -14,6 +22,11 @@ def get_index_dim(ds): class Dataset2D(xr.Dataset): + @property + def index(self) -> pd.Index: + coord = list(self.coords.keys())[0] + return pd.Index(self.coords[coord].data) + @property def shape( self, @@ -27,8 +40,8 @@ def __init__(self, ds): self._ds = ds def __getitem__(self, idx): - coords = list(self._ds.coords.keys())[0] - return self._ds.isel(**{coords: idx}) + coord = list(self._ds.coords.keys())[0] + return self._ds.isel(**{coord: idx}) return IlocGetter(self) @@ -46,3 +59,22 @@ def _(a: xr.DataArray, subset_idx: Index): @as_view.register(Dataset2D) def _(a: Dataset2D, view_args): return a + + +@_gen_dataframe.register(Dataset2D) +def _gen_dataframe_xr( + anno: Dataset2D, + index_names: Iterable[str], + *, + source: Literal["X", "shape"], + attr: Literal["obs", "var"], + length: int | None = None, +): + return anno + + +@_remove_unused_categories.register(Dataset2D) +def _remove_unused_categories_xr( + df_full: Dataset2D, df_sub: Dataset2D, uns: dict[str, Any] +): + pass # for now? From 86cc5539504d4747f70fb573c704d6a1cdea1e9c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 23 Nov 2023 15:48:42 +0100 Subject: [PATCH 009/348] (fix): category removal compat --- anndata/experimental/backed/_io.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/anndata/experimental/backed/_io.py b/anndata/experimental/backed/_io.py index 8c2a178b8..cc4702387 100644 --- a/anndata/experimental/backed/_io.py +++ b/anndata/experimental/backed/_io.py @@ -179,7 +179,9 @@ def callback(func, elem_name: str, elem, iospec): d_with_xr[k] = v return Dataset2D(d_with_xr) elif iospec.encoding_type == "categorical": - drop_unused_cats = False # always don't because the `AnnData` object cannot drop them for us, so getting tests to pass means we need to leave this. 
+ drop_unused_cats = not ( + elem_name.startswith("/obsm") or elem_name.startswith("/varm") + ) return LazyCategoricalArray( elem["codes"], elem["categories"], elem.attrs, drop_unused_cats ) From bfd827d79467fe11671cf713f5418b3740671c69 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 23 Nov 2023 16:06:33 +0100 Subject: [PATCH 010/348] (feat): remove exclusion and use `to_memory` single_dispatch --- anndata/experimental/__init__.py | 2 - anndata/experimental/backed/__init__.py | 4 +- anndata/experimental/backed/_io.py | 81 +------------------ anndata/experimental/backed/_xarray.py | 21 +++++ .../tests/test_read_backed_experimental.py | 26 ++---- 5 files changed, 30 insertions(+), 104 deletions(-) diff --git a/anndata/experimental/__init__.py b/anndata/experimental/__init__.py index d3355744b..f189196da 100644 --- a/anndata/experimental/__init__.py +++ b/anndata/experimental/__init__.py @@ -5,7 +5,6 @@ from ._dispatch_io import read_dispatched, write_dispatched from .backed import read_backed -from .backed import to_memory as backed_to_memory from .merge import concat_on_disk from .multi_files import AnnCollection from .pytorch import AnnLoader @@ -23,5 +22,4 @@ "CSRDataset", "CSCDataset", "read_backed", - "backed_to_memory", ] diff --git a/anndata/experimental/backed/__init__.py b/anndata/experimental/backed/__init__.py index 2e9b6519d..4239f2293 100644 --- a/anndata/experimental/backed/__init__.py +++ b/anndata/experimental/backed/__init__.py @@ -1,5 +1,5 @@ from __future__ import annotations -from ._io import read_backed, to_memory +from ._io import read_backed -__all__ = ["read_backed", "to_memory"] +__all__ = ["read_backed"] diff --git a/anndata/experimental/backed/_io.py b/anndata/experimental/backed/_io.py index cc4702387..96ff09efb 100644 --- a/anndata/experimental/backed/_io.py +++ b/anndata/experimental/backed/_io.py @@ -5,8 +5,6 @@ TYPE_CHECKING, ) -import pandas as pd - if TYPE_CHECKING: from collections.abc import MutableMapping @@ -16,90 +14,13 @@ import zarr from ..._core.anndata import AnnData -from ..._core.sparse_dataset import BaseCompressedSparseDataset, sparse_dataset +from ..._core.sparse_dataset import sparse_dataset from ...compat import DaskArray -from ...utils import convert_to_dict from .. 
import read_dispatched from ._lazy_arrays import LazyCategoricalArray, LazyMaskedArray from ._xarray import Dataset2D -def to_memory(adata, exclude=[]): - # nullable and categoricals need special handling because xarray will convert them to numpy arrays first with dtype object - def get_nullable_and_categorical_cols(ds): - cols = [] - for c in ds: - dtype = ds[c].dtype - if ( - isinstance(dtype, pd.CategoricalDtype) - or dtype == pd.arrays.BooleanArray - or dtype == pd.arrays.IntegerArray - ): - cols += [c] - return cols - - def to_df(ds, exclude_vars=[]): - nullable_and_categorical_df_cols = get_nullable_and_categorical_cols(ds) - drop_vars = [ - k for k in set(exclude_vars + nullable_and_categorical_df_cols) if k in ds - ] - df = ds.drop_vars(drop_vars).to_dataframe() - for c in nullable_and_categorical_df_cols: - if c not in exclude_vars: - df[c] = ds[c].data[()] - df.index.name = None # matches old AnnData object - if len(exclude_vars) == 0: - df = df[list(ds.keys())] - return df - - # handling for AxisArrays - def backed_dict_to_memory(d, prefix): - res = {} - for k, v in d.items(): - full_key = prefix + "/" + k - if any([full_key == exclude_key for exclude_key in exclude]): - continue - if isinstance(v, DaskArray): - res[k] = v.compute() - elif isinstance(v, BaseCompressedSparseDataset): - res[k] = v.to_memory() - elif isinstance(v, Dataset2D): - res[k] = to_df(v) - else: - res[k] = v - return res - - exclude_obs = [key.replace("obs/", "") for key in exclude if key.startswith("obs/")] - obs = to_df(adata.obs, exclude_obs) - exclude_var = [key.replace("var/", "") for key in exclude if key.startswith("var/")] - var = to_df(adata.var, exclude_var) - obsm = backed_dict_to_memory(convert_to_dict(adata.obsm), "obsm") - varm = backed_dict_to_memory(convert_to_dict(adata.varm), "varm") - varp = backed_dict_to_memory(convert_to_dict(adata.varp), "varp") - obsp = backed_dict_to_memory(convert_to_dict(adata.obsp), "obsp") - layers = backed_dict_to_memory(dict(adata.layers), "layers") - uns = backed_dict_to_memory(convert_to_dict(adata.uns), "uns") - X = None - if "X" not in exclude: - if isinstance(adata.X, BaseCompressedSparseDataset): - X = adata.X.to_memory() - elif isinstance(adata.X, DaskArray): - X = adata.X.compute() - else: - X = adata.X - return AnnData( - X=X, - obs=obs, - var=var, - obsm=obsm, - varm=varm, - obsp=obsp, - varp=varp, - layers=layers, - uns=uns, - ) - - def read_backed( store: str | Path | MutableMapping | zarr.Group | h5py.Dataset, ) -> AnnData: diff --git a/anndata/experimental/backed/_xarray.py b/anndata/experimental/backed/_xarray.py index 7ca971e79..3152ddff9 100644 --- a/anndata/experimental/backed/_xarray.py +++ b/anndata/experimental/backed/_xarray.py @@ -10,6 +10,7 @@ import xarray as xr from ..._core.anndata import _gen_dataframe, _remove_unused_categories +from ..._core.file_backing import to_memory from ..._core.index import Index, _subset from ..._core.views import as_view @@ -78,3 +79,23 @@ def _remove_unused_categories_xr( df_full: Dataset2D, df_sub: Dataset2D, uns: dict[str, Any] ): pass # for now? 
+ + +@to_memory.register(Dataset2D) +def to_memory(ds: Dataset2D, copy=False): + # nullable and categoricals need special handling because xarray will convert them to numpy arrays first with dtype object + def get_nullable_and_categorical_cols(ds): + for c in ds: + dtype = ds[c].dtype + if ( + isinstance(dtype, pd.CategoricalDtype) + or dtype == pd.arrays.BooleanArray + or dtype == pd.arrays.IntegerArray + ): + yield c + + df = ds.to_dataframe()[list(ds.keys())] + for c in get_nullable_and_categorical_cols(ds): + df[c] = ds[c].data[()] + df.index.name = None # matches old AnnData object + return df diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index abcaa6014..4414c6d83 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -10,7 +10,7 @@ from zarr import DirectoryStore from anndata._core.anndata import AnnData -from anndata.experimental import backed_to_memory, read_backed +from anndata.experimental import read_backed from anndata.experimental.backed._lazy_arrays import ( LazyCategoricalArray, LazyMaskedArray, @@ -256,24 +256,10 @@ def test_to_memory(tmp_path, mtx_format, dskfmt): write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth) write(adata) remote = read_backed(orig_pth) - remote_to_memory = backed_to_memory(remote) + remote_to_memory = remote.to_memory() assert_equal(remote_to_memory, adata) -def test_to_memory_exclude(tmp_path, mtx_format, dskfmt): - adata = gen_adata((1000, 1000), mtx_format) - base_pth = Path(tmp_path) - orig_pth = base_pth / f"orig.{dskfmt}" - write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth) - write(adata) - remote = read_backed(orig_pth) - remote_to_memory = backed_to_memory( - remote, exclude=["obs/nullable-bool", "obsm/sparse"] - ) - assert "nullable-bool" not in remote_to_memory.obs - assert "sparse" not in remote_to_memory.obsm - - def test_view_to_memory(tmp_path, mtx_format, dskfmt): adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) @@ -282,10 +268,10 @@ def test_view_to_memory(tmp_path, mtx_format, dskfmt): write(adata) remote = read_backed(orig_pth) subset_obs = adata.obs["obs_cat"] == "a" - assert_equal(adata[subset_obs, :], backed_to_memory(remote[subset_obs, :])) + assert_equal(adata[subset_obs, :], remote[subset_obs, :].to_memory()) subset_var = adata.var["var_cat"] == "a" - assert_equal(adata[:, subset_var], backed_to_memory(remote[:, subset_var])) + assert_equal(adata[:, subset_var], remote[:, subset_var].to_memory()) def test_view_of_view_to_memory(tmp_path, mtx_format, dskfmt): @@ -301,7 +287,7 @@ def test_view_of_view_to_memory(tmp_path, mtx_format, dskfmt): subsetted_subsetted_adata = subsetted_adata[subset_subset_obs, :] assert_equal( subsetted_subsetted_adata, - backed_to_memory(remote[subset_obs, :][subset_subset_obs, :]), + remote[subset_obs, :][subset_subset_obs, :].to_memory(), ) subset_var = (adata.var["var_cat"] == "a") | (adata.var["var_cat"] == "b") @@ -310,7 +296,7 @@ def test_view_of_view_to_memory(tmp_path, mtx_format, dskfmt): subsetted_subsetted_adata = subsetted_adata[:, subset_subset_var] assert_equal( subsetted_subsetted_adata, - backed_to_memory(remote[:, subset_var][:, subset_subset_var]), + remote[:, subset_var][:, subset_subset_var].to_memory(), ) From 0904e19fe7d7219488dc80679f3b933307239756 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 23 Nov 2023 16:16:24 +0100 Subject: [PATCH 011/348] (chore): fix comment --- anndata/experimental/backed/_xarray.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/anndata/experimental/backed/_xarray.py b/anndata/experimental/backed/_xarray.py index 3152ddff9..5927d3849 100644 --- a/anndata/experimental/backed/_xarray.py +++ b/anndata/experimental/backed/_xarray.py @@ -78,7 +78,7 @@ def _gen_dataframe_xr( def _remove_unused_categories_xr( df_full: Dataset2D, df_sub: Dataset2D, uns: dict[str, Any] ): - pass # for now? + pass # this is handled automatically by the categorical arrays themselves i.e., they dedup upon access. @to_memory.register(Dataset2D) From d16cfbfeafe0c7073a16ab0ca2c8b9ba8e69dd3e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 23 Nov 2023 16:23:33 +0100 Subject: [PATCH 012/348] (chore): update docstring --- anndata/experimental/backed/_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anndata/experimental/backed/_io.py b/anndata/experimental/backed/_io.py index 96ff09efb..ec80e544a 100644 --- a/anndata/experimental/backed/_io.py +++ b/anndata/experimental/backed/_io.py @@ -24,7 +24,7 @@ def read_backed( store: str | Path | MutableMapping | zarr.Group | h5py.Dataset, ) -> AnnData: - """Lazily read in on-disk/in-cloud AnnData stores. A new, but familiar, AnnData object will be returned. + """Lazily read in on-disk/in-cloud AnnData stores, including `obs` and `var`. No array data should need to be read into memory, with exception of non-obs/var dataframes and Awkward Arrays. Args: From 47cd2d2726c6a5b3c3a56942df26b872adbdbab2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 28 Nov 2023 09:35:57 +0100 Subject: [PATCH 013/348] (fix): add `chunk_store` arg --- anndata/_core/sparse_dataset.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index e0c70faad..bc82fb2d8 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -62,8 +62,10 @@ def copy(self) -> ss.spmatrix: return sparse_dataset( zarr.open( - store=self.data.store, path=Path(self.data.path).parent, mode="r" - ) + store=self.data.store, + mode="r", + chunk_store=self.data.chunk_store, # chunk_store is needed, not clear why + )[Path(self.data.path).parent] ).to_memory() else: return super().copy() From 3c60fe1bedc641854713da81d751dd3f09de7e19 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 7 Dec 2023 17:15:36 +0100 Subject: [PATCH 014/348] Add xarray to test dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index fc88e76a6..b2db0bae8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,6 +94,7 @@ test = [ "awkward>=2.3", "pyarrow", "pytest_memray", + "xarray", ] gpu = ["cupy"] From bb99c6b62c6811a8a8ade948c78e4f26edff8164 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 31 Jan 2024 10:08:30 +0000 Subject: [PATCH 015/348] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- anndata/_core/sparse_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index eeee769aa..b05551ef4 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -17,7 +17,6 @@ from abc import ABC from functools import cached_property from itertools import accumulate, chain - from math import floor from pathlib import Path from typing import TYPE_CHECKING, Literal, NamedTuple From e773473c5cba136281140dc031c4c8a65068e6b5 
Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 13 Feb 2024 13:29:28 +0100 Subject: [PATCH 016/348] (wip): try new backend arrays --- anndata/experimental/backed/_io.py | 24 ++- anndata/experimental/backed/_lazy_arrays.py | 192 +++++------------- anndata/experimental/backed/_xarray.py | 15 +- .../tests/test_read_backed_experimental.py | 19 +- pyproject.toml | 4 +- 5 files changed, 87 insertions(+), 167 deletions(-) diff --git a/anndata/experimental/backed/_io.py b/anndata/experimental/backed/_io.py index ec80e544a..9eec44bfe 100644 --- a/anndata/experimental/backed/_io.py +++ b/anndata/experimental/backed/_io.py @@ -17,7 +17,7 @@ from ..._core.sparse_dataset import sparse_dataset from ...compat import DaskArray from .. import read_dispatched -from ._lazy_arrays import LazyCategoricalArray, LazyMaskedArray +from ._lazy_arrays import CategoricalArray, MaskedArray from ._xarray import Dataset2D @@ -84,10 +84,13 @@ def callback(func, elem_name: str, elem, iospec): v, coords=[d[elem.attrs["_index"]]], dims=[index_label], name=k ) elif ( - type(v) == LazyCategoricalArray or type(v) == LazyMaskedArray + type(v) == CategoricalArray or type(v) == MaskedArray ) and k != elem.attrs["_index"]: + variable = xr.Variable( + data=xr.core.indexing.LazilyIndexedArray(v), dims=[index_label] + ) d_with_xr[k] = xr.DataArray( - xr.core.indexing.LazilyIndexedArray(v), + variable, coords=[d[elem.attrs["_index"]]], dims=[index_label], name=k, @@ -103,14 +106,17 @@ def callback(func, elem_name: str, elem, iospec): drop_unused_cats = not ( elem_name.startswith("/obsm") or elem_name.startswith("/varm") ) - return LazyCategoricalArray( - elem["codes"], elem["categories"], elem.attrs, drop_unused_cats + return CategoricalArray( + codes=elem["codes"], + categories=elem["categories"], + ordered=elem.attrs["ordered"], + drop_unused_cats=drop_unused_cats, ) elif "nullable" in iospec.encoding_type: - return LazyMaskedArray( - elem["values"], - elem["mask"] if "mask" in elem else None, - iospec.encoding_type, + return MaskedArray( + values=elem["values"], + mask=elem["mask"] if "mask" in elem else None, + dtype_str=iospec.encoding_type, ) elif iospec.encoding_type in {"array", "string-array"}: if is_h5: diff --git a/anndata/experimental/backed/_lazy_arrays.py b/anndata/experimental/backed/_lazy_arrays.py index 7ee6adf44..074003fa1 100644 --- a/anndata/experimental/backed/_lazy_arrays.py +++ b/anndata/experimental/backed/_lazy_arrays.py @@ -1,60 +1,39 @@ from __future__ import annotations -import numpy as np +from typing import TYPE_CHECKING + import pandas as pd import xarray as xr -from xarray.core.indexing import ( - BasicIndexer, - ExplicitlyIndexedNDArrayMixin, - OuterIndexer, -) from anndata._core.index import Index, _subset from anndata._core.views import as_view -from anndata.compat import ZarrArray - - -class MaskedArrayMixIn(ExplicitlyIndexedNDArrayMixin): - def __eq__(self, __o) -> np.ndarray: - return self[...] == __o - - def __ne__(self, __o) -> np.ndarray: - return ~(self == __o) - - @property - def shape(self) -> tuple[int, ...]: - """Shape of this array - - Returns: - Tuple[int, ...]: A shape that looks like a 1-d shape i.e., (#, ) - """ - return self.values.shape - - -class LazyCategoricalArray(MaskedArrayMixIn): - __slots__ = ( - "values", - "attrs", - "_categories", - "_categories_cache", - "group", - "_drop_unused_cats", - ) - - def __init__(self, codes, categories, attrs, drop_unused_cats, *args, **kwargs): - """Class for lazily reading categorical data from formatted zarr group. 
Used as base for `LazilyIndexedArray`. - - Args: - codes (Union[zarr.Array, h5py.Dataset]): values (integers) of the array, one for each element - categories (Union[zarr.Array, h5py.Dataset]): mappings from values to strings - attrs (Union[zarr.Array, h5py.Dataset]): attrs containing boolean "ordered" - _drop_unused_cats (bool): Whether or not to drop unused categories. - """ - self.values = codes +from anndata.compat import H5Array, ZarrArray + +if TYPE_CHECKING: + import numpy as np + + +class CategoricalArray( + xr.backends.zarr.ZarrArrayWrapper +): # Zarr works for hdf5, xarray only supports integration hdf5 in the netcdf context + def __init__( + self, + codes: ZarrArray | H5Array, + categories: ZarrArray | H5Array, + ordered: bool, + drop_unused_cats: bool, + *args, + **kwargs, + ): self._categories = categories + self._ordered = ordered + self._drop_unused_cats = drop_unused_cats self._categories_cache = None - self.attrs = dict(attrs) - self._drop_unused_cats = drop_unused_cats # obsm/varm do not drop, but obs and var do. TODO: Should fix in normal AnnData? + self._array = codes + self.shape = self._array.shape + self.dtype = pd.CategoricalDtype( + categories=self._categories, ordered=self._ordered + ) @property def categories(self): # __slots__ and cached_property are incompatible @@ -69,101 +48,42 @@ def categories(self): # __slots__ and cached_property are incompatible self._categories_cache = read_dataset(self._categories) return self._categories_cache - @property - def dtype(self) -> pd.CategoricalDtype: - return pd.CategoricalDtype(self.categories, self.ordered) - - @property - def ordered(self): - return bool(self.attrs["ordered"]) - - def __getitem__(self, selection) -> pd.Categorical: - idx = selection - if isinstance(selection, BasicIndexer) or isinstance(selection, OuterIndexer): - idx = selection.tuple[0] # need to better understand this - if isinstance(self.values, ZarrArray): - codes = self.values.oindex[idx] - else: - codes = self.values[idx] - if codes.shape == (): # handle 0d case - codes = np.array([codes]) - res = pd.Categorical.from_codes( - codes=codes, - categories=self.categories, - ordered=self.ordered, + def __getitem__(self, key: xr.core.indexing.ExplicitIndexer) -> np.typing.ArrayLike: + codes = super().__getitem__(key) + categorical_array = pd.Categorical.from_codes( + codes=codes, categories=self.categories, ordered=self._ordered ) if self._drop_unused_cats: - return res.remove_unused_categories() - return res - - def __repr__(self) -> str: - return f"LazyCategoricalArray(codes=..., categories={self.categories}, ordered={self.ordered})" - - def copy(self) -> LazyCategoricalArray: - """Returns a copy of this array which can then be safely edited - - Returns: - LazyCategoricalArray: copied LazyCategoricalArray - """ - arr = LazyCategoricalArray( - self.values, self._categories, self.attrs - ) # self.categories reads in data - return arr - - -class LazyMaskedArray(MaskedArrayMixIn): - __slots__ = ("mask", "values", "_dtype_str") - - def __init__(self, values, mask, dtype_str, *args, **kwargs): - """Class for lazily reading categorical data from formatted zarr group. Used as base for `LazilyIndexedArray`. 
- - Args: - values (Union[zarr.Array, h5py.Dataset]): Integer/Boolean array of values - mask (Union[zarr.Array, h5py.Dataset]): mask indicating which values are non-null - dtype_str (Nullable): one of `nullable-integer` or `nullable-boolean` - """ - self.values = values - self.mask = mask + return categorical_array.remove_unused_categories() + return categorical_array + + +class MaskedArray(xr.backends.zarr.ZarrArrayWrapper): + def __init__( + self, + values: ZarrArray | H5Array, + dtype_str: str, + *args, + mask: ZarrArray | H5Array | None = None, + **kwargs, + ): + self._mask = mask + self._values = values self._dtype_str = dtype_str - - @property - def dtype(self) -> pd.CategoricalDtype: - if self.mask is not None: - if self._dtype_str == "nullable-integer": - return pd.arrays.IntegerArray - elif self._dtype_str == "nullable-boolean": - return pd.arrays.BooleanArray - return pd.array - - def __getitem__(self, selection) -> pd.Categorical: - idx = selection - if isinstance(selection, BasicIndexer) or isinstance(selection, OuterIndexer): - idx = selection.tuple[0] # need to understand this better - if isinstance(idx, int): - idx = slice(idx, idx + 1) - values = np.array(self.values[idx]) - if self.mask is not None: - mask = np.array(self.mask[idx]) + self._array = values + self.shape = self._array.shape + self.dtype = pd.api.types.pandas_dtype(self._array.dtype) + + def __getitem__(self, key): + values = super().__getitem__(key) + if self._mask is not None: + mask = self._mask[key] if self._dtype_str == "nullable-integer": + # numpy does not support nan ints return pd.arrays.IntegerArray(values, mask=mask) elif self._dtype_str == "nullable-boolean": return pd.arrays.BooleanArray(values, mask=mask) - return pd.array(values) - - def __repr__(self) -> str: - if self._dtype_str == "nullable-integer": - return "LazyNullableIntegerArray" - elif self._dtype_str == "nullable-boolean": - return "LazyNullableBooleanArray" - - def copy(self) -> LazyMaskedArray: - """Returns a copy of this array which can then be safely edited - - Returns: - LazyMaskedArray: copied LazyMaskedArray - """ - arr = LazyMaskedArray(self.values, self.mask, self._dtype_str) - return arr + return values @_subset.register(xr.DataArray) diff --git a/anndata/experimental/backed/_xarray.py b/anndata/experimental/backed/_xarray.py index 5927d3849..3513a8035 100644 --- a/anndata/experimental/backed/_xarray.py +++ b/anndata/experimental/backed/_xarray.py @@ -83,19 +83,6 @@ def _remove_unused_categories_xr( @to_memory.register(Dataset2D) def to_memory(ds: Dataset2D, copy=False): - # nullable and categoricals need special handling because xarray will convert them to numpy arrays first with dtype object - def get_nullable_and_categorical_cols(ds): - for c in ds: - dtype = ds[c].dtype - if ( - isinstance(dtype, pd.CategoricalDtype) - or dtype == pd.arrays.BooleanArray - or dtype == pd.arrays.IntegerArray - ): - yield c - - df = ds.to_dataframe()[list(ds.keys())] - for c in get_nullable_and_categorical_cols(ds): - df[c] = ds[c].data[()] + df = ds.to_dataframe() df.index.name = None # matches old AnnData object return df diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 4414c6d83..0ad8db19c 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -12,8 +12,8 @@ from anndata._core.anndata import AnnData from anndata.experimental import read_backed from anndata.experimental.backed._lazy_arrays import ( - 
LazyCategoricalArray, - LazyMaskedArray, + CategoricalArray, + MaskedArray, ) from anndata.tests.helpers import ( as_dense_dask_array, @@ -79,7 +79,12 @@ def categorical_lazy_arr(tmp_path_factory): z["categories"] = np.array(["foo", "bar", "jazz"]) z.attrs["ordered"] = False z = zarr.open(base_path) - return LazyCategoricalArray(z["codes"], z["categories"], z.attrs, True) + return CategoricalArray( + codes=z["codes"], + categories=z["categories"], + ordered=z.attrs["ordered"], + drop_unused_categories=True, + ) @pytest.fixture() @@ -127,7 +132,7 @@ def nullable_boolean_lazy_arr(tmp_path_factory): ] ) z = zarr.open(base_path) - return LazyMaskedArray(z["values"], z["mask"], "nullable-boolean") + return MaskedArray(values=z["values"], mask=z["mask"], dtype_str="nullable-boolean") @pytest.fixture() @@ -155,7 +160,7 @@ def nullable_boolean_lazy_arr_no_mask(tmp_path_factory): ] ) z = zarr.open(base_path) - return LazyMaskedArray(z["values"], None, "nullable-boolean") + return MaskedArray(values=z["values"], mask=None, dtype_str="nullable-boolean") @pytest.fixture() @@ -184,7 +189,7 @@ def nullable_integer_lazy_arr(tmp_path_factory): ] ) z = zarr.open(base_path) - return LazyMaskedArray(z["values"], z["mask"], "nullable-integer") + return MaskedArray(values=z["values"], mask=z["mask"], dtype_str="nullable-integer") @pytest.fixture() @@ -193,7 +198,7 @@ def nullable_integer_lazy_arr_no_mask(tmp_path_factory): z = zarr.open_group(base_path, mode="w") z["values"] = np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2]) z = zarr.open(base_path) - return LazyMaskedArray(z["values"], None, "nullable-integer") + return MaskedArray(values=z["values"], mask=None, dtype_str="nullable-integer") def test_access_count_obs_var(tmp_path, mtx_format): diff --git a/pyproject.toml b/pyproject.toml index f4780ce2a..ad44a17c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,7 +95,7 @@ test = [ "awkward>=2.3", "pyarrow", "pytest_memray", - "xarray", + "xarray@git+https://github.com/ilan-gold/xarray#egg=extension_arrays", ] gpu = ["cupy"] @@ -105,6 +105,8 @@ exclude = ["anndata/tests/test_*.py", "anndata/tests/data"] source = "vcs" [tool.hatch.build.hooks.vcs] version-file = "anndata/_version.py" +[tool.hatch.metadata] +allow-direct-references = true [tool.coverage.run] source = ["anndata"] From 6672c83bbb9076c947b567bb5718f1bd67a49283 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 14 Feb 2024 16:13:43 +0100 Subject: [PATCH 017/348] (hack!): workaround for no init --- anndata/experimental/backed/_lazy_arrays.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/anndata/experimental/backed/_lazy_arrays.py b/anndata/experimental/backed/_lazy_arrays.py index 074003fa1..819a2baff 100644 --- a/anndata/experimental/backed/_lazy_arrays.py +++ b/anndata/experimental/backed/_lazy_arrays.py @@ -54,8 +54,10 @@ def __getitem__(self, key: xr.core.indexing.ExplicitIndexer) -> np.typing.ArrayL codes=codes, categories=self.categories, ordered=self._ordered ) if self._drop_unused_cats: - return categorical_array.remove_unused_categories() - return categorical_array + return xr.core.indexing.ExtensionDuckArray( + categorical_array.remove_unused_categories() + ) + return xr.core.indexing.ExtensionDuckArray(categorical_array) class MaskedArray(xr.backends.zarr.ZarrArrayWrapper): @@ -75,15 +77,22 @@ def __init__( self.dtype = pd.api.types.pandas_dtype(self._array.dtype) def __getitem__(self, key): + # HACK! 
TODO(ilan-gold): open issue about hdf5 compat that doesn't allow initialization!
+        self._array = self._values
         values = super().__getitem__(key)
         if self._mask is not None:
-            mask = self._mask[key]
+            self._array = self._mask
+            mask = super().__getitem__(key)
             if self._dtype_str == "nullable-integer":
                 # numpy does not support nan ints
-                return pd.arrays.IntegerArray(values, mask=mask)
+                return xr.core.indexing.ExtensionDuckArray(
+                    pd.arrays.IntegerArray(values, mask=mask)
+                )
             elif self._dtype_str == "nullable-boolean":
-                return pd.arrays.BooleanArray(values, mask=mask)
-        return values
+                return xr.core.indexing.ExtensionDuckArray(
+                    pd.arrays.BooleanArray(values, mask=mask)
+                )
+        return xr.core.indexing.ExtensionDuckArray(pd.array(values))
 
 
 @_subset.register(xr.DataArray)

From ae17ba25ef37ee637ccff655d981bec8e7fc1718 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 15 Feb 2024 11:57:44 +0100
Subject: [PATCH 018/348] (feat): add more `AccessTrackingStore` functionality

---
 anndata/tests/helpers.py                       | 10 ++++++++++
 anndata/tests/test_read_backed_experimental.py |  4 +++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py
index 4fb33c039..863be3d0d 100644
--- a/anndata/tests/helpers.py
+++ b/anndata/tests/helpers.py
@@ -3,6 +3,7 @@
 import random
 import re
 import warnings
+from collections import defaultdict
 from collections.abc import Collection, Mapping
 from contextlib import contextmanager
 from functools import partial, singledispatch, wraps
@@ -765,19 +766,28 @@ class AccessTrackingStore(zarr.DirectoryStore):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._access_count = {}
+        self._accessed = defaultdict(set)
 
     def __getitem__(self, key):
         for tracked in self._access_count:
             if tracked in key:
                 self._access_count[tracked] += 1
+                self._accessed[tracked].add(key)
         return super().__getitem__(key)
 
     def get_access_count(self, key):
         return self._access_count[key]
 
     def set_key_trackers(self, keys_to_track):
+        if isinstance(keys_to_track, str):
+            keys_to_track = [keys_to_track]
         for k in keys_to_track:
             self._access_count[k] = 0
+
+    def get_subkeys_accessed(self, key):
+        return self._accessed[key]
+
+
 except ImportError:
 
     class AccessTrackingStore:
diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py
index 0ad8db19c..60788ce21 100644
--- a/anndata/tests/test_read_backed_experimental.py
+++ b/anndata/tests/test_read_backed_experimental.py
@@ -226,7 +226,9 @@ def test_access_count_obs_var(tmp_path, mtx_format):
     remote.obs
     remote.obs["int64"]
     remote.var["int64"]
-    # only the `cat` should be read in
+    assert store.get_access_count("obs/cat/codes") == 0, store.get_subkeys_accessed(
+        "obs/cat/codes"
+    )
     subset = remote[
         (remote.obs["cat"] == "a").data, :
     ]  # `.data` for xarray, but should we handle internally?

From 118d6024017b9141ff0c452f4d347d7400247da2 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 16 Feb 2024 13:46:34 +0100
Subject: [PATCH 019/348] (fix): zarr anndata now passing.
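
This fix leans on zarr's consolidated metadata: `write_zarr` now calls
`zarr.convenience.consolidate_metadata` after writing, so `read_backed` can
take the fast `zarr.open_consolidated` path. A minimal sketch of that round
trip, assuming a hypothetical store path:

import numpy as np
import zarr

store = zarr.DirectoryStore("adata.zarr")  # hypothetical on-disk store
g = zarr.open_group(store, mode="a")
g.create_dataset("obs/int64", data=np.arange(10))
# Consolidation writes a single ".zmetadata" key at the root, replacing many
# small ".zattrs"/".zarray" reads -- important for high-latency (cloud) stores.
zarr.convenience.consolidate_metadata(store)
g2 = zarr.open_consolidated(store, mode="r")  # one metadata request total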
--- anndata/_io/zarr.py | 1 + anndata/experimental/backed/_io.py | 4 +++ .../tests/test_read_backed_experimental.py | 32 +------------------ pyproject.toml | 2 +- 4 files changed, 7 insertions(+), 32 deletions(-) diff --git a/anndata/_io/zarr.py b/anndata/_io/zarr.py index 864475848..5b292b75c 100644 --- a/anndata/_io/zarr.py +++ b/anndata/_io/zarr.py @@ -46,6 +46,7 @@ def callback(func, s, k, elem, dataset_kwargs, iospec): func(s, k, elem, dataset_kwargs=dataset_kwargs) write_dispatched(f, "/", adata, callback=callback, dataset_kwargs=ds_kwargs) + zarr.convenience.consolidate_metadata(f.store) def read_zarr(store: str | Path | MutableMapping | zarr.Group) -> AnnData: diff --git a/anndata/experimental/backed/_io.py b/anndata/experimental/backed/_io.py index 9eec44bfe..3067ff6a2 100644 --- a/anndata/experimental/backed/_io.py +++ b/anndata/experimental/backed/_io.py @@ -1,5 +1,6 @@ from __future__ import annotations +import warnings from pathlib import Path from typing import ( TYPE_CHECKING, @@ -45,6 +46,9 @@ def read_backed( try: f = zarr.open_consolidated(store, mode="r") except KeyError: + warnings.warn( + "Did not read zarr as consolidated. Consider consolidating your metadata." + ) has_keys = False f = zarr.open(store, mode="r") else: diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 60788ce21..c9dd505e9 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -7,7 +7,6 @@ import pytest import zarr from scipy import sparse -from zarr import DirectoryStore from anndata._core.anndata import AnnData from anndata.experimental import read_backed @@ -16,42 +15,13 @@ MaskedArray, ) from anndata.tests.helpers import ( + AccessTrackingStore, as_dense_dask_array, assert_equal, gen_adata, gen_typed_df, ) -EXEMPT_STANDARD_ZARR_KEYS = {".zarray", ".zgroup", ".zattrs"} - - -class AccessTrackingStore(DirectoryStore): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._access_count = {} - self._accessed_keys = set() - - def __getitem__(self, key): - for tracked in self._access_count: - if tracked in key and not any( - zarr_key in key for zarr_key in EXEMPT_STANDARD_ZARR_KEYS - ): - # import traceback - # traceback.print_stack() - self._access_count[tracked] += 1 - self._accessed_keys.add(key) - return super().__getitem__(key) - - def get_access_count(self, key): - return self._access_count[key] - - def get_subkeys_accessed(self, key): - return [k for k in self._accessed_keys if key in k] - - def set_key_trackers(self, keys_to_track): - for k in keys_to_track: - self._access_count[k] = 0 - @pytest.fixture( params=[sparse.csr_matrix, sparse.csc_matrix, np.array, as_dense_dask_array], diff --git a/pyproject.toml b/pyproject.toml index ad44a17c6..d3490af42 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,7 +95,7 @@ test = [ "awkward>=2.3", "pyarrow", "pytest_memray", - "xarray@git+https://github.com/ilan-gold/xarray#egg=extension_arrays", + "xarray@git+https://github.com/ilan-gold/xarray#egg=anndata_dev_branch", ] gpu = ["cupy"] From 76b353b19b3f26413aa28b0a313806e947e772a9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 22 Feb 2024 12:24:00 +0100 Subject: [PATCH 020/348] (fix): allow hdf5 arrays --- anndata/experimental/backed/_lazy_arrays.py | 58 ++++++++++++++------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/anndata/experimental/backed/_lazy_arrays.py b/anndata/experimental/backed/_lazy_arrays.py index 
819a2baff..5fad71deb 100644 --- a/anndata/experimental/backed/_lazy_arrays.py +++ b/anndata/experimental/backed/_lazy_arrays.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from functools import singledispatchmethod +from typing import TYPE_CHECKING, Generic, TypeVar, Union import pandas as pd import xarray as xr @@ -12,10 +13,34 @@ if TYPE_CHECKING: import numpy as np +K = TypeVar("K", bound=Union[H5Array, ZarrArray]) -class CategoricalArray( - xr.backends.zarr.ZarrArrayWrapper -): # Zarr works for hdf5, xarray only supports integration hdf5 in the netcdf context + +class ZarrOrHDF5Wrapper(xr.backends.zarr.ZarrArrayWrapper, Generic[K]): + @singledispatchmethod # type: ignore + def __init__(self, array: ZarrArray): + return super().__init__(array) + + @__init__.register + def _(self, array: H5Array): + self._array = array + self.shape = self._array.shape + self.dtype = self._array.dtype + + def __getitem__(self, key): + if isinstance(self._array, ZarrArray): + return super().__getitem__(key) + # adapted from https://github.com/pydata/xarray/blob/main/xarray/backends/h5netcdf_.py#L50-L58C13 + # TODO: locks? + return xr.core.indexing.explicit_indexing_adapter( + key, + self.shape, + xr.core.indexing.IndexingSupport.OUTER_1VECTOR, + lambda key: self._array[key], + ) + + +class CategoricalArray(xr.backends.BackendArray): def __init__( self, codes: ZarrArray | H5Array, @@ -29,8 +54,8 @@ def __init__( self._ordered = ordered self._drop_unused_cats = drop_unused_cats self._categories_cache = None - self._array = codes - self.shape = self._array.shape + self._codes = ZarrOrHDF5Wrapper[type(codes)](codes) + self.shape = self._codes.shape self.dtype = pd.CategoricalDtype( categories=self._categories, ordered=self._ordered ) @@ -49,7 +74,7 @@ def categories(self): # __slots__ and cached_property are incompatible return self._categories_cache def __getitem__(self, key: xr.core.indexing.ExplicitIndexer) -> np.typing.ArrayLike: - codes = super().__getitem__(key) + codes = self._codes[key] categorical_array = pd.Categorical.from_codes( codes=codes, categories=self.categories, ordered=self._ordered ) @@ -60,29 +85,24 @@ def __getitem__(self, key: xr.core.indexing.ExplicitIndexer) -> np.typing.ArrayL return xr.core.indexing.ExtensionDuckArray(categorical_array) -class MaskedArray(xr.backends.zarr.ZarrArrayWrapper): +class MaskedArray(xr.backends.BackendArray): def __init__( self, values: ZarrArray | H5Array, dtype_str: str, - *args, mask: ZarrArray | H5Array | None = None, - **kwargs, ): - self._mask = mask - self._values = values + self._mask = ZarrOrHDF5Wrapper[type(mask)](mask) + self._values = ZarrOrHDF5Wrapper[type(values)](values) self._dtype_str = dtype_str - self._array = values - self.shape = self._array.shape - self.dtype = pd.api.types.pandas_dtype(self._array.dtype) + self.shape = self._values.shape + self.dtype = pd.api.types.pandas_dtype(self._values.dtype) def __getitem__(self, key): # HACK! TODO(ilan-gold): open issue about hdf5 compat that doesn't allow initialization! 
- self._array = self._values - values = super().__getitem__(key) + values = self._values[key] if self._mask is not None: - self._array = self._mask - mask = super().__getitem__(key) + mask = self._mask[key] if self._dtype_str == "nullable-integer": # numpy does not support nan ints return xr.core.indexing.ExtensionDuckArray( From d111f046717f85a58544e3181d53f69ab2545796 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Apr 2024 17:22:43 +0200 Subject: [PATCH 021/348] (feat): `read_elem_lazy` method --- src/anndata/_io/specs/__init__.py | 6 +- src/anndata/_io/specs/lazy_methods.py | 105 +++++++++++++++++++++++ src/anndata/_io/specs/registry.py | 24 ++++-- src/anndata/tests/test_io_elementwise.py | 97 ++++++++++++++++----- 4 files changed, 206 insertions(+), 26 deletions(-) create mode 100644 src/anndata/_io/specs/lazy_methods.py diff --git a/src/anndata/_io/specs/__init__.py b/src/anndata/_io/specs/__init__.py index ceff8b3d6..8fd9898a3 100644 --- a/src/anndata/_io/specs/__init__.py +++ b/src/anndata/_io/specs/__init__.py @@ -1,21 +1,25 @@ from __future__ import annotations -from . import methods +from . import lazy_methods, methods from .registry import ( + _LAZY_REGISTRY, # noqa: F401 _REGISTRY, # noqa: F401 IOSpec, Reader, Writer, get_spec, read_elem, + read_elem_lazy, write_elem, ) __all__ = [ "methods", + "lazy_methods", "write_elem", "get_spec", "read_elem", + "read_elem_lazy", "Reader", "Writer", "IOSpec", diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py new file mode 100644 index 000000000..16c5c93f6 --- /dev/null +++ b/src/anndata/_io/specs/lazy_methods.py @@ -0,0 +1,105 @@ +from __future__ import annotations + +import dask.array as da +import h5py +import numpy as np +from scipy import sparse + +import anndata as ad +from anndata.compat import H5Array, H5Group, ZarrArray, ZarrGroup + +from .registry import _LAZY_REGISTRY, IOSpec + +# TODO: settings +stride = 100 +h5_chunks = 1000 + + +def make_dask_array(is_csc, shape, make_dask_chunk, dtype): + chunks = [None, None] + major_index = int(is_csc) + minor_index = (is_csc + 1) % 2 + chunks[minor_index] = (shape[minor_index],) + chunks[major_index] = (stride,) * (shape[major_index] // stride) + ( + shape[major_index] % stride, + ) + memory_format = [sparse.csr_matrix, sparse.csc_matrix][major_index] + da_mtx = da.map_blocks( + make_dask_chunk, + dtype=dtype, + chunks=chunks, + meta=memory_format((0, 0), dtype=np.float32), + ) + return da_mtx + + +def make_index(is_csc, stride, shape, block_id): + index = ( + slice( + block_id[is_csc] * stride, + min((block_id[is_csc] * stride) + stride, shape[0]), + ), + ) + if is_csc: + return (slice(None, None, None),) + index + return index + + +@_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) +@_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) +def read_sparse_as_dask_h5(elem, _reader): + filename = elem.file.filename + elem_name = elem.name + with h5py.File(filename, "r") as f: + e = f[elem_name] + shape = e.attrs["shape"] + encoding_type = e.attrs["encoding-type"] + dtype = e["data"].dtype + is_csc = encoding_type == "csc_matrix" + + def make_dask_chunk(block_id=None): + # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` + # https://github.com/scverse/anndata/issues/1105 + with h5py.File(filename, "r") as f: + mtx = ad.experimental.sparse_dataset(f[elem_name]) + index = make_index(is_csc, stride, shape, block_id) + chunk = mtx[*index] + return chunk + + 
return make_dask_array(is_csc, shape, make_dask_chunk, dtype) + + +@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) +@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) +def read_sparse_as_dask_zarr(elem, _reader): + shape = elem.attrs["shape"] + dtype = elem["data"].dtype + is_csc = elem.attrs["encoding-type"] == "csc_matrix" + + def make_dask_chunk(block_id=None): + mtx = ad.experimental.sparse_dataset(elem) + index = make_index(is_csc, stride, shape, block_id) + return mtx[*index] + + return make_dask_array(is_csc, shape, make_dask_chunk, dtype) + + +@_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) +def read_h5_array(elem, _reader): + if not hasattr(elem, "chunks") or elem.chunks is None: + return da.from_array(elem, chunks=(h5_chunks,) * len(elem.shape)) + return da.from_array(elem) + + +@_LAZY_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) +def read_h5_string_array(elem, _reader): + from anndata._io.h5ad import read_dataset + + elem = read_dataset(elem) + return read_h5_array(elem, _reader) + + +@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) +@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) +def read_zarr_array(elem, _reader): + return da.from_zarr(elem) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index a8357295d..b422ff223 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -44,7 +44,7 @@ def _from_read_parts( ) -> IORegistryError: # TODO: Improve error message if type exists, but version does not msg = ( - f"No {method} method registered for {spec} from {src_typ}. " + f"No {method} method registered for {spec} from {src_typ} in registry {registry}. " "You may need to update your installation of anndata." ) return cls(msg) @@ -145,9 +145,7 @@ def get_reader( if (src_type, spec, modifiers) in self.read: return self.read[(src_type, spec, modifiers)] else: - raise IORegistryError._from_read_parts( - "read", _REGISTRY.read, src_type, spec - ) + raise IORegistryError._from_read_parts("read", self.read, src_type, spec) def has_reader( self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset() @@ -176,7 +174,7 @@ def get_partial_reader( return self.read_partial[(src_type, spec, modifiers)] else: raise IORegistryError._from_read_parts( - "read_partial", _REGISTRY.read_partial, src_type, spec + "read_partial", self.read_partial, src_type, spec ) def get_spec(self, elem: Any) -> IOSpec: @@ -188,6 +186,7 @@ def get_spec(self, elem: Any) -> IOSpec: _REGISTRY = IORegistry() +_LAZY_REGISTRY = IORegistry() @singledispatch @@ -332,6 +331,21 @@ def read_elem(elem: StorageType) -> Any: return Reader(_REGISTRY).read_elem(elem) +def read_elem_lazy(elem: StorageType) -> Any: + """ + Read an element from a store lazily. + + Assumes that the element is encoded using the anndata encoding. This function will + determine the encoded type using the encoding metadata stored in elem's attributes. + + Params + ------ + elem + The stored element. 
+ """ + return Reader(_LAZY_REGISTRY).read_elem(elem) + + def write_elem( store: GroupStorageType, k: str, diff --git a/src/anndata/tests/test_io_elementwise.py b/src/anndata/tests/test_io_elementwise.py index cd43d57ca..aae470c54 100644 --- a/src/anndata/tests/test_io_elementwise.py +++ b/src/anndata/tests/test_io_elementwise.py @@ -15,7 +15,14 @@ from scipy import sparse import anndata as ad -from anndata._io.specs import _REGISTRY, IOSpec, get_spec, read_elem, write_elem +from anndata._io.specs import ( + _REGISTRY, + IOSpec, + get_spec, + read_elem, + read_elem_lazy, + write_elem, +) from anndata._io.specs.registry import IORegistryError from anndata.compat import H5Group, ZarrGroup, _read_attr from anndata.tests.helpers import ( @@ -47,6 +54,46 @@ def store(request, tmp_path) -> H5Group | ZarrGroup: file.close() +sparse_formats = ["csr", "csc"] +SIZE = 1000 + + +@pytest.fixture(scope="function", params=sparse_formats) +def sparse_format(request): + return request.param + + +def create_dense_store(store): + X = np.random.randn(SIZE, SIZE) + + write_elem(store, "X", X) + return store + + +def create_string_store(store): + X = np.arange(0, SIZE * SIZE).reshape((SIZE, SIZE)).astype(str) + + write_elem(store, "X", X) + return store + + +def create_sparse_store(sparse_format, store): + import dask.array as da + + X = sparse.random( + SIZE, + SIZE, + format=sparse_format, + density=0.01, + random_state=np.random.default_rng(), + ) + X_dask = da.from_array(X, chunks=(100, 100)) + + write_elem(store, "X", X) + write_elem(store, "X_dask", X_dask) + return store + + @pytest.mark.parametrize( "value,encoding_type", [ @@ -126,30 +173,40 @@ def test_io_spec_cupy(store, value, encoding_type): assert get_spec(store[key]) == _REGISTRY.get_spec(value) -@pytest.mark.parametrize("sparse_format", ["csr", "csc"]) -def test_dask_write_sparse(store, sparse_format): - import dask.array as da +def test_dask_write_sparse(sparse_format, store): + x_sparse_store = create_sparse_store(sparse_format, store) + X_from_disk = read_elem(x_sparse_store["X"]) + X_dask_from_disk = read_elem(x_sparse_store["X_dask"]) - X = sparse.random( - 1000, - 1000, - format=sparse_format, - density=0.01, - random_state=np.random.default_rng(), - ) - X_dask = da.from_array(X, chunks=(100, 100)) + assert_equal(X_from_disk, X_dask_from_disk) + assert_equal(dict(x_sparse_store["X"].attrs), dict(x_sparse_store["X_dask"].attrs)) - write_elem(store, "X", X) - write_elem(store, "X_dask", X_dask) + assert x_sparse_store["X_dask/indptr"].dtype == np.int64 + assert x_sparse_store["X_dask/indices"].dtype == np.int64 - X_from_disk = read_elem(store["X"]) - X_dask_from_disk = read_elem(store["X_dask"]) + +@pytest.mark.parametrize("arr_type", ["dense", "string", *sparse_formats]) +def test_read_lazy_2d_dask(arr_type, store): + if arr_type == "dense": + arr_store = create_dense_store(store) + elif arr_type == "string": + arr_store = create_string_store(store) + else: + arr_store = create_sparse_store(arr_type, store) + X_dask_from_disk = read_elem_lazy(arr_store["X"]) + X_from_disk = read_elem(arr_store["X"]) assert_equal(X_from_disk, X_dask_from_disk) - assert_equal(dict(store["X"].attrs), dict(store["X_dask"].attrs)) + random_int_indices = np.random.randint(0, SIZE, (SIZE // 10,)) + random_bool_mask = np.random.randn(SIZE) > 0 + index_slice = slice(0, SIZE // 10) + for index in [random_int_indices, index_slice, random_bool_mask]: + assert_equal(X_from_disk[index, :], X_dask_from_disk[index, :]) + assert_equal(X_from_disk[:, index], 
X_dask_from_disk[:, index]) - assert store["X_dask/indptr"].dtype == np.int64 - assert store["X_dask/indices"].dtype == np.int64 + if arr_type in {"csr", "csc"}: + assert arr_store["X_dask/indptr"].dtype == np.int64 + assert arr_store["X_dask/indices"].dtype == np.int64 def test_io_spec_raw(store): @@ -178,7 +235,7 @@ def test_write_anndata_to_root(store): ["attribute", "value"], [ ("encoding-type", "floob"), - ("encoding-version", "10000.0"), + ("encoding-version", "SIZE0.0"), ], ) def test_read_iospec_not_found(store, attribute, value): From 00be7f02cb93a2affb65ccd195a219aae8328f4e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Apr 2024 17:26:34 +0200 Subject: [PATCH 022/348] (revert): error message --- src/anndata/_io/specs/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index b422ff223..a2e62db7a 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -44,7 +44,7 @@ def _from_read_parts( ) -> IORegistryError: # TODO: Improve error message if type exists, but version does not msg = ( - f"No {method} method registered for {spec} from {src_typ} in registry {registry}. " + f"No {method} method registered for {spec} from {src_typ}. " "You may need to update your installation of anndata." ) return cls(msg) From fd635d771aa65987ede03042ffe3c29548a6b6bc Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Apr 2024 17:29:24 +0200 Subject: [PATCH 023/348] (refactor): declare `is_csc` reading elem directly in h5 --- src/anndata/_io/specs/lazy_methods.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 16c5c93f6..dcdc5160a 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -53,9 +53,8 @@ def read_sparse_as_dask_h5(elem, _reader): with h5py.File(filename, "r") as f: e = f[elem_name] shape = e.attrs["shape"] - encoding_type = e.attrs["encoding-type"] dtype = e["data"].dtype - is_csc = encoding_type == "csc_matrix" + is_csc = e.attrs["encoding-type"] == "csc_matrix" def make_dask_chunk(block_id=None): # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` From f5e7fda7049fb33a13dea821cbe2d8f1eb513988 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 12 Apr 2024 10:25:05 +0200 Subject: [PATCH 024/348] (chore): `read_elem_lazy` -> `read_elem_as_dask` --- src/anndata/_io/specs/__init__.py | 4 ++-- src/anndata/_io/specs/registry.py | 2 +- src/anndata/tests/test_io_elementwise.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/anndata/_io/specs/__init__.py b/src/anndata/_io/specs/__init__.py index 8fd9898a3..5eadfdb50 100644 --- a/src/anndata/_io/specs/__init__.py +++ b/src/anndata/_io/specs/__init__.py @@ -9,7 +9,7 @@ Writer, get_spec, read_elem, - read_elem_lazy, + read_elem_as_dask, write_elem, ) @@ -19,7 +19,7 @@ "write_elem", "get_spec", "read_elem", - "read_elem_lazy", + "read_elem_as_dask", "Reader", "Writer", "IOSpec", diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index a2e62db7a..7460d7f70 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -331,7 +331,7 @@ def read_elem(elem: StorageType) -> Any: return Reader(_REGISTRY).read_elem(elem) -def read_elem_lazy(elem: StorageType) -> Any: +def read_elem_as_dask(elem: StorageType) -> Any: """ Read an element 
from a store lazily.
 
diff --git a/src/anndata/tests/test_io_elementwise.py b/src/anndata/tests/test_io_elementwise.py
index aae470c54..07484e422 100644
--- a/src/anndata/tests/test_io_elementwise.py
+++ b/src/anndata/tests/test_io_elementwise.py
@@ -20,7 +20,7 @@
     IOSpec,
     get_spec,
     read_elem,
-    read_elem_lazy,
+    read_elem_as_dask,
     write_elem,
 )
 from anndata._io.specs.registry import IORegistryError
@@ -193,7 +193,7 @@ def test_read_lazy_2d_dask(arr_type, store):
         arr_store = create_string_store(store)
     else:
         arr_store = create_sparse_store(arr_type, store)
-    X_dask_from_disk = read_elem_lazy(arr_store["X"])
+    X_dask_from_disk = read_elem_as_dask(arr_store["X"])
     X_from_disk = read_elem(arr_store["X"])
 
     assert_equal(X_from_disk, X_dask_from_disk)

From ae5396cfc0fa6ce47463453320dc6cbe45a520e2 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 12 Apr 2024 10:28:57 +0200
Subject: [PATCH 025/348] (chore): remove string handling

---
 src/anndata/_io/specs/lazy_methods.py    |  9 ---------
 src/anndata/tests/test_io_elementwise.py | 11 +----------
 2 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index dcdc5160a..d967d7591 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -90,15 +90,6 @@ def read_h5_array(elem, _reader):
     return da.from_array(elem)
 
 
-@_LAZY_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0"))
-def read_h5_string_array(elem, _reader):
-    from anndata._io.h5ad import read_dataset
-
-    elem = read_dataset(elem)
-    return read_h5_array(elem, _reader)
-
-
 @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0"))
-@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0"))
 def read_zarr_array(elem, _reader):
     return da.from_zarr(elem)
diff --git a/src/anndata/tests/test_io_elementwise.py b/src/anndata/tests/test_io_elementwise.py
index 07484e422..2fb49162b 100644
--- a/src/anndata/tests/test_io_elementwise.py
+++ b/src/anndata/tests/test_io_elementwise.py
@@ -70,13 +70,6 @@ def create_dense_store(store):
     return store
 
 
-def create_string_store(store):
-    X = np.arange(0, SIZE * SIZE).reshape((SIZE, SIZE)).astype(str)
-
-    write_elem(store, "X", X)
-    return store
-
-
 def create_sparse_store(sparse_format, store):
     import dask.array as da
 
@@ -185,12 +178,10 @@ def test_dask_write_sparse(sparse_format, store):
     assert x_sparse_store["X_dask/indices"].dtype == np.int64
 
 
-@pytest.mark.parametrize("arr_type", ["dense", "string", *sparse_formats])
+@pytest.mark.parametrize("arr_type", ["dense", *sparse_formats])
 def test_read_lazy_2d_dask(arr_type, store):
     if arr_type == "dense":
         arr_store = create_dense_store(store)
-    elif arr_type == "string":
-        arr_store = create_string_store(store)
     else:
         arr_store = create_sparse_store(arr_type, store)
     X_dask_from_disk = read_elem_as_dask(arr_store["X"])
     X_from_disk = read_elem(arr_store["X"])

From 664336aa30511f1b28ba55d3d8b3028dbcd74eda Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 12 Apr 2024 10:36:24 +0200
Subject: [PATCH 026/348] (refactor): use `elem` for h5 where possible

---
 src/anndata/_io/specs/lazy_methods.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index d967d7591..2f392db00 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -50,11 +50,9 @@ def read_sparse_as_dask_h5(elem, _reader):
     filename = elem.file.filename
     elem_name = elem.name
-    with h5py.File(filename, "r") as f:
-        e = f[elem_name]
-        shape = e.attrs["shape"]
-        dtype = e["data"].dtype
-        is_csc = e.attrs["encoding-type"] == "csc_matrix"
+    shape = elem.attrs["shape"]
+    dtype = elem["data"].dtype
+    is_csc = elem.attrs["encoding-type"] == "csc_matrix"
 
     def make_dask_chunk(block_id=None):
         # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed`

From 52002b6eeee9aa60dc7ccac3956e8c32e497a78a Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 17 Apr 2024 12:37:34 +0200
Subject: [PATCH 027/348] (chore): remove invalid syntax

---
 src/anndata/_io/specs/lazy_methods.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 2f392db00..567561d03 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -60,7 +60,7 @@ def make_dask_chunk(block_id=None):
         with h5py.File(filename, "r") as f:
             mtx = ad.experimental.sparse_dataset(f[elem_name])
             index = make_index(is_csc, stride, shape, block_id)
-            chunk = mtx[*index]
+            chunk = mtx[index]
             return chunk
 
     return make_dask_array(is_csc, shape, make_dask_chunk, dtype)
@@ -76,7 +76,7 @@ def make_dask_chunk(block_id=None):
         mtx = ad.experimental.sparse_dataset(elem)
         index = make_index(is_csc, stride, shape, block_id)
-        return mtx[*index]
+        return mtx[index]
 
     return make_dask_array(is_csc, shape, make_dask_chunk, dtype)

From aa1006ea0813a0f6e48d378927a303e5184b5c9b Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 17 Apr 2024 13:44:03 +0200
Subject: [PATCH 028/348] (fix): put dask import inside function

---
 src/anndata/_io/specs/lazy_methods.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 567561d03..a4445c77d 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import dask.array as da
 import h5py
 import numpy as np
 from scipy import sparse
@@ -16,6 +15,8 @@
 
 
 def make_dask_array(is_csc, shape, make_dask_chunk, dtype):
+    import dask.array as da
+
     chunks = [None, None]
     major_index = int(is_csc)
     minor_index = (is_csc + 1) % 2
@@ -83,6 +84,8 @@ def make_dask_chunk(block_id=None):
 
 @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0"))
 def read_h5_array(elem, _reader):
+    import dask.array as da
+
     if not hasattr(elem, "chunks") or elem.chunks is None:
         return da.from_array(elem, chunks=(h5_chunks,) * len(elem.shape))
     return da.from_array(elem)
@@ -90,4 +93,6 @@ def read_h5_array(elem, _reader):
 
 @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0"))
 def read_zarr_array(elem, _reader):
+    import dask.array as da
+
     return da.from_zarr(elem)

From dda7d8306f242ccf1f3682151e845cecd3467044 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 17 Apr 2024 16:45:23 +0200
Subject: [PATCH 029/348] (refactor): try maybe open?
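
The `maybe_open_h5` context manager introduced below unifies the two cases the
sparse reader has to handle: a zarr group can be captured directly by each dask
task, while an h5py handle must be reopened by filename per task because h5py
objects cannot be pickled for `dask.distributed`. A minimal sketch of that
asymmetry, assuming hypothetical file names:

import pickle

import h5py
import zarr

z = zarr.open_group("example.zarr", mode="a")  # hypothetical store
pickle.dumps(z)  # fine: zarr groups round-trip through pickle

with h5py.File("example.h5", "a") as f:  # hypothetical file
    f["X"] = [1.0, 2.0, 3.0]
    try:
        pickle.dumps(f["X"])
    except TypeError:
        pass  # h5py objects cannot be pickled, hence reopening per chunk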
--- src/anndata/_io/specs/lazy_methods.py | 43 +++++++++++++++------------ 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index a4445c77d..6931fae0f 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -1,5 +1,8 @@ from __future__ import annotations +from contextlib import contextmanager +from pathlib import Path + import h5py import numpy as np from scipy import sparse @@ -46,11 +49,28 @@ def make_index(is_csc, stride, shape, block_id): return index +@contextmanager +def maybe_open_h5(filename_or_elem: str | ZarrGroup, elem_name: str): + if isinstance(filename_or_elem, str): + file = h5py.File(filename_or_elem, "r") + try: + yield file[elem_name] + finally: + file.close() + else: + try: + yield filename_or_elem + finally: + pass + + @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) +@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) +@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse_as_dask_h5(elem, _reader): - filename = elem.file.filename - elem_name = elem.name + filename_or_elem = elem.file.filename if isinstance(elem, H5Group) else elem + elem_name = elem.name if isinstance(elem, H5Group) else Path(elem.path).name shape = elem.attrs["shape"] dtype = elem["data"].dtype is_csc = elem.attrs["encoding-type"] == "csc_matrix" @@ -58,8 +78,8 @@ def read_sparse_as_dask_h5(elem, _reader): def make_dask_chunk(block_id=None): # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` # https://github.com/scverse/anndata/issues/1105 - with h5py.File(filename, "r") as f: - mtx = ad.experimental.sparse_dataset(f[elem_name]) + with maybe_open_h5(filename_or_elem, elem_name) as f: + mtx = ad.experimental.sparse_dataset(f) index = make_index(is_csc, stride, shape, block_id) chunk = mtx[index] return chunk @@ -67,21 +87,6 @@ def make_dask_chunk(block_id=None): return make_dask_array(is_csc, shape, make_dask_chunk, dtype) -@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) -@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) -def read_sparse_as_dask_zarr(elem, _reader): - shape = elem.attrs["shape"] - dtype = elem["data"].dtype - is_csc = elem.attrs["encoding-type"] == "csc_matrix" - - def make_dask_chunk(block_id=None): - mtx = ad.experimental.sparse_dataset(elem) - index = make_index(is_csc, stride, shape, block_id) - return mtx[index] - - return make_dask_array(is_csc, shape, make_dask_chunk, dtype) - - @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array(elem, _reader): import dask.array as da From 5be365d035bbe4ccdea22c411e6ce4f0ebeded27 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 17 May 2024 14:32:57 +0200 Subject: [PATCH 030/348] (feat): upgrade xarray --- pyproject.toml | 4 +- .../experimental/backed/_lazy_arrays.py | 23 ++-- src/anndata/tests/helpers.py | 10 +- tests/test_read_backed_experimental.py | 120 ------------------ 4 files changed, 16 insertions(+), 141 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d6877b09d..7e7f1a303 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,12 +100,14 @@ test = [ "pytest-mock" ] gpu = ["cupy"] -lazy = ["xarray"] +xarray = ["xarray@git+https://github.com/ilan-gold/xarray#egg=ig/fix_equality_checl"] [tool.hatch.version] 
source = "vcs" [tool.hatch.build.hooks.vcs] version-file = "src/anndata/_version.py" +[tool.hatch.metadata] +allow-direct-references = true [tool.coverage.run] source_pkgs = ["anndata"] diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 5fad71deb..154d610fc 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -1,7 +1,7 @@ from __future__ import annotations from functools import singledispatchmethod -from typing import TYPE_CHECKING, Generic, TypeVar, Union +from typing import Generic, TypeVar, Union import pandas as pd import xarray as xr @@ -10,9 +10,6 @@ from anndata._core.views import as_view from anndata.compat import H5Array, ZarrArray -if TYPE_CHECKING: - import numpy as np - K = TypeVar("K", bound=Union[H5Array, ZarrArray]) @@ -46,7 +43,7 @@ def __init__( codes: ZarrArray | H5Array, categories: ZarrArray | H5Array, ordered: bool, - drop_unused_cats: bool, + drop_unused_cats: bool = False, *args, **kwargs, ): @@ -73,16 +70,18 @@ def categories(self): # __slots__ and cached_property are incompatible self._categories_cache = read_dataset(self._categories) return self._categories_cache - def __getitem__(self, key: xr.core.indexing.ExplicitIndexer) -> np.typing.ArrayLike: + def __getitem__( + self, key: xr.core.indexing.ExplicitIndexer + ) -> xr.core.extension_array.PandasExtensionArray: codes = self._codes[key] categorical_array = pd.Categorical.from_codes( codes=codes, categories=self.categories, ordered=self._ordered ) if self._drop_unused_cats: - return xr.core.indexing.ExtensionDuckArray( + return xr.core.extension_array.PandasExtensionArray( categorical_array.remove_unused_categories() ) - return xr.core.indexing.ExtensionDuckArray(categorical_array) + return xr.core.extension_array.PandasExtensionArray(categorical_array) class MaskedArray(xr.backends.BackendArray): @@ -98,21 +97,21 @@ def __init__( self.shape = self._values.shape self.dtype = pd.api.types.pandas_dtype(self._values.dtype) - def __getitem__(self, key): + def __getitem__(self, key) -> xr.core.extension_array.PandasExtensionArray: # HACK! TODO(ilan-gold): open issue about hdf5 compat that doesn't allow initialization! 
values = self._values[key] if self._mask is not None: mask = self._mask[key] if self._dtype_str == "nullable-integer": # numpy does not support nan ints - return xr.core.indexing.ExtensionDuckArray( + return xr.core.extension_array.PandasExtensionArray( pd.arrays.IntegerArray(values, mask=mask) ) elif self._dtype_str == "nullable-boolean": - return xr.core.indexing.ExtensionDuckArray( + return xr.core.extension_array.PandasExtensionArray( pd.arrays.BooleanArray(values, mask=mask) ) - return xr.core.indexing.ExtensionDuckArray(pd.array(values)) + return xr.core.extension_array.PandasExtensionArray(pd.array(values)) @_subset.register(xr.DataArray) diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index edbc5eccf..e0a09f6b0 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -859,9 +859,9 @@ def shares_memory_sparse(x, y): class AccessTrackingStore(zarr.DirectoryStore): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self._access_count = {} + self._access_count = defaultdict(int) self._accessed = defaultdict(set) - self._accessed_keys = {} + self._accessed_keys = defaultdict(list) def __getitem__(self, key): for tracked in self._access_count: @@ -874,12 +874,6 @@ def __getitem__(self, key): def get_access_count(self, key): return self._access_count[key] - def set_key_trackers(self, keys_to_track): - if isinstance(keys_to_track, str): - keys_to_track = [keys_to_track] - for k in keys_to_track: - self._access_count[k] = 0 - def get_subkeys_accessed(self, key): return self._accessed[key] diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py index c9dd505e9..7633ec859 100644 --- a/tests/test_read_backed_experimental.py +++ b/tests/test_read_backed_experimental.py @@ -187,7 +187,6 @@ def test_access_count_obs_var(tmp_path, mtx_format): ) orig.write_zarr(orig_pth) store = AccessTrackingStore(orig_pth) - store.set_key_trackers(["obs/int64", "var/int64", "obs/cat/codes", "X"]) remote = read_backed(store) # a series of methods that should __not__ read in any data remote.X # the initial (non-subset) access to `X` should not read in data @@ -275,122 +274,3 @@ def test_view_of_view_to_memory(tmp_path, mtx_format, dskfmt): subsetted_subsetted_adata, remote[:, subset_var][:, subset_subset_var].to_memory(), ) - - -def test_lazy_categorical_array_properties(categorical_lazy_arr): - assert len(categorical_lazy_arr[0:3]) == 3 - assert type(categorical_lazy_arr[0:3]) == pd.Categorical - assert len(categorical_lazy_arr[...]) == len(categorical_lazy_arr) - assert type(categorical_lazy_arr[...]) == pd.Categorical - - -def test_lazy_categorical_array_equality(categorical_lazy_arr): - assert (categorical_lazy_arr[0] == "foo").all() - assert (categorical_lazy_arr[3:5] == "bar").all() - assert (categorical_lazy_arr == "foo").any() - - -def test_lazy_categorical_array_subset_subset(categorical_lazy_arr): - subset_susbet = categorical_lazy_arr[0:10][5:10] - assert len(subset_susbet) == 5 - assert type(subset_susbet) == pd.Categorical - assert ( - subset_susbet[...] 
- == pd.Categorical.from_codes( - codes=[2, 2, 1, 2, 0], - categories=["foo", "bar", "jazz"], - ordered=False, - ).remove_unused_categories() - ).all() - - -def test_nullable_boolean_array_properties(nullable_boolean_lazy_arr): - assert len(nullable_boolean_lazy_arr[0:3]) == 3 - assert type(nullable_boolean_lazy_arr[0:3]) == pd.arrays.BooleanArray - assert len(nullable_boolean_lazy_arr[...]) == len(nullable_boolean_lazy_arr) - assert type(nullable_boolean_lazy_arr[...]) == pd.arrays.BooleanArray - - -def test_nullable_boolean_array_equality(nullable_boolean_lazy_arr): - assert (nullable_boolean_lazy_arr[0] == pd.NA).all() - assert (nullable_boolean_lazy_arr[3:5] == pd.NA).all() - assert (nullable_boolean_lazy_arr[5:7] == np.array([True, False])).all() - - -def test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr): - subset_susbet = nullable_boolean_lazy_arr[0:10][5:10] - assert len(subset_susbet) == 5 - assert type(subset_susbet) == pd.arrays.BooleanArray - assert ( - subset_susbet[...] - == pd.arrays.BooleanArray( - values=np.array([True, False, False, True, True]), - mask=np.array([False, False, True, False, True]), - ) - ).all() - - -def test_nullable_boolean_array_no_mask_equality(nullable_boolean_lazy_arr_no_mask): - assert nullable_boolean_lazy_arr_no_mask[0] == True # noqa - assert (nullable_boolean_lazy_arr_no_mask[3:5] == False).all() # noqa - assert (nullable_boolean_lazy_arr_no_mask[5:7] == np.array([True, False])).all() - - -def test_nullable_boolean_array_no_mask_subset_subset( - nullable_boolean_lazy_arr_no_mask, -): - subset_susbet = nullable_boolean_lazy_arr_no_mask[0:10][5:10] - assert len(subset_susbet) == 5 - assert type(subset_susbet) == pd.arrays.BooleanArray - assert ( - subset_susbet[...] - == pd.array( - np.array([True, False, False, True, True]), - ) - ).all() - - -def test_nullable_integer_array_properties(nullable_integer_lazy_arr): - assert len(nullable_integer_lazy_arr[0:3]) == 3 - assert type(nullable_integer_lazy_arr[0:3]) == pd.arrays.IntegerArray - assert len(nullable_integer_lazy_arr[...]) == len(nullable_integer_lazy_arr) - assert type(nullable_integer_lazy_arr[...]) == pd.arrays.IntegerArray - - -def test_nullable_integer_array_equality(nullable_integer_lazy_arr): - assert (nullable_integer_lazy_arr[0] == pd.NA).all() - assert (nullable_integer_lazy_arr[3:5] == pd.NA).all() - assert (nullable_integer_lazy_arr[5:7] == np.array([2, 2])).all() - - -def test_nullable_integer_array_subset_subset(nullable_integer_lazy_arr): - subset_susbet = nullable_integer_lazy_arr[0:10][5:10] - assert len(subset_susbet) == 5 - assert type(subset_susbet) == pd.arrays.IntegerArray - assert ( - subset_susbet[...] - == pd.arrays.IntegerArray( - values=np.array([2, 2, 1, 2, 0]), - mask=np.array([False, False, True, False, True]), - ) - ).all() - - -def test_nullable_integer_array_no_mask_equality(nullable_integer_lazy_arr_no_mask): - assert (nullable_integer_lazy_arr_no_mask[0] == pd.NA).all() - assert (nullable_integer_lazy_arr_no_mask[3:5] == 1).all() - assert (nullable_integer_lazy_arr_no_mask[5:7] == np.array([2, 2])).all() - - -def test_nullable_integer_array_no_mask_subset_subset( - nullable_integer_lazy_arr_no_mask, -): - subset_susbet = nullable_integer_lazy_arr_no_mask[0:10][5:10] - assert len(subset_susbet) == 5 - assert type(subset_susbet) == pd.arrays.IntegerArray - assert ( - subset_susbet[...] 
-        == pd.array(
-            np.array([2, 2, 1, 2, 0]),
-        )
-    ).all()

From 1fc4cc354bab82d075d6f23a85f40831b9bf6e99 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 3 Jun 2024 14:00:15 +0200
Subject: [PATCH 031/348] (fix): revert `encoding-version`

---
 tests/test_io_elementwise.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index 09872994c..d3ddcc5a9 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -243,7 +243,7 @@ def test_write_anndata_to_root(store):
     ["attribute", "value"],
     [
         ("encoding-type", "floob"),
-        ("encoding-version", "SIZE0.0"),
+        ("encoding-version", "10000.0"),
     ],
 )
 def test_read_iospec_not_found(store, attribute, value):

From 5ca71eaaa62a4eb75780981c93fb2643ae2bf416 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 3 Jun 2024 14:00:56 +0200
Subject: [PATCH 032/348] (chore): document `create_sparse_store` test function

---
 tests/test_io_elementwise.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index d3ddcc5a9..12d5be98f 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -5,6 +5,7 @@
 from __future__ import annotations

 import re
+from typing import Literal, TypeVar

 import h5py
 import numpy as np
@@ -70,7 +71,21 @@ def create_dense_store(store):
     return store


-def create_sparse_store(sparse_format, store):
+G = TypeVar("G", bound=H5Group | ZarrGroup)
+
+
+def create_sparse_store(sparse_format: Literal["csc", "csr"], store: G) -> G:
+    """Write a sparse matrix into `store`, both directly and wrapped by dask.
+
+    Parameters
+    ----------
+    sparse_format
+        The sparse encoding ("csc" or "csr") of the matrix to be written.
+    store
+        The group to write into.
+
+    Returns
+    -------
+    The store, with a key `X` holding a sparse matrix and a key `X_dask` holding that same array wrapped by dask.
+    """
     import dask.array as da

     X = sparse.random(
@@ -80,7 +95,9 @@ def create_sparse_store(sparse_format, store):
         density=0.01,
         random_state=np.random.default_rng(),
     )
-    X_dask = da.from_array(X, chunks=(100, 100))
+    X_dask = da.from_array(
+        X,
+        chunks=(100 if sparse_format == "csr" else SIZE, SIZE if sparse_format == "csr" else 100),
+    )

     write_elem(store, "X", X)
     write_elem(store, "X_dask", X_dask)

From 3672c187a539af72bed3bab1b6a1858ee5ef787e Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 3 Jun 2024 14:11:30 +0200
Subject: [PATCH 033/348] (chore): sort indices to prevent warning

---
 tests/test_io_elementwise.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index 12d5be98f..da34f621c 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -206,6 +206,7 @@ def test_read_lazy_2d_dask(arr_type, store):
     assert_equal(X_from_disk, X_dask_from_disk)

     random_int_indices = np.random.randint(0, SIZE, (SIZE // 10,))
+    random_int_indices.sort()
     random_bool_mask = np.random.randn(SIZE) > 0
     index_slice = slice(0, SIZE // 10)
     for index in [random_int_indices, index_slice, random_bool_mask]:

From 33c35998e1fc9cb61992c707942b503b30a3d8da Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 3 Jun 2024 14:12:09 +0200
Subject: [PATCH 034/348] (fix): remove utility function `make_dask_array`

---
 src/anndata/_io/specs/lazy_methods.py | 38 ++++++++++++---------------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 6931fae0f..42d01def1 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -17,26 +17,6 @@
 h5_chunks = 1000

-def make_dask_array(is_csc, shape, make_dask_chunk, dtype): - import dask.array as da - - chunks = [None, None] - major_index = int(is_csc) - minor_index = (is_csc + 1) % 2 - chunks[minor_index] = (shape[minor_index],) - chunks[major_index] = (stride,) * (shape[major_index] // stride) + ( - shape[major_index] % stride, - ) - memory_format = [sparse.csr_matrix, sparse.csc_matrix][major_index] - da_mtx = da.map_blocks( - make_dask_chunk, - dtype=dtype, - chunks=chunks, - meta=memory_format((0, 0), dtype=np.float32), - ) - return da_mtx - - def make_index(is_csc, stride, shape, block_id): index = ( slice( @@ -69,6 +49,8 @@ def maybe_open_h5(filename_or_elem: str | ZarrGroup, elem_name: str): @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse_as_dask_h5(elem, _reader): + import dask.array as da + filename_or_elem = elem.file.filename if isinstance(elem, H5Group) else elem elem_name = elem.name if isinstance(elem, H5Group) else Path(elem.path).name shape = elem.attrs["shape"] @@ -84,7 +66,21 @@ def make_dask_chunk(block_id=None): chunk = mtx[index] return chunk - return make_dask_array(is_csc, shape, make_dask_chunk, dtype) + chunks = [None, None] + major_index = int(is_csc) + minor_index = (is_csc + 1) % 2 + chunks[minor_index] = (shape[minor_index],) + chunks[major_index] = (stride,) * (shape[major_index] // stride) + ( + shape[major_index] % stride, + ) + memory_format = [sparse.csr_matrix, sparse.csc_matrix][major_index] + da_mtx = da.map_blocks( + make_dask_chunk, + dtype=dtype, + chunks=chunks, + meta=memory_format((0, 0), dtype=np.float32), + ) + return da_mtx @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) From 157e7103e4c23304d22da78b595814a91419af57 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 3 Jun 2024 14:13:01 +0200 Subject: [PATCH 035/348] (chore): `read_sparse_as_dask_h5` -> `read_sparse_as_dask` --- src/anndata/_io/specs/lazy_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 42d01def1..376257759 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -48,7 +48,7 @@ def maybe_open_h5(filename_or_elem: str | ZarrGroup, elem_name: str): @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) -def read_sparse_as_dask_h5(elem, _reader): +def read_sparse_as_dask(elem, _reader): import dask.array as da filename_or_elem = elem.file.filename if isinstance(elem, H5Group) else elem From 375000d2cb6ad2cafcddd1938ad56a50202a432c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 3 Jun 2024 14:28:36 +0200 Subject: [PATCH 036/348] (feat): make params of `h5_chunks` and `stride` --- src/anndata/_io/specs/lazy_methods.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 376257759..579b9f741 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -12,10 +12,6 @@ from .registry import _LAZY_REGISTRY, IOSpec -# TODO: settings -stride = 100 -h5_chunks = 1000 - def make_index(is_csc, stride, shape, block_id): index = ( @@ -48,7 +44,7 @@ def maybe_open_h5(filename_or_elem: str | ZarrGroup, elem_name: str): 
@_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) -def read_sparse_as_dask(elem, _reader): +def read_sparse_as_dask(elem, _reader, stride: int = 100): import dask.array as da filename_or_elem = elem.file.filename if isinstance(elem, H5Group) else elem @@ -84,11 +80,11 @@ def make_dask_chunk(block_id=None): @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) -def read_h5_array(elem, _reader): +def read_h5_array(elem, _reader, chunk_size: int = 1000): import dask.array as da if not hasattr(elem, "chunks") or elem.chunks is None: - return da.from_array(elem, chunks=(h5_chunks,) * len(elem.shape)) + return da.from_array(elem, chunks=(chunk_size,) * len(elem.shape)) return da.from_array(elem) From 241904a6860b311f64c69321137cda43e76c76f0 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 3 Jun 2024 14:49:45 +0200 Subject: [PATCH 037/348] (chore): add distributed test --- tests/test_io_elementwise.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index da34f621c..fc773866e 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -218,6 +218,20 @@ def test_read_lazy_2d_dask(arr_type, store): assert arr_store["X_dask/indices"].dtype == np.int64 +def test_read_lazy_h5_cluster(sparse_format, tmp_path): + import dask.distributed as dd + + file = h5py.File(tmp_path / "test.h5", "w") + store = file["/"] + arr_store = create_sparse_store(sparse_format, store) + X_dask_from_disk = read_elem_as_dask(arr_store["X"]) + X_from_disk = read_elem(arr_store["X"]) + file.close() + with dd.LocalCluster(n_workers=1, threads_per_worker=1) as cluster: + with dd.Client(cluster) as client: # noqa: F841 + assert_equal(X_from_disk, X_dask_from_disk) + + @pytest.mark.parametrize("sparse_format", ["csr", "csc"]) def test_write_indptr_dtype_override(store, sparse_format): X = sparse.random( From 42d0d2212c77b6b88c787b4d2b18db382a3a9eb0 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 3 Jun 2024 14:51:04 +0200 Subject: [PATCH 038/348] (fix): `TypeVar` bind --- tests/test_io_elementwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index fc773866e..f71d32117 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -71,7 +71,7 @@ def create_dense_store(store): return store -G = TypeVar("G", bound=H5Group | ZarrGroup) +G = TypeVar("G", H5Group, ZarrGroup) def create_sparse_store(sparse_format: Literal["csc", "csr"], store: G) -> G: From 0bba2c062c653bcd2a565c379e8ba8af44f98096 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 4 Jun 2024 10:28:43 +0200 Subject: [PATCH 039/348] (chore): release note --- docs/release-notes/0.10.8.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/release-notes/0.10.8.md b/docs/release-notes/0.10.8.md index 52b743866..dbcd646a0 100644 --- a/docs/release-notes/0.10.8.md +++ b/docs/release-notes/0.10.8.md @@ -13,3 +13,5 @@ ```{rubric} Performance ``` + +* Add `~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {pr}`1469` {user}`ilan-gold` From 0d0b43a3617af616a67a4d716b492055daa15de5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 5 Jun 2024 13:26:13 +0200 Subject: [PATCH 040/348] (chore): `0.10.8` -> `0.11.0` --- docs/release-notes/0.10.8.md | 2 -- 
docs/release-notes/0.11.0.md | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/release-notes/0.10.8.md b/docs/release-notes/0.10.8.md index dbcd646a0..52b743866 100644 --- a/docs/release-notes/0.10.8.md +++ b/docs/release-notes/0.10.8.md @@ -13,5 +13,3 @@ ```{rubric} Performance ``` - -* Add `~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {pr}`1469` {user}`ilan-gold` diff --git a/docs/release-notes/0.11.0.md b/docs/release-notes/0.11.0.md index f19202871..3a883f415 100644 --- a/docs/release-notes/0.11.0.md +++ b/docs/release-notes/0.11.0.md @@ -7,6 +7,7 @@ * Add `should_remove_unused_categories` option to `anndata.settings` to override current behavior. Default is `True` (i.e., previous behavior). Please refer to the [documentation](https://anndata.readthedocs.io/en/latest/generated/anndata.settings.html) for usage. {pr}`1340` {user}`ilan-gold` * `scipy.sparse.csr_array` and `scipy.sparse.csc_array` are now supported when constructing `AnnData` objects {pr}`1028` {user}`ilan-gold` {user}`isaac-virshup` * Add `should_check_uniqueness` option to `anndata.settings` to override current behavior. Default is `True` (i.e., previous behavior). Please refer to the [documentation](https://anndata.readthedocs.io/en/latest/generated/anndata.settings.html) for usage. {pr}`1507` {user}`ilan-gold` +* Add `~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {pr}`1469` {user}`ilan-gold` ```{rubric} Bugfix ``` From c935fe02dcd938a17af166e23d19cf04b6389963 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 26 Jun 2024 10:54:30 +0200 Subject: [PATCH 041/348] (fix): `ruff` for default `pytest.fixture` `scope` --- tests/test_io_elementwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index f447e1fb4..30d728a29 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -59,7 +59,7 @@ def store(request, tmp_path) -> H5Group | ZarrGroup: SIZE = 1000 -@pytest.fixture(scope="function", params=sparse_formats) +@pytest.fixture(params=sparse_formats) def sparse_format(request): return request.param From 23e0ea2f082225051cc1b0c7588b72b4179ab4a5 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Mon, 1 Jul 2024 17:17:30 +0200 Subject: [PATCH 042/348] Apply suggestions from code review Co-authored-by: Philipp A. 
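
The applied suggestions: a simplified `make_index` built around a single
`index1d` slice, an early return in `maybe_open_h5` for the zarr case, and an
`np.divmod`-based computation of the major-axis chunk layout. A minimal sketch
of that layout computation (hypothetical numbers: a major axis of length 250
and a stride of 100):

    import numpy as np

    stride = 100
    n_strides, rest = np.divmod(250, stride)  # -> (2, 50)
    chunks_major = (stride,) * n_strides + (rest,)  # -> (100, 100, 50)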
--- src/anndata/_io/specs/lazy_methods.py | 46 +++++++++++---------------- tests/test_io_elementwise.py | 6 ++-- 2 files changed, 23 insertions(+), 29 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 579b9f741..995099c79 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -14,30 +14,25 @@ def make_index(is_csc, stride, shape, block_id): - index = ( - slice( - block_id[is_csc] * stride, - min((block_id[is_csc] * stride) + stride, shape[0]), - ), + index1d = slice( + block_id[is_csc] * stride, + min((block_id[is_csc] * stride) + stride, shape[0]), ) if is_csc: - return (slice(None, None, None),) + index - return index + return (slice(None, None, None), index1d) + return (index1d,) @contextmanager -def maybe_open_h5(filename_or_elem: str | ZarrGroup, elem_name: str): - if isinstance(filename_or_elem, str): - file = h5py.File(filename_or_elem, "r") - try: - yield file[elem_name] - finally: - file.close() - else: - try: - yield filename_or_elem - finally: - pass +def maybe_open_h5(path_or_group: Path | ZarrGroup, elem_name: str): + if not isinstance(path_or_group, Path): + yield path_or_group + return + file = h5py.File(path_or_group, "r") + try: + yield file[elem_name] + finally: + file.close() @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) @@ -62,14 +57,11 @@ def make_dask_chunk(block_id=None): chunk = mtx[index] return chunk - chunks = [None, None] - major_index = int(is_csc) - minor_index = (is_csc + 1) % 2 - chunks[minor_index] = (shape[minor_index],) - chunks[major_index] = (stride,) * (shape[major_index] // stride) + ( - shape[major_index] % stride, - ) - memory_format = [sparse.csr_matrix, sparse.csc_matrix][major_index] + n_strides, rest = np.divmod(shape[major_index], stride) + chunks_major = (stride,) * n_strides + (rest,) + chunks_minor = (shape[minor_index],) + chunks = (chunks_minor, chunks_major) if is_csc else (chunks_major, chunks_minor) + memory_format = sparse.csc_matrix if is_csc else sparse.csr_matrix da_mtx = da.map_blocks( make_dask_chunk, dtype=dtype, diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index 30d728a29..732c3b641 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -247,8 +247,10 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path): X_dask_from_disk = read_elem_as_dask(arr_store["X"]) X_from_disk = read_elem(arr_store["X"]) file.close() - with dd.LocalCluster(n_workers=1, threads_per_worker=1) as cluster: - with dd.Client(cluster) as client: # noqa: F841 + with ( + dd.LocalCluster(n_workers=1, threads_per_worker=1) as cluster, + dd.Client(cluster) as _client, + ): assert_equal(X_from_disk, X_dask_from_disk) From 5b96c771d41a23f33af7e38f2a0f790cb371be2d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 1 Jul 2024 17:27:39 +0200 Subject: [PATCH 043/348] (fix): `Any` to `DaskArray` --- src/anndata/_io/specs/registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 7460d7f70..21f52beac 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Any from anndata._io.utils import report_read_key_on_error, report_write_key_on_error -from anndata.compat import _read_attr +from anndata.compat import DaskArray, _read_attr if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable @@ -331,7 
+331,7 @@ def read_elem(elem: StorageType) -> Any: return Reader(_REGISTRY).read_elem(elem) -def read_elem_as_dask(elem: StorageType) -> Any: +def read_elem_as_dask(elem: StorageType) -> DaskArray: """ Read an element from a store lazily. From 0907a4ea086209af7394502d60b96e7d75738c90 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 1 Jul 2024 17:30:58 +0200 Subject: [PATCH 044/348] (fix): type `make_index` + fix undeclared --- src/anndata/_io/specs/lazy_methods.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 995099c79..cc860aaca 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -2,6 +2,7 @@ from contextlib import contextmanager from pathlib import Path +from typing import Literal, overload import h5py import numpy as np @@ -13,7 +14,19 @@ from .registry import _LAZY_REGISTRY, IOSpec -def make_index(is_csc, stride, shape, block_id): +@overload +def make_index( + *, is_csc: Literal[True], stride: int, shape: tuple[int, int], block_id: int +) -> tuple[slice, slice]: ... +@overload +def make_index( + *, is_csc: Literal[False], stride: int, shape: tuple[int, int], block_id: int +) -> tuple[slice]: ... + + +def make_index( + *, is_csc: bool, stride: int, shape: tuple[int, int], block_id: int +) -> tuple[slice, slice] | tuple[slice]: index1d = slice( block_id[is_csc] * stride, min((block_id[is_csc] * stride) + stride, shape[0]), @@ -47,6 +60,8 @@ def read_sparse_as_dask(elem, _reader, stride: int = 100): shape = elem.attrs["shape"] dtype = elem["data"].dtype is_csc = elem.attrs["encoding-type"] == "csc_matrix" + major_index = int(is_csc) + minor_index = int(not is_csc) def make_dask_chunk(block_id=None): # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` From 20ced167d07851d24acd24bca0a5aa03b7abcd7c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 1 Jul 2024 15:31:23 +0000 Subject: [PATCH 045/348] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_io_elementwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index 732c3b641..204b4734f 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -251,7 +251,7 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path): dd.LocalCluster(n_workers=1, threads_per_worker=1) as cluster, dd.Client(cluster) as _client, ): - assert_equal(X_from_disk, X_dask_from_disk) + assert_equal(X_from_disk, X_dask_from_disk) @pytest.mark.parametrize("sparse_format", ["csr", "csc"]) From bb6607e8263c1c4560b2e4ca92a09d4352d3d9c4 Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Mon, 1 Jul 2024 18:02:20 +0200 Subject: [PATCH 046/348] fix rest --- src/anndata/_io/specs/lazy_methods.py | 43 +++++++++++++++++---------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index cc860aaca..e1131b7e7 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -1,7 +1,7 @@ from __future__ import annotations from contextlib import contextmanager -from pathlib import Path +from pathlib import Path, PurePosixPath from typing import Literal, overload import h5py @@ -16,16 +16,24 @@ @overload def make_index( - *, is_csc: Literal[True], stride: int, shape: tuple[int, int], block_id: int + *, + is_csc: Literal[True], + stride: int, + shape: tuple[int, int], + block_id: tuple[int, int], ) -> tuple[slice, slice]: ... @overload def make_index( - *, is_csc: Literal[False], stride: int, shape: tuple[int, int], block_id: int + *, + is_csc: Literal[False], + stride: int, + shape: tuple[int, int], + block_id: tuple[int, int], ) -> tuple[slice]: ... def make_index( - *, is_csc: bool, stride: int, shape: tuple[int, int], block_id: int + *, is_csc: bool, stride: int, shape: tuple[int, int], block_id: tuple[int, int] ) -> tuple[slice, slice] | tuple[slice]: index1d = slice( block_id[is_csc] * stride, @@ -52,29 +60,32 @@ def maybe_open_h5(path_or_group: Path | ZarrGroup, elem_name: str): @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) -def read_sparse_as_dask(elem, _reader, stride: int = 100): +def read_sparse_as_dask(elem: H5Group | ZarrGroup, _reader, stride: int = 100): import dask.array as da - filename_or_elem = elem.file.filename if isinstance(elem, H5Group) else elem - elem_name = elem.name if isinstance(elem, H5Group) else Path(elem.path).name - shape = elem.attrs["shape"] + path_or_group = Path(elem.file.filename) if isinstance(elem, H5Group) else elem + elem_name = ( + elem.name if isinstance(elem, H5Group) else PurePosixPath(elem.path).name + ) + shape: tuple[int, int] = elem.attrs["shape"] dtype = elem["data"].dtype - is_csc = elem.attrs["encoding-type"] == "csc_matrix" - major_index = int(is_csc) - minor_index = int(not is_csc) + is_csc: bool = elem.attrs["encoding-type"] == "csc_matrix" - def make_dask_chunk(block_id=None): + def make_dask_chunk(block_id: tuple[int, int]): # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` # https://github.com/scverse/anndata/issues/1105 - with maybe_open_h5(filename_or_elem, elem_name) as f: + with maybe_open_h5(path_or_group, elem_name) as f: mtx = ad.experimental.sparse_dataset(f) - index = make_index(is_csc, stride, shape, block_id) + index = make_index( + is_csc=is_csc, stride=stride, shape=shape, block_id=block_id + ) chunk = mtx[index] return chunk - n_strides, rest = np.divmod(shape[major_index], stride) + shape_minor, shape_major = shape if is_csc else shape[::-1] + n_strides, rest = np.divmod(shape_major, stride) chunks_major = (stride,) * n_strides + (rest,) - chunks_minor = (shape[minor_index],) + chunks_minor = (shape_minor,) chunks = (chunks_minor, chunks_major) if is_csc else (chunks_major, chunks_minor) memory_format = sparse.csc_matrix if is_csc else sparse.csr_matrix da_mtx = da.map_blocks( From 419691ba363d3d8028dc7f1c018a46d28c84c211 Mon Sep 17 00:00:00 2001 From: ilan-gold 
Date: Tue, 2 Jul 2024 10:17:46 +0200 Subject: [PATCH 047/348] (fix): use `chunks` kwarg --- src/anndata/_io/specs/lazy_methods.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index e1131b7e7..0f8bbbf58 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -98,11 +98,13 @@ def make_dask_chunk(block_id: tuple[int, int]): @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) -def read_h5_array(elem, _reader, chunk_size: int = 1000): +def read_h5_array(elem, _reader, chunks: tuple[int] | None = None): import dask.array as da if not hasattr(elem, "chunks") or elem.chunks is None: - return da.from_array(elem, chunks=(chunk_size,) * len(elem.shape)) + if chunks is None: + chunks = (1000,) * len(elem.shape) + return da.from_array(elem, chunks=chunks) return da.from_array(elem) From fd2376afbefa7d69eb5bee7ed74466754560a3a7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 2 Jul 2024 15:58:16 +0200 Subject: [PATCH 048/348] (feat): expose `chunks` as an option to `read_elem_as_dask` via `dataset_kwargs` --- src/anndata/_io/h5ad.py | 2 +- src/anndata/_io/specs/lazy_methods.py | 80 +++++++++++++++++++++++---- src/anndata/_io/specs/methods.py | 34 ++++++------ src/anndata/_io/specs/registry.py | 40 +++++++++++--- src/anndata/_io/zarr.py | 2 +- src/anndata/experimental/merge.py | 2 +- src/anndata/tests/helpers.py | 2 +- tests/test_backed_sparse.py | 2 +- tests/test_io_dispatched.py | 8 +-- tests/test_io_elementwise.py | 76 +++++++++++++++++++++---- 10 files changed, 192 insertions(+), 56 deletions(-) diff --git a/src/anndata/_io/h5ad.py b/src/anndata/_io/h5ad.py index 098c139de..d3a9ef028 100644 --- a/src/anndata/_io/h5ad.py +++ b/src/anndata/_io/h5ad.py @@ -236,7 +236,7 @@ def read_h5ad( with h5py.File(filename, "r") as f: - def callback(func, elem_name: str, elem, iospec): + def callback(func, elem_name: str, elem, dataset_kwargs, iospec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): return AnnData( **{ diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 0f8bbbf58..6fbcb48ac 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -2,7 +2,8 @@ from contextlib import contextmanager from pathlib import Path, PurePosixPath -from typing import Literal, overload +from types import MappingProxyType +from typing import TYPE_CHECKING, Any, Literal, overload import h5py import numpy as np @@ -13,6 +14,9 @@ from .registry import _LAZY_REGISTRY, IOSpec +if TYPE_CHECKING: + from collections.abc import Mapping + @overload def make_index( @@ -56,11 +60,18 @@ def maybe_open_h5(path_or_group: Path | ZarrGroup, elem_name: str): file.close() +_DEFAULT_STRIDE = 1000 + + @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) -def read_sparse_as_dask(elem: H5Group | ZarrGroup, _reader, stride: int = 100): +def read_sparse_as_dask( + elem: H5Group | ZarrGroup, + _reader, + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), +): import dask.array as da path_or_group = Path(elem.file.filename) if isinstance(elem, H5Group) else elem @@ -71,6 +82,15 @@ def read_sparse_as_dask(elem: H5Group | ZarrGroup, _reader, stride: int = 100): 
dtype = elem["data"].dtype is_csc: bool = elem.attrs["encoding-type"] == "csc_matrix" + chunks = dataset_kwargs.get("chunks", None) + stride: int = _DEFAULT_STRIDE + if chunks is not None: + if len(chunks) != 2: + raise ValueError("`chunks` must be a tuple of two integers") + if chunks[int(not is_csc)] != shape[int(not is_csc)]: + raise ValueError("Only the major axis can be chunked") + stride = chunks[int(is_csc)] + def make_dask_chunk(block_id: tuple[int, int]): # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` # https://github.com/scverse/anndata/issues/1105 @@ -84,32 +104,68 @@ def make_dask_chunk(block_id: tuple[int, int]): shape_minor, shape_major = shape if is_csc else shape[::-1] n_strides, rest = np.divmod(shape_major, stride) - chunks_major = (stride,) * n_strides + (rest,) + chunks_major = (stride,) * n_strides + if rest > 0: + chunks_major += (rest,) chunks_minor = (shape_minor,) - chunks = (chunks_minor, chunks_major) if is_csc else (chunks_major, chunks_minor) + chunk_layout = ( + (chunks_minor, chunks_major) if is_csc else (chunks_major, chunks_minor) + ) memory_format = sparse.csc_matrix if is_csc else sparse.csr_matrix da_mtx = da.map_blocks( make_dask_chunk, dtype=dtype, - chunks=chunks, + chunks=chunk_layout, meta=memory_format((0, 0), dtype=np.float32), ) return da_mtx @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) -def read_h5_array(elem, _reader, chunks: tuple[int] | None = None): +def read_h5_array( + elem, + _reader, + chunks: tuple[int] | None = None, + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), +): import dask.array as da - if not hasattr(elem, "chunks") or elem.chunks is None: - if chunks is None: - chunks = (1000,) * len(elem.shape) - return da.from_array(elem, chunks=chunks) - return da.from_array(elem) + path = Path(elem.file.filename) + elem_name = elem.name + shape = elem.shape + dtype = elem.dtype + chunks: tuple[int, ...] 
= dataset_kwargs.get( + "chunks", (_DEFAULT_STRIDE,) * len(shape) + ) + + def make_dask_chunk(block_id: tuple[int, int]): + with maybe_open_h5(path, elem_name) as f: + idx = () + for i in range(len(shape)): + start = block_id[i] * chunks[i] + stop = min(((block_id[i] * chunks[i]) + chunks[i]), shape[i]) + idx += (slice(start, stop),) + return f[*idx] + + chunk_layout = () + for i in range(len(shape)): + n_strides, rest = np.divmod(shape[i], chunks[i]) + chunk = (chunks[i],) * n_strides + if rest > 0: + chunk += (rest,) + chunk_layout += (chunk,) + + return da.map_blocks( + make_dask_chunk, + dtype=dtype, + chunks=chunk_layout, + ) @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) -def read_zarr_array(elem, _reader): +def read_zarr_array( + elem, _reader, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}) +): import dask.array as da return da.from_zarr(elem) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 85bf6dddc..acea99bbf 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -109,7 +109,7 @@ def wrapper( @_REGISTRY.register_read(H5File, IOSpec("", "")) @_REGISTRY.register_read(H5Group, IOSpec("", "")) @_REGISTRY.register_read(H5Array, IOSpec("", "")) -def read_basic(elem, _reader): +def read_basic(elem, _reader, dataset_kwargs=MappingProxyType({})): from anndata._io import h5ad warn( @@ -129,7 +129,7 @@ def read_basic(elem, _reader): @_REGISTRY.register_read(ZarrGroup, IOSpec("", "")) @_REGISTRY.register_read(ZarrArray, IOSpec("", "")) -def read_basic_zarr(elem, _reader): +def read_basic_zarr(elem, _reader, dataset_kwargs=MappingProxyType({})): from anndata._io import zarr warn( @@ -265,7 +265,7 @@ def write_anndata(f, k, adata, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5File, IOSpec("raw", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("raw", "0.1.0")) -def read_anndata(elem, _reader): +def read_anndata(elem, _reader, dataset_kwargs=MappingProxyType({})): d = {} for k in [ "X", @@ -300,7 +300,7 @@ def write_raw(f, k, raw, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) -def read_mapping(elem, _reader): +def read_mapping(elem, _reader, dataset_kwargs=MappingProxyType({})): return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -374,7 +374,7 @@ def write_basic_dask_h5(f, k, elem, _writer, dataset_kwargs=MappingProxyType({}) @_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) -def read_array(elem, _reader): +def read_array(elem, _reader, dataset_kwargs=MappingProxyType({})): return elem[()] @@ -391,7 +391,7 @@ def read_zarr_array_partial(elem, *, items=None, indices=(slice(None, None))): # arrays of strings @_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) -def read_string_array(d, _reader): +def read_string_array(d, _reader, dataset_kwargs=MappingProxyType({})): return read_array(d.asstr(), _reader=_reader) @@ -460,7 +460,7 @@ def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray: @_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0")) -def read_recarray(d, _reader): +def read_recarray(d, _reader, dataset_kwargs=MappingProxyType({})): value 
= d[()] dtype = value.dtype value = _from_fixed_length_strings(value) @@ -620,7 +620,7 @@ def chunk_slice(start: int, stop: int) -> tuple[slice | None, slice | None]: @_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) -def read_sparse(elem, _reader): +def read_sparse(elem, _reader, dataset_kwargs=MappingProxyType({})): return sparse_dataset(elem).to_memory() @@ -658,7 +658,7 @@ def write_awkward(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0")) -def read_awkward(elem, _reader): +def read_awkward(elem, _reader, dataset_kwargs=MappingProxyType({})): from anndata.compat import awkward as ak form = _read_attr(elem.attrs, "form") @@ -720,7 +720,7 @@ def write_dataframe(f, key, df, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) -def read_dataframe(elem, _reader): +def read_dataframe(elem, _reader, dataset_kwargs=MappingProxyType({})): columns = list(_read_attr(elem.attrs, "column-order")) idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -761,7 +761,7 @@ def read_dataframe_partial( @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.1.0")) -def read_dataframe_0_1_0(elem, _reader): +def read_dataframe_0_1_0(elem, _reader, dataset_kwargs=MappingProxyType({})): columns = _read_attr(elem.attrs, "column-order") idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -825,7 +825,7 @@ def write_categorical(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) -def read_categorical(elem, _reader): +def read_categorical(elem, _reader, dataset_kwargs=MappingProxyType({})): return pd.Categorical.from_codes( codes=_reader.read_elem(elem["codes"]), categories=_reader.read_elem(elem["categories"]), @@ -869,7 +869,7 @@ def write_nullable_integer(f, k, v, _writer, dataset_kwargs=MappingProxyType({}) @_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0")) -def read_nullable_integer(elem, _reader): +def read_nullable_integer(elem, _reader, dataset_kwargs=MappingProxyType({})): if "mask" in elem: return pd.arrays.IntegerArray( _reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"]) @@ -880,7 +880,7 @@ def read_nullable_integer(elem, _reader): @_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0")) -def read_nullable_boolean(elem, _reader): +def read_nullable_boolean(elem, _reader, dataset_kwargs=MappingProxyType({})): if "mask" in elem: return pd.arrays.BooleanArray( _reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"]) @@ -896,7 +896,7 @@ def read_nullable_boolean(elem, _reader): @_REGISTRY.register_read(H5Array, IOSpec("numeric-scalar", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("numeric-scalar", "0.2.0")) -def read_scalar(elem, _reader): +def read_scalar(elem, _reader, dataset_kwargs=MappingProxyType({})): return elem[()] @@ -929,12 +929,12 @@ def 
write_hdf5_scalar(f, key, value, _writer, dataset_kwargs=MappingProxyType({} @_REGISTRY.register_read(H5Array, IOSpec("string", "0.2.0")) -def read_hdf5_string(elem, _reader): +def read_hdf5_string(elem, _reader, dataset_kwargs=MappingProxyType({})): return elem.asstr()[()] @_REGISTRY.register_read(ZarrArray, IOSpec("string", "0.2.0")) -def read_zarr_string(elem, _reader): +def read_zarr_string(elem, _reader, dataset_kwargs=MappingProxyType({})): return str(elem[()]) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 21f52beac..6bf2a1964 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -1,10 +1,12 @@ from __future__ import annotations +import inspect +import warnings from collections.abc import Mapping from dataclasses import dataclass from functools import singledispatch, wraps from types import MappingProxyType -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, TypedDict from anndata._io.utils import report_read_key_on_error, report_write_key_on_error from anndata.compat import DaskArray, _read_attr @@ -241,6 +243,7 @@ def read_elem( self, elem: StorageType, modifiers: frozenset[str] = frozenset(), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> Any: """Read an element from a store. See exported function for more details.""" from functools import partial @@ -251,8 +254,16 @@ def read_elem( _reader=self, ) if self.callback is None: - return read_func(elem) - return self.callback(read_func, elem.name, elem, iospec=iospec) + return read_func(elem, dataset_kwargs=dataset_kwargs) + if "dataset_kwargs" not in inspect.getfullargspec(self.callback)[0]: + warnings.warn( + "Callback does not accept dataset_kwargs. Ignoring dataset_kwargs.", + stacklevel=2, + ) + return self.callback(read_func, elem.name, elem, iospec=iospec) + return self.callback( + read_func, elem.name, elem, dataset_kwargs=dataset_kwargs, iospec=iospec + ) class Writer: @@ -331,19 +342,34 @@ def read_elem(elem: StorageType) -> Any: return Reader(_REGISTRY).read_elem(elem) -def read_elem_as_dask(elem: StorageType) -> DaskArray: +class DaskKwargs(TypedDict): + chunks: tuple[int, ...] + + +def read_elem_as_dask( + elem: StorageType, dataset_kwargs: DaskKwargs | None = None +) -> DaskArray: """ Read an element from a store lazily. Assumes that the element is encoded using the anndata encoding. This function will determine the encoded type using the encoding metadata stored in elem's attributes. - Params - ------ + + Parameters + ---------- elem The stored element. + dataset_kwargs, optional + Keyword arguments for dask array creation. Only `chunks` is supported with `n` elements, the same `n` as the size of the array. 
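+        For sparse matrices, only the major axis may be chunked; the minor-axis
+        entry of `chunks` must equal the full minor-axis extent.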
+ + Returns + ------- + DaskArray """ - return Reader(_LAZY_REGISTRY).read_elem(elem) + return Reader(_LAZY_REGISTRY).read_elem( + elem, dataset_kwargs=dataset_kwargs if dataset_kwargs is not None else {} + ) def write_elem( diff --git a/src/anndata/_io/zarr.py b/src/anndata/_io/zarr.py index 0e015244a..9d6f759ff 100644 --- a/src/anndata/_io/zarr.py +++ b/src/anndata/_io/zarr.py @@ -66,7 +66,7 @@ def read_zarr(store: str | Path | MutableMapping | zarr.Group) -> AnnData: f = zarr.open(store, mode="r") # Read with handling for backwards compat - def callback(func, elem_name: str, elem, iospec): + def callback(func, elem_name: str, elem, dataset_kwargs, iospec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): return AnnData( **{ diff --git a/src/anndata/experimental/merge.py b/src/anndata/experimental/merge.py index aa6f47e9b..f998d6c79 100644 --- a/src/anndata/experimental/merge.py +++ b/src/anndata/experimental/merge.py @@ -130,7 +130,7 @@ def read_as_backed(group: ZarrGroup | H5Group): BaseCompressedSparseDataset, Array or EAGER_TYPES are encountered. """ - def callback(func, elem_name: str, elem, iospec): + def callback(func, elem_name: str, elem, dataset_kwargs, iospec): if iospec.encoding_type in SPARSE_MATRIX: return sparse_dataset(elem) elif iospec.encoding_type in EAGER_TYPES: diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index d4b9a38be..91d8cdbcd 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -494,7 +494,7 @@ def assert_equal_cupy(a, b, exact=False, elem_name=None): def assert_equal_ndarray(a, b, exact=False, elem_name=None): b = asarray(b) if not exact and is_numeric_dtype(a) and is_numeric_dtype(b): - assert a.shape == b.shape, format_msg(elem_name) + assert a.shape == b.shape, (a.shape, b.shape) np.testing.assert_allclose(a, b, equal_nan=True, err_msg=format_msg(elem_name)) elif ( # Structured dtype not exact diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index cc0468230..7538cc121 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -64,7 +64,7 @@ def read_zarr_backed(path): f = zarr.open(path, mode="r") # Read with handling for backwards compat - def callback(func, elem_name, elem, iospec): + def callback(func, elem_name, elem, iospec, dataset_kwargs): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): return AnnData( **{k: read_dispatched(v, callback) for k, v in elem.items()} diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py index 833b23e83..c091fa8ac 100644 --- a/tests/test_io_dispatched.py +++ b/tests/test_io_dispatched.py @@ -18,7 +18,7 @@ def test_read_dispatched_w_regex(): - def read_only_axis_dfs(func, elem_name: str, elem, iospec): + def read_only_axis_dfs(func, elem_name: str, elem, iospec, dataset_kwargs): if iospec.encoding_type == "anndata": return func(elem) elif re.match(r"^/((obs)|(var))?(/.*)?$", elem_name): @@ -40,7 +40,7 @@ def read_only_axis_dfs(func, elem_name: str, elem, iospec): def test_read_dispatched_dask(): import dask.array as da - def read_as_dask_array(func, elem_name: str, elem, iospec): + def read_as_dask_array(func, elem_name: str, elem, iospec, dataset_kwargs): if iospec.encoding_type in { "dataframe", "csr_matrix", @@ -162,11 +162,11 @@ def zarr_writer(func, store, k, elem, dataset_kwargs, iospec): zarr_write_keys.append(k) func(store, k, elem, dataset_kwargs=dataset_kwargs) - def h5ad_reader(func, elem_name: str, elem, iospec): + def h5ad_reader(func, elem_name: str, elem, iospec, 
dataset_kwargs): h5ad_read_keys.append(elem_name) return func(elem) - def zarr_reader(func, elem_name: str, elem, iospec): + def zarr_reader(func, elem_name: str, elem, iospec, dataset_kwargs): zarr_read_keys.append(elem_name) return func(elem) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index 204b4734f..5927536a4 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -64,8 +64,8 @@ def sparse_format(request): return request.param -def create_dense_store(store): - X = np.random.randn(SIZE, SIZE) +def create_dense_store(store, n_dims: int = 2): + X = np.random.randn(*((SIZE,) * n_dims)) write_elem(store, "X", X) return store @@ -215,12 +215,8 @@ def test_dask_write_sparse(sparse_format, store): assert x_sparse_store["X_dask/indices"].dtype == np.int64 -@pytest.mark.parametrize("arr_type", ["dense", *sparse_formats]) -def test_read_lazy_2d_dask(arr_type, store): - if arr_type == "dense": - arr_store = create_dense_store(store) - else: - arr_store = create_sparse_store(arr_type, store) +def test_read_lazy_2d_dask(sparse_format, store): + arr_store = create_sparse_store(sparse_format, store) X_dask_from_disk = read_elem_as_dask(arr_store["X"]) X_from_disk = read_elem(arr_store["X"]) @@ -233,9 +229,28 @@ def test_read_lazy_2d_dask(arr_type, store): assert_equal(X_from_disk[index, :], X_dask_from_disk[index, :]) assert_equal(X_from_disk[:, index], X_dask_from_disk[:, index]) - if arr_type in {"csr", "csc"}: - assert arr_store["X_dask/indptr"].dtype == np.int64 - assert arr_store["X_dask/indices"].dtype == np.int64 + assert arr_store["X_dask/indptr"].dtype == np.int64 + assert arr_store["X_dask/indices"].dtype == np.int64 + + +@pytest.mark.parametrize( + ("n_dims", "chunks"), + [(1, (100,)), (1, (400,)), (2, (100, 100)), (2, (400, 400)), (2, (200, 400))], +) +def test_read_lazy_nd_dask(store, n_dims, chunks): + arr_store = create_dense_store(store, n_dims) + X_dask_from_disk = read_elem_as_dask( + arr_store["X"], dataset_kwargs=dict(chunks=chunks) + ) + X_from_disk = read_elem(arr_store["X"]) + assert_equal(X_from_disk, X_dask_from_disk) + + random_int_indices = np.random.randint(0, SIZE, (SIZE // 10,)) + random_int_indices.sort() + random_bool_mask = np.random.randn(SIZE) > 0 + index_slice = slice(0, SIZE // 10) + for index in [random_int_indices, index_slice, random_bool_mask]: + assert_equal(X_from_disk[index], X_dask_from_disk[index]) def test_read_lazy_h5_cluster(sparse_format, tmp_path): @@ -254,6 +269,45 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path): assert_equal(X_from_disk, X_dask_from_disk) +@pytest.mark.parametrize( + ("arr_type", "chunks"), + [("dense", (100, 100)), ("csc", (SIZE, 10)), ("csr", (10, SIZE))], +) +def test_read_lazy_h5_chunk_kwargs(arr_type, chunks, tmp_path): + import dask.distributed as dd + + file = h5py.File(tmp_path / "test.h5", "w") + store = file["/"] + if arr_type == "dense": + arr_store = create_dense_store(store) + X_dask_from_disk = read_elem_as_dask( + arr_store["X"], dataset_kwargs=dict(chunks=chunks) + ) + else: + arr_store = create_sparse_store(arr_type, store) + X_dask_from_disk = read_elem_as_dask( + arr_store["X"], dataset_kwargs=dict(chunks=chunks) + ) + X_from_disk = read_elem(arr_store["X"]) + file.close() + with ( + dd.LocalCluster(n_workers=1, threads_per_worker=1) as cluster, + dd.Client(cluster) as _client, + ): + assert_equal(X_from_disk, X_dask_from_disk) + + +def test_read_lazy_h5_bad_chunk_kwargs(tmp_path): + arr_type = "csr" + file = h5py.File(tmp_path / "test.h5", "w") + 
store = file["/"]
+    arr_store = create_sparse_store(arr_type, store)
+    with pytest.raises(ValueError, match=r"`chunks` must be a tuple of two integers"):
+        read_elem_as_dask(arr_store["X"], dataset_kwargs=dict(chunks=(SIZE,)))
+    with pytest.raises(ValueError, match=r"Only the major axis can be chunked"):
+        read_elem_as_dask(arr_store["X"], dataset_kwargs=dict(chunks=(SIZE, 10)))
+
+
 @pytest.mark.parametrize("sparse_format", ["csr", "csc"])
 def test_write_indptr_dtype_override(store, sparse_format):
     X = sparse.random(

From 42b10938d77a061bff98c50a5efc76c6192f9c9e Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 2 Jul 2024 16:50:14 +0200
Subject: [PATCH 049/348] (fix): `test_read_dispatched_null_case` test

---
 tests/test_io_dispatched.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py
index c091fa8ac..5dbb1229c 100644
--- a/tests/test_io_dispatched.py
+++ b/tests/test_io_dispatched.py
@@ -76,7 +76,7 @@ def test_read_dispatched_null_case():
     write_elem(z, "/", adata)

     expected = read_elem(z)
-    actual = read_dispatched(z, lambda _, __, x, **___: read_elem(x))
+    actual = read_dispatched(z, lambda _, __, x, ___, ____: read_elem(x))

     assert_equal(expected, actual)

From 78de057b3ab6786531b11f66b4bd0963485310cf Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 2 Jul 2024 16:51:07 +0200
Subject: [PATCH 050/348] (fix): replace disallowed `f[*idx]` subscript
 unpacking with `f[idx]`

---
 src/anndata/_io/specs/lazy_methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 6fbcb48ac..b977cd5ef 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -145,7 +145,7 @@ def make_dask_chunk(block_id: tuple[int, int]):
             start = block_id[i] * chunks[i]
             stop = min(((block_id[i] * chunks[i]) + chunks[i]), shape[i])
             idx += (slice(start, stop),)
-        return f[*idx]
+        return f[idx]

     chunk_layout = ()
     for i in range(len(shape)):

From 717b997d0e33ddae066f72cc6495cdb64b88d175 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 2 Jul 2024 16:56:48 +0200
Subject: [PATCH 051/348] (refactor): reuse `compute_chunk_layout_for_axis_shape` functionality

---
 src/anndata/_io/specs/lazy_methods.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index b977cd5ef..596c07575 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -63,6 +63,16 @@ def maybe_open_h5(path_or_group: Path | ZarrGroup, elem_name: str):
 _DEFAULT_STRIDE = 1000


+def compute_chunk_layout_for_axis_shape(
+    chunk_axis_shape: int, full_axis_shape: int
+) -> tuple[int, ...]:
+    n_strides, rest = np.divmod(full_axis_shape, chunk_axis_shape)
+    chunk = (chunk_axis_shape,) * n_strides
+    if rest > 0:
+        chunk += (rest,)
+    return chunk
+
+
 @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0"))
 @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0"))
 @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0"))
@@ -103,10 +113,7 @@ def make_dask_chunk(block_id: tuple[int, int]):
         return chunk

     shape_minor, shape_major = shape if is_csc else shape[::-1]
-    n_strides, rest = np.divmod(shape_major, stride)
-    chunks_major = (stride,) * n_strides
-    if rest > 0:
-        chunks_major += (rest,)
+    chunks_major = compute_chunk_layout_for_axis_shape(stride, shape_major)
     chunks_minor = (shape_minor,)
     chunk_layout = (
(chunks_minor, chunks_major) if is_csc else (chunks_major, chunks_minor) @@ -147,13 +154,10 @@ def make_dask_chunk(block_id: tuple[int, int]): idx += (slice(start, stop),) return f[idx] - chunk_layout = () - for i in range(len(shape)): - n_strides, rest = np.divmod(shape[i], chunks[i]) - chunk = (chunks[i],) * n_strides - if rest > 0: - chunk += (rest,) - chunk_layout += (chunk,) + chunk_layout = tuple( + compute_chunk_layout_for_axis_shape(chunks[i], shape[i]) + for i in range(len(shape)) + ) return da.map_blocks( make_dask_chunk, From 14a24401e72b40bd2cf57309c68d6e82d2ca8838 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 2 Jul 2024 17:11:13 +0200 Subject: [PATCH 052/348] (fix): upgrade `xarray` for new version --- pyproject.toml | 2 +- tests/test_read_backed_experimental.py | 135 ------------------------- 2 files changed, 1 insertion(+), 136 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7e7f1a303..06821c82f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,7 +100,7 @@ test = [ "pytest-mock" ] gpu = ["cupy"] -xarray = ["xarray@git+https://github.com/ilan-gold/xarray#egg=ig/fix_equality_checl"] +xarray = ["xarray"] [tool.hatch.version] source = "vcs" diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py index 7633ec859..9b1a77069 100644 --- a/tests/test_read_backed_experimental.py +++ b/tests/test_read_backed_experimental.py @@ -5,15 +5,10 @@ import numpy as np import pandas as pd import pytest -import zarr from scipy import sparse from anndata._core.anndata import AnnData from anndata.experimental import read_backed -from anndata.experimental.backed._lazy_arrays import ( - CategoricalArray, - MaskedArray, -) from anndata.tests.helpers import ( AccessTrackingStore, as_dense_dask_array, @@ -41,136 +36,6 @@ def dskfmt(request): return request.param -@pytest.fixture() -def categorical_lazy_arr(tmp_path_factory): - base_path = tmp_path_factory.getbasetemp() - z = zarr.open_group(base_path, mode="w") - z["codes"] = np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2]) - z["categories"] = np.array(["foo", "bar", "jazz"]) - z.attrs["ordered"] = False - z = zarr.open(base_path) - return CategoricalArray( - codes=z["codes"], - categories=z["categories"], - ordered=z.attrs["ordered"], - drop_unused_categories=True, - ) - - -@pytest.fixture() -def nullable_boolean_lazy_arr(tmp_path_factory): - base_path = tmp_path_factory.getbasetemp() - z = zarr.open_group(base_path, mode="w") - z["values"] = np.array( - [ - True, - False, - True, - False, - False, - True, - False, - False, - True, - True, - False, - False, - False, - True, - False, - True, - ] - ) - z["mask"] = np.array( - [ - True, - True, - True, - True, - True, - False, - False, - True, - False, - True, - True, - True, - True, - False, - True, - False, - ] - ) - z = zarr.open(base_path) - return MaskedArray(values=z["values"], mask=z["mask"], dtype_str="nullable-boolean") - - -@pytest.fixture() -def nullable_boolean_lazy_arr_no_mask(tmp_path_factory): - base_path = tmp_path_factory.getbasetemp() - z = zarr.open_group(base_path, mode="w") - z["values"] = np.array( - [ - True, - False, - True, - False, - False, - True, - False, - False, - True, - True, - False, - False, - False, - True, - False, - True, - ] - ) - z = zarr.open(base_path) - return MaskedArray(values=z["values"], mask=None, dtype_str="nullable-boolean") - - -@pytest.fixture() -def nullable_integer_lazy_arr(tmp_path_factory): - base_path = tmp_path_factory.getbasetemp() - z = zarr.open_group(base_path, 
mode="w")
-    z["values"] = np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2])
-    z["mask"] = np.array(
-        [
-            True,
-            True,
-            True,
-            True,
-            True,
-            False,
-            False,
-            True,
-            False,
-            True,
-            True,
-            True,
-            True,
-            False,
-            True,
-            False,
-        ]
-    )
-    z = zarr.open(base_path)
-    return MaskedArray(values=z["values"], mask=z["mask"], dtype_str="nullable-integer")
-
-
-@pytest.fixture()
-def nullable_integer_lazy_arr_no_mask(tmp_path_factory):
-    base_path = tmp_path_factory.getbasetemp()
-    z = zarr.open_group(base_path, mode="w")
-    z["values"] = np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2])
-    z = zarr.open(base_path)
-    return MaskedArray(values=z["values"], mask=None, dtype_str="nullable-integer")
-
-
 def test_access_count_obs_var(tmp_path, mtx_format):
     base_pth = Path(tmp_path)
     orig_pth = base_pth / "orig.zarr"

From fdf072748c31a75e3129cd15922626a4d45d0c82 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 2 Jul 2024 17:58:19 +0200
Subject: [PATCH 053/348] (chore): get tests passing again; allow `Dataset2D`
 as a storage type

---
 src/anndata/_core/storage.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/anndata/_core/storage.py b/src/anndata/_core/storage.py
index 86eea0a10..72207c6ef 100644
--- a/src/anndata/_core/storage.py
+++ b/src/anndata/_core/storage.py
@@ -71,12 +71,23 @@ def coerce_array(
     allow_df: bool = False,
     allow_array_like: bool = False,
 ):
+    try:
+        # Needs to be done here to prevent circular imports, and StorageType is immutable
+        from anndata.experimental.backed._xarray import Dataset2D
+    except ImportError:
+
+        class Dataset2D:
+            @staticmethod
+            def __repr__():
+                return "mock anndata.experimental.backed._xarray.Dataset2D"
+
     """Coerce arrays stored in layers/X, and aligned arrays ({obs,var}{m,p})."""
    # If value is a scalar and we allow that, return it
    if allow_array_like and np.isscalar(value):
        return value
    # If value is one of the allowed types, return it
-    if isinstance(value, StorageType.classes()):
+
+    # `Dataset2D` is checked separately since it cannot be added to `StorageType`
+    # without the circular import worked around above
+    if isinstance(value, StorageType.classes()) or isinstance(value, Dataset2D):
        if isinstance(value, np.matrix):
            msg = f"{name} should not be a np.matrix, use np.ndarray instead."
warnings.warn(msg, ImplicitModificationWarning) From 2b86293dce90780d4d98f09f51e2fa2eec541813 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 3 Jul 2024 10:09:14 +0200 Subject: [PATCH 054/348] (fix): remove unneeded `slice` arguments --- src/anndata/_io/specs/lazy_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 596c07575..a3a29ac15 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -44,7 +44,7 @@ def make_index( min((block_id[is_csc] * stride) + stride, shape[0]), ) if is_csc: - return (slice(None, None, None), index1d) + return (slice(None), index1d) return (index1d,) From 8d5a9df7c5a0d11b50fba7bb1386f33cf41efb87 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 3 Jul 2024 10:09:24 +0200 Subject: [PATCH 055/348] (fix): revert message --- src/anndata/tests/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index 91d8cdbcd..d4b9a38be 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -494,7 +494,7 @@ def assert_equal_cupy(a, b, exact=False, elem_name=None): def assert_equal_ndarray(a, b, exact=False, elem_name=None): b = asarray(b) if not exact and is_numeric_dtype(a) and is_numeric_dtype(b): - assert a.shape == b.shape, (a.shape, b.shape) + assert a.shape == b.shape, format_msg(elem_name) np.testing.assert_allclose(a, b, equal_nan=True, err_msg=format_msg(elem_name)) elif ( # Structured dtype not exact From 449fc1a6f7cd3234d910c2cd4e2e2edbf2e87826 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 3 Jul 2024 10:10:22 +0200 Subject: [PATCH 056/348] (refactor): `make_index` -> `make_block_indexer` --- src/anndata/_io/specs/lazy_methods.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index a3a29ac15..db2f35dfc 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -19,7 +19,7 @@ @overload -def make_index( +def make_block_indexer( *, is_csc: Literal[True], stride: int, @@ -27,7 +27,7 @@ def make_index( block_id: tuple[int, int], ) -> tuple[slice, slice]: ... @overload -def make_index( +def make_block_indexer( *, is_csc: Literal[False], stride: int, @@ -36,7 +36,7 @@ def make_index( ) -> tuple[slice]: ... 
-def make_index(
+def make_block_indexer(
     *, is_csc: bool, stride: int, shape: tuple[int, int], block_id: tuple[int, int]
 ) -> tuple[slice, slice] | tuple[slice]:
     index1d = slice(
@@ -106,7 +106,7 @@ def make_dask_chunk(block_id: tuple[int, int]):
         # https://github.com/scverse/anndata/issues/1105
         with maybe_open_h5(path_or_group, elem_name) as f:
             mtx = ad.experimental.sparse_dataset(f)
-            index = make_index(
+            index = make_block_indexer(
                 is_csc=is_csc, stride=stride, shape=shape, block_id=block_id
             )
             chunk = mtx[index]

From 1522de334c517a79df9f9bab032568366e2e8fac Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 3 Jul 2024 10:11:20 +0200
Subject: [PATCH 057/348] (fix): export from `experimental`

---
 src/anndata/experimental/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py
index 486f14e8d..993e26b28 100644
--- a/src/anndata/experimental/__init__.py
+++ b/src/anndata/experimental/__init__.py
@@ -1,7 +1,7 @@
 from __future__ import annotations

 from anndata._core.sparse_dataset import CSCDataset, CSRDataset, sparse_dataset
-from anndata._io.specs import IOSpec, read_elem, write_elem
+from anndata._io.specs import IOSpec, read_elem, read_elem_as_dask, write_elem

 from ._dispatch_io import read_dispatched, write_dispatched
 from .merge import concat_on_disk
@@ -13,6 +13,7 @@
     "AnnLoader",
     "read_elem",
     "write_elem",
+    "read_elem_as_dask",
     "read_dispatched",
     "write_dispatched",
     "IOSpec",

From 71c150da759a2444826d0caceab803621a7ddd7a Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 3 Jul 2024 10:20:47 +0200
Subject: [PATCH 058/348] (fix): `callback` signature for
 `test_read_dispatched_null_case`

---
 tests/test_io_dispatched.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py
index 5dbb1229c..75f6b0033 100644
--- a/tests/test_io_dispatched.py
+++ b/tests/test_io_dispatched.py
@@ -76,7 +76,11 @@ def test_read_dispatched_null_case():
     write_elem(z, "/", adata)

     expected = read_elem(z)
-    actual = read_dispatched(z, lambda _, __, x, ___, ____: read_elem(x))
+
+    def callback(read_func, elem_name, x, dataset_kwargs, iospec):
+        return read_elem(x)
+
+    actual = read_dispatched(z, callback)

     assert_equal(expected, actual)

From b441366a7d16728aa2f6cbe12eadaa6c7c7ea292 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 3 Jul 2024 10:23:53 +0200
Subject: [PATCH 059/348] (chore): `get_elem_name` helper

---
 src/anndata/_io/specs/lazy_methods.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index db2f35dfc..83a19aab1 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 from contextlib import contextmanager
+from functools import singledispatch
 from pathlib import Path, PurePosixPath
 from types import MappingProxyType
 from typing import TYPE_CHECKING, Any, Literal, overload
@@ -73,6 +74,21 @@ def compute_chunk_layout_for_axis_shape(
     return chunk


+@singledispatch
+def get_elem_name(x):
+    raise NotImplementedError(f"Not implemented for {type(x)}")
+
+
+@get_elem_name.register(h5py.Group)
+def _(x):
+    return x.name
+
+
+@get_elem_name.register(ZarrArray)
+def _(x):
+    return PurePosixPath(x.path).name
+
+
 @_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0"))
 @_REGISTRY.register_read(H5Group,
IOSpec("csr_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @@ -85,9 +101,7 @@ def read_sparse_as_dask( import dask.array as da path_or_group = Path(elem.file.filename) if isinstance(elem, H5Group) else elem - elem_name = ( - elem.name if isinstance(elem, H5Group) else PurePosixPath(elem.path).name - ) + elem_name = get_elem_name(elem) shape: tuple[int, int] = elem.attrs["shape"] dtype = elem["data"].dtype is_csc: bool = elem.attrs["encoding-type"] == "csc_matrix" From 0307a1dde3c8cab5d6d0f5d7363ee488902e5c74 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 3 Jul 2024 10:24:34 +0200 Subject: [PATCH 060/348] (chore): use `H5Group` consistently --- src/anndata/_io/specs/lazy_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 83a19aab1..4cc9b3d97 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -79,7 +79,7 @@ def get_elem_name(x): raise NotImplementedError(f"Not implemented for {type(x)}") -@get_elem_name.register(h5py.Group) +@get_elem_name.register(H5Group) def _(x): return x.name From ee075cd353443c088b2bf2423c1c1ed9269ca5c9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 3 Jul 2024 10:40:57 +0200 Subject: [PATCH 061/348] (refactor): make `chunks` public facing API instead of `dataset_kwargs` --- src/anndata/_io/specs/registry.py | 17 ++++++----------- tests/test_io_elementwise.py | 16 +++++----------- 2 files changed, 11 insertions(+), 22 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 6bf2a1964..f5fee7f27 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -6,7 +6,7 @@ from dataclasses import dataclass from functools import singledispatch, wraps from types import MappingProxyType -from typing import TYPE_CHECKING, Any, TypedDict +from typing import TYPE_CHECKING, Any from anndata._io.utils import report_read_key_on_error, report_write_key_on_error from anndata.compat import DaskArray, _read_attr @@ -342,12 +342,8 @@ def read_elem(elem: StorageType) -> Any: return Reader(_REGISTRY).read_elem(elem) -class DaskKwargs(TypedDict): - chunks: tuple[int, ...] - - def read_elem_as_dask( - elem: StorageType, dataset_kwargs: DaskKwargs | None = None + elem: StorageType, chunks: tuple[int, ...] | None = None ) -> DaskArray: """ Read an element from a store lazily. @@ -360,16 +356,15 @@ def read_elem_as_dask( ---------- elem The stored element. - dataset_kwargs, optional - Keyword arguments for dask array creation. Only `chunks` is supported with `n` elements, the same `n` as the size of the array. + chunks, optional + length `n`, the same `n` as the size of the underlying array. + Note that the minor axis dimension must match the shape for sparse. 
Returns
     -------
     DaskArray
     """
-    return Reader(_LAZY_REGISTRY).read_elem(
-        elem, dataset_kwargs=dataset_kwargs if dataset_kwargs is not None else {}
-    )
+    return Reader(_LAZY_REGISTRY).read_elem(elem, dataset_kwargs={"chunks": chunks})


 def write_elem(
diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index 5927536a4..692f21452 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -239,9 +239,7 @@ def test_read_lazy_2d_dask(sparse_format, store):
 )
 def test_read_lazy_nd_dask(store, n_dims, chunks):
     arr_store = create_dense_store(store, n_dims)
-    X_dask_from_disk = read_elem_as_dask(
-        arr_store["X"], dataset_kwargs=dict(chunks=chunks)
-    )
+    X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks)
     X_from_disk = read_elem(arr_store["X"])
     assert_equal(X_from_disk, X_dask_from_disk)

@@ -280,14 +278,10 @@ def test_read_lazy_h5_chunk_kwargs(arr_type, chunks, tmp_path):
     store = file["/"]
     if arr_type == "dense":
         arr_store = create_dense_store(store)
-        X_dask_from_disk = read_elem_as_dask(
-            arr_store["X"], dataset_kwargs=dict(chunks=chunks)
-        )
+        X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks)
     else:
         arr_store = create_sparse_store(arr_type, store)
-        X_dask_from_disk = read_elem_as_dask(
-            arr_store["X"], dataset_kwargs=dict(chunks=chunks)
-        )
+        X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks)
     X_from_disk = read_elem(arr_store["X"])
     file.close()
     with (
@@ -303,9 +297,9 @@ def test_read_lazy_h5_bad_chunk_kwargs(tmp_path):
     store = file["/"]
     arr_store = create_sparse_store(arr_type, store)
     with pytest.raises(ValueError, match=r"`chunks` must be a tuple of two integers"):
-        read_elem_as_dask(arr_store["X"], dataset_kwargs=dict(chunks=(SIZE,)))
+        read_elem_as_dask(arr_store["X"], chunks=(SIZE,))
     with pytest.raises(ValueError, match=r"Only the major axis can be chunked"):
-        read_elem_as_dask(arr_store["X"], dataset_kwargs=dict(chunks=(SIZE, 10)))
+        read_elem_as_dask(arr_store["X"], chunks=(SIZE, 10))


 @pytest.mark.parametrize("sparse_format", ["csr", "csc"])

From 89acec41ad2999466f2033f1bd10930d28ac343e Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 3 Jul 2024 10:41:34 +0200
Subject: [PATCH 062/348] (fix): register for group not array

---
 src/anndata/_io/specs/lazy_methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 4cc9b3d97..24b556dd7 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -84,7 +84,7 @@ def _(x):
     return x.name


-@get_elem_name.register(ZarrArray)
+@get_elem_name.register(ZarrGroup)
 def _(x):
     return PurePosixPath(x.path).name


From 48b763076ecacefea7801d6603697605aa71ed79 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 3 Jul 2024 11:22:16 +0200
Subject: [PATCH 063/348] (chore): add warning test

---
 tests/test_io_dispatched.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py
index 75f6b0033..521ff0ad9 100644
--- a/tests/test_io_dispatched.py
+++ b/tests/test_io_dispatched.py
@@ -3,6 +3,7 @@
 import re

 import h5py
+import pytest
 import zarr

 from scipy import sparse
@@ -85,6 +86,21 @@
     assert_equal(expected, actual)


+def test_read_dispatched_warns_with_no_dataset_kwargs():
+    adata = gen_adata((100, 100))
+    z = zarr.group()
+    write_elem(z, "/", adata)
+
+    def callback(read_func, elem_name, x, iospec):
+
return read_elem(x) + + with pytest.warns( + UserWarning, + match="Callback does not accept dataset_kwargs. Ignoring dataset_kwargs.", + ): + read_dispatched(z, callback) + + def test_write_dispatched_chunks(): from itertools import chain, repeat From 8712582a5def5426bd809319ac8da31050de13f8 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 3 Jul 2024 11:48:10 +0200 Subject: [PATCH 064/348] (chore): make arg order consistent --- tests/test_io_dispatched.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py index 521ff0ad9..395e942c3 100644 --- a/tests/test_io_dispatched.py +++ b/tests/test_io_dispatched.py @@ -182,11 +182,11 @@ def zarr_writer(func, store, k, elem, dataset_kwargs, iospec): zarr_write_keys.append(k) func(store, k, elem, dataset_kwargs=dataset_kwargs) - def h5ad_reader(func, elem_name: str, elem, iospec, dataset_kwargs): + def h5ad_reader(func, elem_name: str, elem, dataset_kwargs, iospec): h5ad_read_keys.append(elem_name) return func(elem) - def zarr_reader(func, elem_name: str, elem, iospec, dataset_kwargs): + def zarr_reader(func, elem_name: str, elem, dataset_kwargs, iospec): zarr_read_keys.append(elem_name) return func(elem) From 699db74da83a40ca1f63c0149a080cc9c3dc06b1 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 3 Jul 2024 12:04:02 +0200 Subject: [PATCH 065/348] (fix): test installations --- pyproject.toml | 5 +-- src/anndata/experimental/backed/_io.py | 6 +++- .../experimental/backed/_lazy_arrays.py | 33 +++++++++++++++++-- 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 02d17d14d..becf17bb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,13 +97,14 @@ test = [ "awkward>=2.3", "pyarrow", "pytest_memray", - "pytest-mock" + "pytest-mock", + "xarray>=2024.06.0" ] gpu = [ "cupy", "numpy<2.0.0", ] -xarray = ["xarray"] +xarray = ["xarray>=2024.06.0"] [tool.hatch.version] diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index 3067ff6a2..c092be88d 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -11,7 +11,11 @@ import dask.array as da import h5py -import xarray as xr + +try: + import xarray as xr +except ImportError: + xr = None import zarr from ..._core.anndata import AnnData diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 154d610fc..04aaa2d35 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -4,7 +4,34 @@ from typing import Generic, TypeVar, Union import pandas as pd -import xarray as xr + +try: + import xarray as xr +except ImportError: + + class xr: + @property + def DataArray(self): + return None + + +try: + from xarray.backends.zarr import ZarrArrayWrapper +except ImportError: + + class ZarrArrayWrapper: + def __repr__(self) -> str: + return "mock ZarrArrayWrapper" + + +try: + from xarray.backends import BackendArray +except ImportError: + + class BackendArray: + def __repr__(self) -> str: + return "mock BackendArray" + from anndata._core.index import Index, _subset from anndata._core.views import as_view @@ -13,7 +40,7 @@ K = TypeVar("K", bound=Union[H5Array, ZarrArray]) -class ZarrOrHDF5Wrapper(xr.backends.zarr.ZarrArrayWrapper, Generic[K]): +class ZarrOrHDF5Wrapper(ZarrArrayWrapper, Generic[K]): @singledispatchmethod # type: ignore def __init__(self, array: ZarrArray): return 
super().__init__(array)
@@ -84,7 +111,7 @@ def __getitem__(
         return xr.core.extension_array.PandasExtensionArray(categorical_array)


-class MaskedArray(xr.backends.BackendArray):
+class MaskedArray(BackendArray):
     def __init__(
         self,
         values: ZarrArray | H5Array,

From 61f42a12a337ddf48ebd6659c3f2e784b5323ae7 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 3 Jul 2024 12:17:53 +0200
Subject: [PATCH 066/348] (fix): `dims` -> `sizes`

---
 src/anndata/experimental/backed/_xarray.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py
index 3513a8035..48f1dd8e3 100644
--- a/src/anndata/experimental/backed/_xarray.py
+++ b/src/anndata/experimental/backed/_xarray.py
@@ -17,9 +17,9 @@

 def get_index_dim(ds):
     assert (
-        len(ds.dims) == 1
+        len(ds.sizes) == 1
     ), f"xarray Dataset should not have more than 1 dims, found {len(ds)}"
-    return list(ds.dims.keys())[0]
+    return list(ds.sizes.keys())[0]


 class Dataset2D(xr.Dataset):

From e1b7d0725834d72e9586d60da7e11f278b63d027 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 3 Jul 2024 12:21:29 +0200
Subject: [PATCH 067/348] (fix): no `xarray` in min deps

---
 ci/scripts/min-deps.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ci/scripts/min-deps.py b/ci/scripts/min-deps.py
index a7482e70e..241f53dd6 100755
--- a/ci/scripts/min-deps.py
+++ b/ci/scripts/min-deps.py
@@ -62,6 +62,8 @@ def extract_min_deps(
             for extra in req.extras:
                 extra_deps = pyproject["project"]["optional-dependencies"][extra]
                 dependencies += map(Requirement, extra_deps)
+        if req.name == "xarray":
+            continue  # xarray requires too high a version of pandas and is experimental anyway
         else:
             yield min_dep(req)

From 8028bce6b28ed75efb895a52b69519d6f355d0d1 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 3 Jul 2024 12:32:54 +0200
Subject: [PATCH 068/348] (fix): `elif` for reqname

---
 ci/scripts/min-deps.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/scripts/min-deps.py b/ci/scripts/min-deps.py
index 241f53dd6..878a86566 100755
--- a/ci/scripts/min-deps.py
+++ b/ci/scripts/min-deps.py
@@ -62,7 +62,7 @@ def extract_min_deps(
             for extra in req.extras:
                 extra_deps = pyproject["project"]["optional-dependencies"][extra]
                 dependencies += map(Requirement, extra_deps)
-        if req.name == "xarray":
+        elif req.name == "xarray":
             continue  # xarray requires too high a version of pandas and is experimental anyway
         else:
             yield min_dep(req)

From c3cd0e69e72c3c6f785bbbf911f78d23d210bf2e Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 3 Jul 2024 12:49:43 +0200
Subject: [PATCH 069/348] (fix): import

---
 src/anndata/experimental/backed/_lazy_arrays.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py
index 04aaa2d35..842431265 100644
--- a/src/anndata/experimental/backed/_lazy_arrays.py
+++ b/src/anndata/experimental/backed/_lazy_arrays.py
@@ -64,7 +64,7 @@ def __getitem__(self, key):
     )


-class CategoricalArray(xr.backends.BackendArray):
+class CategoricalArray(BackendArray):
     def __init__(
         self,
         codes: ZarrArray | H5Array,

From cda8aa77013ba55884510ef82355eeee5d067ff8 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 5 Jul 2024 12:20:38 +0200
Subject: [PATCH 070/348] (feat): add `callback` typing for `read_dispatched`

---
 src/anndata/_io/specs/methods.py          | 55 ++++++++++++++++--------
 src/anndata/_io/specs/registry.py         | 30 +++++++++++--
src/anndata/experimental/_dispatch_io.py | 4 +- 3 files changed, 66 insertions(+), 23 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 85bf6dddc..fe2b1bda9 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -4,7 +4,7 @@ from functools import partial from itertools import product from types import MappingProxyType -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Any, Literal from warnings import warn import h5py @@ -26,6 +26,7 @@ CupyCSCMatrix, CupyCSRMatrix, DaskArray, + SpArray, ZarrArray, ZarrGroup, _decode_structured_array, @@ -33,11 +34,13 @@ _read_attr, ) -from .registry import _REGISTRY, IOSpec, read_elem, read_elem_partial +from .registry import _REGISTRY, IOSpec, Reader, read_elem, read_elem_partial if TYPE_CHECKING: from os import PathLike + from anndata._core.storage import StorageType + H5Array = h5py.Dataset H5Group = h5py.Group H5File = h5py.File @@ -109,7 +112,9 @@ def wrapper( @_REGISTRY.register_read(H5File, IOSpec("", "")) @_REGISTRY.register_read(H5Group, IOSpec("", "")) @_REGISTRY.register_read(H5Array, IOSpec("", "")) -def read_basic(elem, _reader): +def read_basic( + elem: StorageType, _reader: Reader +) -> dict | np.ndarray[Any, Any] | np.ndarray | sparse.spmatrix | SpArray: from anndata._io import h5ad warn( @@ -129,7 +134,17 @@ def read_basic(elem, _reader): @_REGISTRY.register_read(ZarrGroup, IOSpec("", "")) @_REGISTRY.register_read(ZarrArray, IOSpec("", "")) -def read_basic_zarr(elem, _reader): +def read_basic_zarr( + elem: StorageType, _reader: Reader +) -> ( + dict + | Any + | np.ndarray[np.void] + | np.ndarray[Any, np.dtype[np.float64]] + | np.ndarray[Any, np.dtype[Any]] + | sparse.spmatrix + | SpArray +): from anndata._io import zarr warn( @@ -265,7 +280,7 @@ def write_anndata(f, k, adata, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5File, IOSpec("raw", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("raw", "0.1.0")) -def read_anndata(elem, _reader): +def read_anndata(elem: StorageType, _reader: Reader) -> AnnData: d = {} for k in [ "X", @@ -300,7 +315,7 @@ def write_raw(f, k, raw, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) -def read_mapping(elem, _reader): +def read_mapping(elem: StorageType, _reader: Reader) -> dict[str, Any]: return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -374,7 +389,7 @@ def write_basic_dask_h5(f, k, elem, _writer, dataset_kwargs=MappingProxyType({}) @_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) -def read_array(elem, _reader): +def read_array(elem: StorageType, _reader: Reader) -> np.ndarray: return elem[()] @@ -460,7 +475,7 @@ def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray: @_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0")) -def read_recarray(d, _reader): +def read_recarray(d, _reader) -> np.recarray | np.ndarray: value = d[()] dtype = value.dtype value = _from_fixed_length_strings(value) @@ -620,7 +635,7 @@ def chunk_slice(start: int, stop: int) -> tuple[slice | None, slice | None]: @_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) 
@_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) -def read_sparse(elem, _reader): +def read_sparse(elem: StorageType, _reader: Reader) -> sparse.spmatrix | SpArray: return sparse_dataset(elem).to_memory() @@ -658,7 +673,7 @@ def write_awkward(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0")) -def read_awkward(elem, _reader): +def read_awkward(elem: StorageType, _reader: Reader) -> AwkArray: from anndata.compat import awkward as ak form = _read_attr(elem.attrs, "form") @@ -720,7 +735,7 @@ def write_dataframe(f, key, df, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) -def read_dataframe(elem, _reader): +def read_dataframe(elem: StorageType, _reader: Reader) -> pd.DataFrame: columns = list(_read_attr(elem.attrs, "column-order")) idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -761,7 +776,7 @@ def read_dataframe_partial( @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.1.0")) -def read_dataframe_0_1_0(elem, _reader): +def read_dataframe_0_1_0(elem: StorageType, _reader: Reader) -> pd.DataFrame: columns = _read_attr(elem.attrs, "column-order") idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -825,7 +840,7 @@ def write_categorical(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) -def read_categorical(elem, _reader): +def read_categorical(elem: StorageType, _reader: Reader) -> pd.Categorical: return pd.Categorical.from_codes( codes=_reader.read_elem(elem["codes"]), categories=_reader.read_elem(elem["categories"]), @@ -869,7 +884,9 @@ def write_nullable_integer(f, k, v, _writer, dataset_kwargs=MappingProxyType({}) @_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0")) -def read_nullable_integer(elem, _reader): +def read_nullable_integer( + elem: StorageType, _reader: Reader +) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.IntegerArray( _reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"]) @@ -880,7 +897,9 @@ def read_nullable_integer(elem, _reader): @_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0")) -def read_nullable_boolean(elem, _reader): +def read_nullable_boolean( + elem: StorageType, _reader: Reader +) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.BooleanArray( _reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"]) @@ -896,7 +915,7 @@ def read_nullable_boolean(elem, _reader): @_REGISTRY.register_read(H5Array, IOSpec("numeric-scalar", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("numeric-scalar", "0.2.0")) -def read_scalar(elem, _reader): +def read_scalar(elem: StorageType, _reader: Reader) -> np.number: return elem[()] @@ -929,12 +948,12 @@ def write_hdf5_scalar(f, key, value, _writer, dataset_kwargs=MappingProxyType({} @_REGISTRY.register_read(H5Array, IOSpec("string", "0.2.0")) -def read_hdf5_string(elem, _reader): +def 
read_hdf5_string(elem: StorageType, _reader: Reader) -> str: return elem.asstr()[()] @_REGISTRY.register_read(ZarrArray, IOSpec("string", "0.2.0")) -def read_zarr_string(elem, _reader): +def read_zarr_string(elem: StorageType, _reader: Reader) -> str: return str(elem[()]) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index a8357295d..80f612c5c 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from functools import singledispatch, wraps from types import MappingProxyType -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Protocol, TypeVar from anndata._io.utils import report_read_key_on_error, report_write_key_on_error from anndata.compat import _read_attr @@ -64,9 +64,17 @@ def wrapper(g: GroupStorageType, k: str, *args, **kwargs): return decorator +class reader(Protocol): + def __call__( + self, + elem: StorageType, + _reader: Reader, + ) -> Any: ... + + class IORegistry: def __init__(self): - self.read: dict[tuple[type, IOSpec, frozenset[str]], Callable] = {} + self.read: dict[tuple[type, IOSpec, frozenset[str]], reader] = {} self.read_partial: dict[tuple[type, IOSpec, frozenset[str]], Callable] = {} self.write: dict[ tuple[type, type | tuple[type, str], frozenset[str]], Callable @@ -232,8 +240,24 @@ def _iter_patterns( yield t +InMemoryType = TypeVar("InMemoryType") + + +class read_callback(Protocol): + def __call__( + self, + /, + read_func: Callable[[StorageType, Reader], InMemoryType], + elem_name: str, + elem: StorageType, + iospec: IOSpec, + ) -> InMemoryType: ... + + class Reader: - def __init__(self, registry: IORegistry, callback: Callable | None = None) -> None: + def __init__( + self, registry: IORegistry, callback: read_callback | None = None + ) -> None: self.registry = registry self.callback = callback diff --git a/src/anndata/experimental/_dispatch_io.py b/src/anndata/experimental/_dispatch_io.py index 2a399d540..86dc936a4 100644 --- a/src/anndata/experimental/_dispatch_io.py +++ b/src/anndata/experimental/_dispatch_io.py @@ -6,13 +6,13 @@ if TYPE_CHECKING: from collections.abc import Mapping - from anndata._io.specs import IOSpec + from anndata._io.specs.registry import read_callback from anndata._types import GroupStorageType, StorageType def read_dispatched( elem: StorageType, - callback: Callable[[Callable[[StorageType], Any], str, StorageType, IOSpec], Any], + callback: read_callback, ) -> Any: """ Read elem, calling the callback at each sub-element. 
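
The `read_callback` protocol above makes the `read_dispatched` contract explicit: the callback receives the registered read function, the element's name, the element itself, and its `IOSpec`, with `Reader.read_elem` passing `iospec` by keyword. A minimal sketch of a conforming callback follows; the store path and the skip rule are hypothetical, chosen only to show the shape of the API:

```python
import zarr

from anndata.experimental import read_dispatched


def callback(read_func, elem_name, elem, iospec):
    # Hypothetical rule: skip `raw` entirely; everything else is read
    # with the reader registered for its encoding.
    if elem_name.endswith("/raw"):
        return None
    return read_func(elem)


z = zarr.open("example.zarr")  # hypothetical store written with write_elem
adata = read_dispatched(z, callback)
```

Because `iospec` is passed as a keyword argument, a conforming callback must accept a parameter with exactly that name, which is what the `Protocol` definition above pins down.
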
From e8f62f44af517d55b19fcbfd2778809667dcea10 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 5 Jul 2024 12:27:46 +0200
Subject: [PATCH 071/348] (chore): use `npt.NDArray`

---
 src/anndata/_io/specs/methods.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py
index fe2b1bda9..e752d3a78 100644
--- a/src/anndata/_io/specs/methods.py
+++ b/src/anndata/_io/specs/methods.py
@@ -39,6 +39,8 @@
 if TYPE_CHECKING:
     from os import PathLike

+    from numpy import typing as npt
+
     from anndata._core.storage import StorageType

 H5Array = h5py.Dataset
@@ -114,7 +116,7 @@ def wrapper(
 @_REGISTRY.register_read(H5Array, IOSpec("", ""))
 def read_basic(
     elem: StorageType, _reader: Reader
-) -> dict | np.ndarray[Any, Any] | np.ndarray | sparse.spmatrix | SpArray:
+) -> dict | npt.NDArray | sparse.spmatrix | SpArray:
     from anndata._io import h5ad

     warn(
@@ -136,15 +138,7 @@ def read_basic(
 @_REGISTRY.register_read(ZarrArray, IOSpec("", ""))
 def read_basic_zarr(
     elem: StorageType, _reader: Reader
-) -> (
-    dict
-    | Any
-    | np.ndarray[np.void]
-    | np.ndarray[Any, np.dtype[np.float64]]
-    | np.ndarray[Any, np.dtype[Any]]
-    | sparse.spmatrix
-    | SpArray
-):
+) -> dict | Any | npt.NDArray | npt.NDArray[np.float64] | sparse.spmatrix | SpArray:
     from anndata._io import zarr

     warn(
@@ -389,7 +383,7 @@ def write_basic_dask_h5(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})
 @_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0"))
 @_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0"))
 @_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0"))
-def read_array(elem: StorageType, _reader: Reader) -> np.ndarray:
+def read_array(elem: StorageType, _reader: Reader) -> npt.NDArray:
     return elem[()]

@@ -475,7 +469,7 @@ def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray:

 @_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0"))
 @_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0"))
-def read_recarray(d, _reader) -> np.recarray | np.ndarray:
+def read_recarray(d, _reader) -> np.recarray | npt.NDArray:
     value = d[()]
     dtype = value.dtype
     value = _from_fixed_length_strings(value)

From f6e48acfc10b47f15ca1109e4e691ebb841a3aa9 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 5 Jul 2024 12:29:11 +0200
Subject: [PATCH 072/348] (fix): remove unnecessary union

---
 src/anndata/_io/specs/methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py
index e752d3a78..e7886f59f 100644
--- a/src/anndata/_io/specs/methods.py
+++ b/src/anndata/_io/specs/methods.py
@@ -138,7 +138,7 @@
 @_REGISTRY.register_read(ZarrArray, IOSpec("", ""))
 def read_basic_zarr(
     elem: StorageType, _reader: Reader
-) -> dict | Any | npt.NDArray | npt.NDArray[np.float64] | sparse.spmatrix | SpArray:
+) -> dict | Any | npt.NDArray | sparse.spmatrix | SpArray:
     from anndata._io import zarr

     warn(

From 4de3246638de21e79daea55db81b9d7fdc858b55 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 5 Jul 2024 12:31:23 +0200
Subject: [PATCH 073/348] (chore): release note

---
 docs/release-notes/0.10.9.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/release-notes/0.10.9.md b/docs/release-notes/0.10.9.md
index a0beab5f2..2e60dcf2d 100644
--- a/docs/release-notes/0.10.9.md
+++ b/docs/release-notes/0.10.9.md
@@ -10,6 +10,8 @@

 #### Documentation

+* add `callback` typing for {func}`~anndata.experimental.read_dispatched` and
{func}`~anndata.experimental.write_dispatched` {pr}`1557` {user}`ilan-gold`
+
 #### Performance

 * Support for `concat_on_disk` outer join {pr}`1504` {user}`ilan-gold`

From ba817e0bec259beeeda3da6229855ff025335403 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 5 Jul 2024 12:51:31 +0200
Subject: [PATCH 074/348] (fix): try protocol docs

---
 docs/conf.py   | 1 +
 pyproject.toml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/docs/conf.py b/docs/conf.py
index ec253fc68..12ee4dd0f 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -51,6 +51,7 @@
     "sphinx.ext.mathjax",
     "sphinx.ext.napoleon",
     "sphinx.ext.autosummary",
+    "sphinx_toolbox.more_autodoc.autoprotocol",
     "sphinx_autodoc_typehints",  # needs to be after napoleon
     "sphinx_issues",
     "sphinx_design",
diff --git a/pyproject.toml b/pyproject.toml
index 0ea5f8962..f6c3b8b09 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -71,6 +71,7 @@ doc = [
     "sphinx-autodoc-typehints>=1.11.0",
     "sphinx-issues",
     "sphinx-copybutton",
+    "sphinx-toolbox",
     "sphinxext.opengraph",
     "nbsphinx",
     "scanpydoc[theme,typehints] >=0.13.4",

From 438d28ddf749a52ca1e6ceb44381cc3bf7f742c3 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 5 Jul 2024 14:03:30 +0200
Subject: [PATCH 075/348] (feat): create `InMemoryElem` + `DictElemType` to
 remove `Any`

---
 src/anndata/_io/specs/methods.py  |  9 ++++----
 src/anndata/_io/specs/registry.py | 31 +++++++++++++++++++++-----
 src/anndata/_types.py             | 36 ++++++++++++++++++++++++++++++-
 3 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py
index e7886f59f..e5cd92337 100644
--- a/src/anndata/_io/specs/methods.py
+++ b/src/anndata/_io/specs/methods.py
@@ -4,7 +4,7 @@
 from functools import partial
 from itertools import product
 from types import MappingProxyType
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Literal
 from warnings import warn

 import h5py
@@ -42,6 +42,7 @@
     from numpy import typing as npt

     from anndata._core.storage import StorageType
+    from anndata._types import DictElemType

 H5Array = h5py.Dataset
 H5Group = h5py.Group
@@ -116,7 +117,7 @@ def wrapper(
 @_REGISTRY.register_read(H5Array, IOSpec("", ""))
 def read_basic(
     elem: StorageType, _reader: Reader
-) -> dict | npt.NDArray | sparse.spmatrix | SpArray:
+) -> dict[str, DictElemType] | npt.NDArray | sparse.spmatrix | SpArray:
     from anndata._io import h5ad

     warn(
@@ -138,7 +139,7 @@
 @_REGISTRY.register_read(ZarrArray, IOSpec("", ""))
 def read_basic_zarr(
     elem: StorageType, _reader: Reader
-) -> dict | Any | npt.NDArray | sparse.spmatrix | SpArray:
+) -> dict[str, DictElemType] | npt.NDArray | sparse.spmatrix | SpArray:
     from anndata._io import zarr

     warn(
@@ -309,7 +310,7 @@ def write_raw(f, k, raw, _writer, dataset_kwargs=MappingProxyType({})):

 @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0"))
 @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0"))
-def read_mapping(elem: StorageType, _reader: Reader) -> dict[str, Any]:
+def read_mapping(elem: StorageType, _reader: Reader) -> dict[str, DictElemType]:
     return {k: _reader.read_elem(v) for k, v in elem.items()}

diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py
index 80f612c5c..dfa43a4bc 100644
--- a/src/anndata/_io/specs/registry.py
+++ b/src/anndata/_io/specs/registry.py
@@ -6,13 +6,34 @@
 from types import MappingProxyType
 from typing import TYPE_CHECKING, Any, Protocol, TypeVar

+import numpy as np
+import pandas as pd
+from numpy import typing as npt
+from scipy
import sparse + +from anndata._core.anndata import AnnData from anndata._io.utils import report_read_key_on_error, report_write_key_on_error -from anndata.compat import _read_attr +from anndata._types import DictElemType +from anndata.compat import SpArray, _read_attr if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable - from anndata._types import GroupStorageType, StorageType + from anndata._core.storage import StorageType + from anndata._types import GroupStorageType + +InMemoryElem = ( + dict[str, DictElemType] + | npt.NDArray + | sparse.spmatrix + | SpArray + | AnnData + | pd.DataFrame + | pd.Categorical + | str + | np.number + | pd.api.extensions.ExtensionArray +) # TODO: This probably should be replaced by a hashable Mapping due to conversion b/w "_" and "-" @@ -69,7 +90,7 @@ def __call__( self, elem: StorageType, _reader: Reader, - ) -> Any: ... + ) -> InMemoryElem: ... class IORegistry: @@ -240,7 +261,7 @@ def _iter_patterns( yield t -InMemoryType = TypeVar("InMemoryType") +InMemoryType = TypeVar("InMemoryType", bound=InMemoryElem) class read_callback(Protocol): @@ -266,7 +287,7 @@ def read_elem( self, elem: StorageType, modifiers: frozenset[str] = frozenset(), - ) -> Any: + ) -> InMemoryElem: """Read an element from a store. See exported function for more details.""" from functools import partial diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 17dd014d5..357bf457c 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -6,7 +6,23 @@ from typing import Union -from anndata.compat import H5Array, H5Group, ZarrArray, ZarrGroup +import numpy as np +import pandas as pd +from scipy import sparse + +from anndata._core.sparse_dataset import BaseCompressedSparseDataset +from anndata.compat import ( + AwkArray, + CupyArray, + CupySparseMatrix, + DaskArray, + H5Array, + H5Group, + SpArray, + ZappyArray, + ZarrArray, + ZarrGroup, +) __all__ = [ "ArrayStorageType", @@ -14,6 +30,24 @@ "StorageType", ] +DictElemType = ( + np.ndarray + | np.ma.MaskedArray + | sparse.spmatrix + | SpArray + | H5Array + | ZarrArray + | ZappyArray + | BaseCompressedSparseDataset + | DaskArray + | CupyArray + | CupySparseMatrix + | AwkArray + | pd.DataFrame + | np.number + | str +) + ArrayStorageType = Union[ZarrArray, H5Array] GroupStorageType = Union[ZarrGroup, H5Group] StorageType = Union[ArrayStorageType, GroupStorageType] From 296ea3ff87a2a46acd4d10739fc6331e119394bd Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 5 Jul 2024 14:07:21 +0200 Subject: [PATCH 076/348] (chore): refactor `DictElemType` -> `InMemoryArrayOrScalarType` for reuse --- src/anndata/_io/specs/methods.py | 10 ++++++---- src/anndata/_io/specs/registry.py | 16 ++++------------ src/anndata/_types.py | 2 +- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index e5cd92337..ec54e55b8 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -42,7 +42,7 @@ from numpy import typing as npt from anndata._core.storage import StorageType - from anndata._types import DictElemType + from anndata._types import InMemoryArrayOrScalarType H5Array = h5py.Dataset H5Group = h5py.Group @@ -117,7 +117,7 @@ def wrapper( @_REGISTRY.register_read(H5Array, IOSpec("", "")) def read_basic( elem: StorageType, _reader: Reader -) -> dict[str, DictElemType] | npt.NDArray | sparse.spmatrix | SpArray: +) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import h5ad 
warn( @@ -139,7 +139,7 @@ def read_basic( @_REGISTRY.register_read(ZarrArray, IOSpec("", "")) def read_basic_zarr( elem: StorageType, _reader: Reader -) -> dict[str, DictElemType] | npt.NDArray | sparse.spmatrix | SpArray: +) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import zarr warn( @@ -310,7 +310,9 @@ def write_raw(f, k, raw, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) -def read_mapping(elem: StorageType, _reader: Reader) -> dict[str, DictElemType]: +def read_mapping( + elem: StorageType, _reader: Reader +) -> dict[str, InMemoryArrayOrScalarType]: return {k: _reader.read_elem(v) for k, v in elem.items()} diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index dfa43a4bc..5134ffab1 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -6,15 +6,12 @@ from types import MappingProxyType from typing import TYPE_CHECKING, Any, Protocol, TypeVar -import numpy as np import pandas as pd -from numpy import typing as npt -from scipy import sparse from anndata._core.anndata import AnnData from anndata._io.utils import report_read_key_on_error, report_write_key_on_error -from anndata._types import DictElemType -from anndata.compat import SpArray, _read_attr +from anndata._types import InMemoryArrayOrScalarType +from anndata.compat import _read_attr if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable @@ -23,15 +20,10 @@ from anndata._types import GroupStorageType InMemoryElem = ( - dict[str, DictElemType] - | npt.NDArray - | sparse.spmatrix - | SpArray + dict[str, InMemoryArrayOrScalarType] + | InMemoryArrayOrScalarType | AnnData - | pd.DataFrame | pd.Categorical - | str - | np.number | pd.api.extensions.ExtensionArray ) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 357bf457c..21d235cdd 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -30,7 +30,7 @@ "StorageType", ] -DictElemType = ( +InMemoryArrayOrScalarType = ( np.ndarray | np.ma.MaskedArray | sparse.spmatrix From cf13a575e0f3c50c7e8ad8f5140c666aee3798c7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 5 Jul 2024 14:12:20 +0200 Subject: [PATCH 077/348] (fix): use `Union` --- src/anndata/_types.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 21d235cdd..c08477e2b 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -30,23 +30,23 @@ "StorageType", ] -InMemoryArrayOrScalarType = ( - np.ndarray - | np.ma.MaskedArray - | sparse.spmatrix - | SpArray - | H5Array - | ZarrArray - | ZappyArray - | BaseCompressedSparseDataset - | DaskArray - | CupyArray - | CupySparseMatrix - | AwkArray - | pd.DataFrame - | np.number - | str -) +InMemoryArrayOrScalarType = Union[ + np.typing.NDArray, + np.ma.MaskedArray, + sparse.spmatrix, + SpArray, + H5Array, + ZarrArray, + ZappyArray, + BaseCompressedSparseDataset, + DaskArray, + CupyArray, + CupySparseMatrix, + AwkArray, + pd.DataFrame, + np.number, + str, +] ArrayStorageType = Union[ZarrArray, H5Array] GroupStorageType = Union[ZarrGroup, H5Group] From d02ba49f6689ce35f802b6c774ea9b5a2ea0b32e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 5 Jul 2024 14:16:31 +0200 Subject: [PATCH 078/348] (fix): more `Union` --- src/anndata/_io/specs/registry.py | 16 ++++++++-------- 1 file changed, 8 
insertions(+), 8 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 5134ffab1..3137240c2 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from functools import singledispatch, wraps from types import MappingProxyType -from typing import TYPE_CHECKING, Any, Protocol, TypeVar +from typing import TYPE_CHECKING, Any, Protocol, TypeVar, Union import pandas as pd @@ -19,13 +19,13 @@ from anndata._core.storage import StorageType from anndata._types import GroupStorageType -InMemoryElem = ( - dict[str, InMemoryArrayOrScalarType] - | InMemoryArrayOrScalarType - | AnnData - | pd.Categorical - | pd.api.extensions.ExtensionArray -) +InMemoryElem = Union[ + dict[str, InMemoryArrayOrScalarType], + InMemoryArrayOrScalarType, + AnnData, + pd.Categorical, + pd.api.extensions.ExtensionArray, +] # TODO: This probably should be replaced by a hashable Mapping due to conversion b/w "_" and "-" From 6970a97d3cef2e903217053b9a713dc95fd959b9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 5 Jul 2024 14:17:02 +0200 Subject: [PATCH 079/348] (refactor): `InMemoryElem` -> `InMemoryReadElem` --- src/anndata/_io/specs/registry.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 3137240c2..e57392792 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -19,7 +19,7 @@ from anndata._core.storage import StorageType from anndata._types import GroupStorageType -InMemoryElem = Union[ +InMemoryReadElem = Union[ dict[str, InMemoryArrayOrScalarType], InMemoryArrayOrScalarType, AnnData, @@ -82,7 +82,7 @@ def __call__( self, elem: StorageType, _reader: Reader, - ) -> InMemoryElem: ... + ) -> InMemoryReadElem: ... class IORegistry: @@ -253,7 +253,7 @@ def _iter_patterns( yield t -InMemoryType = TypeVar("InMemoryType", bound=InMemoryElem) +InMemoryType = TypeVar("InMemoryType", bound=InMemoryReadElem) class read_callback(Protocol): @@ -279,7 +279,7 @@ def read_elem( self, elem: StorageType, modifiers: frozenset[str] = frozenset(), - ) -> InMemoryElem: + ) -> InMemoryReadElem: """Read an element from a store. 
See exported function for more details.""" from functools import partial From 2282351956feda47b741895e240a882201acc9b3 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 5 Jul 2024 15:26:10 +0200 Subject: [PATCH 080/348] (chore): add needed types to public export + docs fix --- docs/api.md | 3 ++- docs/conf.py | 2 +- pyproject.toml | 1 - src/anndata/experimental/__init__.py | 4 ++++ 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/api.md b/docs/api.md index fb8f40f93..496f4e0f3 100644 --- a/docs/api.md +++ b/docs/api.md @@ -131,7 +131,8 @@ Utilities for customizing the IO process: experimental.read_dispatched experimental.write_dispatched experimental.IOSpec - + experimental.read_callback + experimental.StorageType ``` ## Errors and warnings diff --git a/docs/conf.py b/docs/conf.py index 12ee4dd0f..952791856 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,7 +51,6 @@ "sphinx.ext.mathjax", "sphinx.ext.napoleon", "sphinx.ext.autosummary", - "sphinx_toolbox.more_autodoc.autoprotocol", "sphinx_autodoc_typehints", # needs to be after napoleon "sphinx_issues", "sphinx_design", @@ -95,6 +94,7 @@ # TODO: sphinx’ builtin autodoc.typehints extension isn’t handled by `qualname_overrides` yet # https://github.com/theislab/scanpydoc/issues/140 ("py:class", "h5py._hl.group.Group"), + ("py:class", "h5py._hl.dataset.Dataset"), ] suppress_warnings = [ "ref.citation", diff --git a/pyproject.toml b/pyproject.toml index f6c3b8b09..0ea5f8962 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,6 @@ doc = [ "sphinx-autodoc-typehints>=1.11.0", "sphinx-issues", "sphinx-copybutton", - "sphinx-toolbox", "sphinxext.opengraph", "nbsphinx", "scanpydoc[theme,typehints] >=0.13.4", diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 486f14e8d..6b78e6433 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -3,6 +3,8 @@ from anndata._core.sparse_dataset import CSCDataset, CSRDataset, sparse_dataset from anndata._io.specs import IOSpec, read_elem, write_elem +from .._core.storage import StorageType +from .._io.specs.registry import read_callback from ._dispatch_io import read_dispatched, write_dispatched from .merge import concat_on_disk from .multi_files import AnnCollection @@ -20,4 +22,6 @@ "sparse_dataset", "CSRDataset", "CSCDataset", + "read_callback", + "StorageType", ] From 47d87bb4a12c08ba5c3a19914205e8c93ade5c35 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 5 Jul 2024 15:30:25 +0200 Subject: [PATCH 081/348] (fix): `DataArray` type --- src/anndata/experimental/backed/_lazy_arrays.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 842431265..99d722115 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -7,12 +7,13 @@ try: import xarray as xr + from xarray import DataArray except ImportError: + xr = None - class xr: - @property - def DataArray(self): - return None + class DataArray: + def __repr__(self) -> str: + return "mock DataArray" try: @@ -141,11 +142,11 @@ def __getitem__(self, key) -> xr.core.extension_array.PandasExtensionArray: return xr.core.extension_array.PandasExtensionArray(pd.array(values)) -@_subset.register(xr.DataArray) -def _subset_masked(a: xr.DataArray, subset_idx: Index): +@_subset.register(DataArray) +def _subset_masked(a: DataArray, subset_idx: Index): return 
a[subset_idx] -@as_view.register(xr.DataArray) -def _view_pd_boolean_array(a: xr.DataArray, view_args): +@as_view.register(DataArray) +def _view_pd_boolean_array(a: DataArray, view_args): return a From b58897e7dc8386623f73c8ae4646bc731a2207d0 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 5 Jul 2024 15:31:07 +0200 Subject: [PATCH 082/348] (fix): `dims`->`sizes` --- src/anndata/experimental/backed/_xarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py index 48f1dd8e3..59b04b7a0 100644 --- a/src/anndata/experimental/backed/_xarray.py +++ b/src/anndata/experimental/backed/_xarray.py @@ -32,7 +32,7 @@ def index(self) -> pd.Index: def shape( self, ): # aligned mapping classes look for this for DataFrames so this ensures usability with e.g., obsm - return [self.dims[get_index_dim(self)], len(self)] + return [self.sizes[get_index_dim(self)], len(self)] @property def iloc(self): From 6165f07b7d5fd54113068cccc37303bf2400d80b Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 5 Jul 2024 16:15:43 +0200 Subject: [PATCH 083/348] (fix): move to `_compat` for `ImportError` stuff --- src/anndata/experimental/backed/_compat.py | 42 +++++++++++++++++++ src/anndata/experimental/backed/_io.py | 6 +-- .../experimental/backed/_lazy_arrays.py | 31 +------------- src/anndata/experimental/backed/_xarray.py | 6 +-- 4 files changed, 48 insertions(+), 37 deletions(-) create mode 100644 src/anndata/experimental/backed/_compat.py diff --git a/src/anndata/experimental/backed/_compat.py b/src/anndata/experimental/backed/_compat.py new file mode 100644 index 000000000..037306698 --- /dev/null +++ b/src/anndata/experimental/backed/_compat.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +try: + from xarray import DataArray +except ImportError: + + class DataArray: + def __repr__(self) -> str: + return "mock DataArray" + + +try: + import xarray as xr +except ImportError: + xr = None + + +try: + from xarray.backends.zarr import ZarrArrayWrapper +except ImportError: + + class ZarrArrayWrapper: + def __repr__(self) -> str: + return "mock ZarrArrayWrapper" + + +try: + from xarray.backends import BackendArray +except ImportError: + + class BackendArray: + def __repr__(self) -> str: + return "mock BackendArray" + + +try: + from xarray import Dataset +except ImportError: + + class Dataset: + def __repr__(self) -> str: + return "mock Dataset" diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index c092be88d..2aa752e92 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -11,17 +11,13 @@ import dask.array as da import h5py - -try: - import xarray as xr -except ImportError: - xr = None import zarr from ..._core.anndata import AnnData from ..._core.sparse_dataset import sparse_dataset from ...compat import DaskArray from .. 
import read_dispatched +from ._compat import xr from ._lazy_arrays import CategoricalArray, MaskedArray from ._xarray import Dataset2D diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 99d722115..89df81f01 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -5,39 +5,12 @@ import pandas as pd -try: - import xarray as xr - from xarray import DataArray -except ImportError: - xr = None - - class DataArray: - def __repr__(self) -> str: - return "mock DataArray" - - -try: - from xarray.backends.zarr import ZarrArrayWrapper -except ImportError: - - class ZarrArrayWrapper: - def __repr__(self) -> str: - return "mock ZarrArrayWrapper" - - -try: - from xarray.backends import BackendArray -except ImportError: - - class BackendArray: - def __repr__(self) -> str: - return "mock BackendArray" - - from anndata._core.index import Index, _subset from anndata._core.views import as_view from anndata.compat import H5Array, ZarrArray +from ._compat import BackendArray, DataArray, ZarrArrayWrapper, xr + K = TypeVar("K", bound=Union[H5Array, ZarrArray]) diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py index 59b04b7a0..6d3b0665f 100644 --- a/src/anndata/experimental/backed/_xarray.py +++ b/src/anndata/experimental/backed/_xarray.py @@ -7,12 +7,12 @@ import pandas as pd -import xarray as xr from ..._core.anndata import _gen_dataframe, _remove_unused_categories from ..._core.file_backing import to_memory from ..._core.index import Index, _subset from ..._core.views import as_view +from ._compat import DataArray, Dataset def get_index_dim(ds): @@ -22,7 +22,7 @@ def get_index_dim(ds): return list(ds.sizes.keys())[0] -class Dataset2D(xr.Dataset): +class Dataset2D(Dataset): @property def index(self) -> pd.Index: coord = list(self.coords.keys())[0] @@ -48,7 +48,7 @@ def __getitem__(self, idx): @_subset.register(Dataset2D) -def _(a: xr.DataArray, subset_idx: Index): +def _(a: DataArray, subset_idx: Index): key = get_index_dim(a) if ( isinstance(subset_idx, tuple) and len(subset_idx) == 1 From a996081625fd44f2b6866c0c2739ff44e2ad6908 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 Jul 2024 11:23:47 +0200 Subject: [PATCH 084/348] (chore): type `write_elem` functions --- src/anndata/_io/specs/methods.py | 170 ++++++++++++++++++++++++++----- 1 file changed, 144 insertions(+), 26 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index ec54e55b8..2be9e8964 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -34,7 +34,7 @@ _read_attr, ) -from .registry import _REGISTRY, IOSpec, Reader, read_elem, read_elem_partial +from .registry import _REGISTRY, IOSpec, Reader, Writer, read_elem, read_elem_partial if TYPE_CHECKING: from os import PathLike @@ -42,7 +42,7 @@ from numpy import typing as npt from anndata._core.storage import StorageType - from anndata._types import InMemoryArrayOrScalarType + from anndata._types import GroupStorageType, InMemoryArrayOrScalarType H5Array = h5py.Dataset H5Group = h5py.Group @@ -255,7 +255,13 @@ def _read_partial(group, *, items=None, indices=(slice(None), slice(None))): @_REGISTRY.register_write(ZarrGroup, AnnData, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_write(H5Group, AnnData, IOSpec("anndata", "0.1.0")) -def write_anndata(f, k, adata, _writer, dataset_kwargs=MappingProxyType({})): +def write_anndata( + f: GroupStorageType, 
+ k: str, + adata: AnnData, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): g = f.require_group(k) _writer.write_elem(g, "X", adata.X, dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "obs", adata.obs, dataset_kwargs=dataset_kwargs) @@ -296,7 +302,13 @@ def read_anndata(elem: StorageType, _reader: Reader) -> AnnData: @_REGISTRY.register_write(H5Group, Raw, IOSpec("raw", "0.1.0")) @_REGISTRY.register_write(ZarrGroup, Raw, IOSpec("raw", "0.1.0")) -def write_raw(f, k, raw, _writer, dataset_kwargs=MappingProxyType({})): +def write_raw( + f: GroupStorageType, + k: str, + raw: Raw, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): g = f.require_group(k) _writer.write_elem(g, "X", raw.X, dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "var", raw.var, dataset_kwargs=dataset_kwargs) @@ -318,7 +330,13 @@ def read_mapping( @_REGISTRY.register_write(H5Group, dict, IOSpec("dict", "0.1.0")) @_REGISTRY.register_write(ZarrGroup, dict, IOSpec("dict", "0.1.0")) -def write_mapping(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): +def write_mapping( + f: GroupStorageType, + k: str, + v: dict, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): g = f.require_group(k) for sub_k, sub_v in v.items(): _writer.write_elem(g, sub_k, sub_v, dataset_kwargs=dataset_kwargs) @@ -331,7 +349,13 @@ def write_mapping(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_write(H5Group, list, IOSpec("array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, list, IOSpec("array", "0.2.0")) -def write_list(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_list( + f: GroupStorageType, + k: str, + elem: list, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): _writer.write_elem(f, k, np.array(elem), dataset_kwargs=dataset_kwargs) @@ -346,7 +370,13 @@ def write_list(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_write(ZarrGroup, h5py.Dataset, IOSpec("array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, np.ma.MaskedArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0")) -def write_basic(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_basic( + f: GroupStorageType, + k: str, + elem: views.ArrayView | np.ndarray | h5py.Dataset | np.ma.MaskedArray | ZarrArray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): """Write methods which underlying library handles natively.""" f.create_dataset(k, data=elem, **dataset_kwargs) @@ -360,7 +390,13 @@ def write_basic(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_write(ZarrGroup, DaskArray, IOSpec("array", "0.2.0")) -def write_basic_dask_zarr(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_basic_dask_zarr( + f: ZarrGroup, + k: str, + elem: DaskArray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): import dask.array as da g = f.require_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs) @@ -370,7 +406,13 @@ def write_basic_dask_zarr(f, k, elem, _writer, dataset_kwargs=MappingProxyType({ # Adding this separately because h5py isn't serializable # https://github.com/pydata/xarray/issues/4242 @_REGISTRY.register_write(H5Group, DaskArray, IOSpec("array", "0.2.0")) -def write_basic_dask_h5(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_basic_dask_h5( + f: H5Group, + k: 
str, + elem: DaskArray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): import dask.array as da import dask.config as dc @@ -420,7 +462,13 @@ def read_string_array_partial(d, items=None, indices=slice(None)): ) @_REGISTRY.register_write(H5Group, (np.ndarray, "U"), IOSpec("string-array", "0.2.0")) @_REGISTRY.register_write(H5Group, (np.ndarray, "O"), IOSpec("string-array", "0.2.0")) -def write_vlen_string_array(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_vlen_string_array( + f: H5Group, + k: str, + elem: np.ndarray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): """Write methods which underlying library handles nativley.""" str_dtype = h5py.special_dtype(vlen=str) f.create_dataset(k, data=elem.astype(str_dtype), dtype=str_dtype, **dataset_kwargs) @@ -435,7 +483,11 @@ def write_vlen_string_array(f, k, elem, _writer, dataset_kwargs=MappingProxyType @_REGISTRY.register_write(ZarrGroup, (np.ndarray, "U"), IOSpec("string-array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, (np.ndarray, "O"), IOSpec("string-array", "0.2.0")) def write_vlen_string_array_zarr( - f, k, elem, _writer, dataset_kwargs=MappingProxyType({}) + f: ZarrGroup, + k: str, + elem: np.ndarray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), ): import numcodecs @@ -483,13 +535,25 @@ def read_recarray(d, _reader) -> np.recarray | npt.NDArray: @_REGISTRY.register_write(H5Group, (np.ndarray, "V"), IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_write(H5Group, np.recarray, IOSpec("rec-array", "0.2.0")) -def write_recarray(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_recarray( + f: H5Group, + k: str, + elem: np.ndarray | np.recarray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): f.create_dataset(k, data=_to_hdf5_vlen_strings(elem), **dataset_kwargs) @_REGISTRY.register_write(ZarrGroup, (np.ndarray, "V"), IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, np.recarray, IOSpec("rec-array", "0.2.0")) -def write_recarray_zarr(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_recarray_zarr( + f: ZarrGroup, + k: str, + elem: np.ndarray | np.recarray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): from anndata.compat import _to_fixed_length_strings f.create_dataset(k, data=_to_fixed_length_strings(elem), **dataset_kwargs) @@ -501,10 +565,10 @@ def write_recarray_zarr(f, k, elem, _writer, dataset_kwargs=MappingProxyType({}) def write_sparse_compressed( - f, - key, - value, - _writer, + f: GroupStorageType, + key: str, + value: sparse.spmatrix | SpArray, + _writer: Writer, fmt: Literal["csr", "csc"], dataset_kwargs=MappingProxyType({}), ): @@ -560,7 +624,13 @@ def write_sparse_compressed( @_REGISTRY.register_write(H5Group, CSCDataset, IOSpec("", "0.1.0")) @_REGISTRY.register_write(ZarrGroup, CSRDataset, IOSpec("", "0.1.0")) @_REGISTRY.register_write(ZarrGroup, CSCDataset, IOSpec("", "0.1.0")) -def write_sparse_dataset(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_sparse_dataset( + f: GroupStorageType, + k: str, + elem: CSCDataset | CSRDataset, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): write_sparse_compressed( f, k, @@ -586,7 +656,13 @@ def write_sparse_dataset(f, k, elem, _writer, dataset_kwargs=MappingProxyType({} @_REGISTRY.register_write( ZarrGroup, (DaskArray, sparse.csc_matrix), IOSpec("csc_matrix", "0.1.0") ) -def 
write_dask_sparse(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_dask_sparse( + f: GroupStorageType, + k: str, + elem: DaskArray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): sparse_format = elem._meta.format def as_int64_indices(x): @@ -657,7 +733,13 @@ def read_sparse_partial(elem, *, items=None, indices=(slice(None), slice(None))) @_REGISTRY.register_write( ZarrGroup, views.AwkwardArrayView, IOSpec("awkward-array", "0.1.0") ) -def write_awkward(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): +def write_awkward( + f: GroupStorageType, + k: str, + v: views.AwkwardArrayView | AwkArray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): from anndata.compat import awkward as ak group = f.require_group(k) @@ -689,7 +771,13 @@ def read_awkward(elem: StorageType, _reader: Reader) -> AwkArray: @_REGISTRY.register_write(H5Group, pd.DataFrame, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, views.DataFrameView, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, pd.DataFrame, IOSpec("dataframe", "0.2.0")) -def write_dataframe(f, key, df, _writer, dataset_kwargs=MappingProxyType({})): +def write_dataframe( + f: GroupStorageType, + key: str, + df: views.DataFrameView | pd.DataFrame, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): # Check arguments for reserved in ("_index",): if reserved in df.columns: @@ -825,7 +913,13 @@ def read_partial_dataframe_0_1_0( @_REGISTRY.register_write(H5Group, pd.Categorical, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, pd.Categorical, IOSpec("categorical", "0.2.0")) -def write_categorical(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): +def write_categorical( + f: GroupStorageType, + k: str, + v: pd.Categorical, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): g = f.require_group(k) g.attrs["ordered"] = bool(v.ordered) @@ -872,7 +966,13 @@ def read_partial_categorical(elem, *, items=None, indices=(slice(None),)): @_REGISTRY.register_write( ZarrGroup, pd.arrays.BooleanArray, IOSpec("nullable-boolean", "0.1.0") ) -def write_nullable_integer(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): +def write_nullable_integer( + f: GroupStorageType, + k: str, + v: pd.arrays.IntegerArray | pd.arrays.BooleanArray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): g = f.require_group(k) if v._mask is not None: _writer.write_elem(g, "mask", v._mask, dataset_kwargs=dataset_kwargs) @@ -916,11 +1016,23 @@ def read_scalar(elem: StorageType, _reader: Reader) -> np.number: return elem[()] -def write_scalar(f, key, value, _writer, dataset_kwargs=MappingProxyType({})): +def write_scalar( + f: GroupStorageType, + key: str, + value, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): return f.create_dataset(key, data=np.array(value), **dataset_kwargs) -def write_hdf5_scalar(f, key, value, _writer, dataset_kwargs=MappingProxyType({})): +def write_hdf5_scalar( + f: H5Group, + key: str, + value, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): # Can’t compress scalars, error is thrown dataset_kwargs = dataset_kwargs.copy() dataset_kwargs.pop("compression", None) @@ -960,7 +1072,13 @@ def read_zarr_string(elem: StorageType, _reader: Reader) -> str: @_REGISTRY.register_write(H5Group, np.str_, IOSpec("string", "0.2.0")) @_REGISTRY.register_write(H5Group, str, 
IOSpec("string", "0.2.0")) -def write_string(f, k, v, _writer, dataset_kwargs): +def write_string( + f: GroupStorageType, + k: str, + v: np.str_ | str, + _writer: Writer, + dataset_kwargs: MappingProxyType, +): dataset_kwargs = dataset_kwargs.copy() dataset_kwargs.pop("compression", None) dataset_kwargs.pop("compression_opts", None) From f6e457b7db9d02ec8693a46dfd568a365680c117 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 Jul 2024 11:40:47 +0200 Subject: [PATCH 085/348] (chore): create `write_callback` protocol --- src/anndata/_io/specs/registry.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index e57392792..98035536d 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -293,8 +293,24 @@ def read_elem( return self.callback(read_func, elem.name, elem, iospec=iospec) +class write_callback(Protocol): + def __call__( + self, + /, + write_func: Callable[ + [GroupStorageType, str, InMemoryReadElem, Writer, MappingProxyType], None + ], + store: GroupStorageType, + elem_name: str, + elem: InMemoryReadElem, + *, + iospec: IOSpec, + dataset_kwargs: MappingProxyType, + ) -> InMemoryType: ... + + class Writer: - def __init__(self, registry: IORegistry, callback: Callable | None = None): + def __init__(self, registry: IORegistry, callback: write_callback | None = None): self.registry = registry self.callback = callback From 4416526c3a295c527020d733a4527dd364ca6832 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 Jul 2024 12:12:49 +0200 Subject: [PATCH 086/348] (chore): export + docs --- docs/api.md | 1 + src/anndata/experimental/__init__.py | 3 ++- src/anndata/experimental/_dispatch_io.py | 9 +++------ 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/api.md b/docs/api.md index 496f4e0f3..9eb57a0a9 100644 --- a/docs/api.md +++ b/docs/api.md @@ -132,6 +132,7 @@ Utilities for customizing the IO process: experimental.write_dispatched experimental.IOSpec experimental.read_callback + experimental.write_callback experimental.StorageType ``` diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 6b78e6433..e042d5e96 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -4,7 +4,7 @@ from anndata._io.specs import IOSpec, read_elem, write_elem from .._core.storage import StorageType -from .._io.specs.registry import read_callback +from .._io.specs.registry import read_callback, write_callback from ._dispatch_io import read_dispatched, write_dispatched from .merge import concat_on_disk from .multi_files import AnnCollection @@ -23,5 +23,6 @@ "CSRDataset", "CSCDataset", "read_callback", + "write_callback", "StorageType", ] diff --git a/src/anndata/experimental/_dispatch_io.py b/src/anndata/experimental/_dispatch_io.py index c7c79df1a..549ca85c4 100644 --- a/src/anndata/experimental/_dispatch_io.py +++ b/src/anndata/experimental/_dispatch_io.py @@ -4,10 +4,10 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from collections.abc import Callable, Mapping + from collections.abc import Mapping from typing import Any - from anndata._io.specs.registry import read_callback + from anndata._io.specs.registry import read_callback, write_callback from anndata._types import GroupStorageType, StorageType @@ -53,10 +53,7 @@ def write_dispatched( store: GroupStorageType, key: str, elem: Any, - callback: Callable[ - [Callable[[StorageType, str, Any], None], 
GroupStorageType, str, Any, dict], - None, - ], + callback: write_callback, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> None: From fbe44f0dc30c12833c896ef37f6c39a891fb7bd7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 Jul 2024 12:29:08 +0200 Subject: [PATCH 087/348] (fix): add string descriptions --- src/anndata/_io/specs/registry.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index f77471519..9366ce305 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -267,6 +267,13 @@ def __call__( iospec: IOSpec, ) -> InMemoryType: ... + """Callback used in {func}`anndata.experimental.read_dispatched` to customize reading an element from a store. + + Returns + ------- + The element read from the store. + """ + class Reader: def __init__( @@ -307,7 +314,9 @@ def __call__( *, iospec: IOSpec, dataset_kwargs: MappingProxyType, - ) -> InMemoryType: ... + ) -> None: ... + + """Callback used in {func}`anndata.experimental.write_dispatched` to customize writing an element to a store.""" class Writer: From 8c1f01d7b100c471167a5449a767fe34f8c2fe9e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 Jul 2024 12:33:52 +0200 Subject: [PATCH 088/348] (fix): try sphinx protocol doc --- docs/conf.py | 1 + pyproject.toml | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 952791856..223fae81d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -60,6 +60,7 @@ "sphinx.ext.linkcode", "nbsphinx", "IPython.sphinxext.ipython_console_highlighting", + "sphinx_toolbox.more_autodoc.autoprotocol", ] myst_enable_extensions = [ "html_image", # So README.md can be used on github and sphinx docs diff --git a/pyproject.toml b/pyproject.toml index 03a409c7e..f6c3b8b09 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,7 @@ doc = [ "sphinx-autodoc-typehints>=1.11.0", "sphinx-issues", "sphinx-copybutton", + "sphinx-toolbox", "sphinxext.opengraph", "nbsphinx", "scanpydoc[theme,typehints] >=0.13.4", @@ -184,9 +185,6 @@ ignore = [ [tool.ruff.lint.isort] known-first-party = ["anndata"] required-imports = ["from __future__ import annotations"] -[tool.ruff.lint.flake8-type-checking] -exempt-modules = [] -strict = true [tool.codespell] skip = ".git,*.pdf,*.svg" From a7d412a72e095572e5c56ec180ae6b7d9105976f Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 Jul 2024 17:07:52 +0200 Subject: [PATCH 089/348] (fix): try ignoring exports --- docs/conf.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 223fae81d..8eaa58dd5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -96,6 +96,13 @@ # https://github.com/theislab/scanpydoc/issues/140 ("py:class", "h5py._hl.group.Group"), ("py:class", "h5py._hl.dataset.Dataset"), + # for experimental callback exports + ("py:class", "anndata.compat.ZappyArray"), + ("py:class", "anndata.compat.DaskArray"), + ("py:class", "anndata.compat.CupyArray"), + ("py:class", "anndata.compat.CupySparseMatrix"), + ("py:class", "awkward.highlevel.Array"), + ("py:class", "anndata._core.sparse_dataset.BaseCompressedSparseDataset"), ] suppress_warnings = [ "ref.citation", From 4d56396c9952b0d75e16b7467c55f110d1b9cbdd Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 Jul 2024 17:51:33 +0200 Subject: [PATCH 090/348] (fix): remap callback internal usages --- docs/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 
8eaa58dd5..aba358a71 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -132,6 +132,8 @@ def setup(app: Sphinx): "h5py._hl.files.File": "h5py.File", "h5py._hl.dataset.Dataset": "h5py.Dataset", "anndata._core.anndata.AnnData": "anndata.AnnData", + "anndata._io.specs.registry.read_callback": "anndata.experimental.read_callback", + "anndata._io.specs.registry.write_callback": "anndata.experimental.write_callback", } # -- Social cards --------------------------------------------------------- From 2012ee5fc08c318e706104e31766c6d69935edd9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 Jul 2024 18:52:31 +0200 Subject: [PATCH 091/348] (fix): add docstring --- src/anndata/_io/specs/registry.py | 33 +++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 9366ce305..0b9834d2d 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -267,7 +267,19 @@ def __call__( iospec: IOSpec, ) -> InMemoryType: ... - """Callback used in {func}`anndata.experimental.read_dispatched` to customize reading an element from a store. + """ + Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. + + Params + ------ + read_func + :func:`anndata.experimental.read_elem` function to call to read the current element given the :param:`iospec`. + elem_name + The key to read in from the group. + elem + The element to read from. + iospec + Internal AnnData encoding specification for the element. Returns ------- @@ -316,7 +328,24 @@ def __call__( dataset_kwargs: MappingProxyType, ) -> None: ... - """Callback used in {func}`anndata.experimental.write_dispatched` to customize writing an element to a store.""" + """ + Callback used in :func:`anndata.experimental.write_dispatched` to customize writing an element to a store. + + Params + ------ + write_func + :func:`anndata.experimental.write_elem` function to call to read the current element given the :param:`iospec`. + store + The store to which `elem` should be written. + elem_name + The key to read in from the group. + elem + The element to write out. + iospec + Internal AnnData encoding specification for the element. + dataset_kwargs + Keyword arguments to be passed to a library-level io function, like `chunks` for :mod:`zarr`. + """ class Writer: From f65f0652cee9a130bd01f91bee97644424ecf09c Mon Sep 17 00:00:00 2001 From: Philipp A Date: Tue, 9 Jul 2024 08:40:29 +0200 Subject: [PATCH 092/348] Discard changes to pyproject.toml --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f6c3b8b09..03a409c7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,6 @@ doc = [ "sphinx-autodoc-typehints>=1.11.0", "sphinx-issues", "sphinx-copybutton", - "sphinx-toolbox", "sphinxext.opengraph", "nbsphinx", "scanpydoc[theme,typehints] >=0.13.4", @@ -185,6 +184,9 @@ ignore = [ [tool.ruff.lint.isort] known-first-party = ["anndata"] required-imports = ["from __future__ import annotations"] +[tool.ruff.lint.flake8-type-checking] +exempt-modules = [] +strict = true [tool.codespell] skip = ".git,*.pdf,*.svg" From 8f6ea498a1fff13b547630411ce103764ea82979 Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Tue, 9 Jul 2024 08:47:29 +0200 Subject: [PATCH 093/348] re-add dep --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 03a409c7e..813ccee62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,7 @@ doc = [ "sphinx-autodoc-typehints>=1.11.0", "sphinx-issues", "sphinx-copybutton", + "sphinx-toolbox", "sphinxext.opengraph", "nbsphinx", "scanpydoc[theme,typehints] >=0.13.4", From 155a21e69c6ab69f8c1c9d616c20bb73ad5a3727 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 9 Jul 2024 10:31:55 +0200 Subject: [PATCH 094/348] Fix docs --- src/anndata/_io/specs/registry.py | 52 +++++++++++++++---------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 0b9834d2d..d3b26fb99 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -258,22 +258,13 @@ def _iter_patterns( class read_callback(Protocol): - def __call__( - self, - /, - read_func: Callable[[StorageType, Reader], InMemoryType], - elem_name: str, - elem: StorageType, - iospec: IOSpec, - ) -> InMemoryType: ... - """ Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. Params ------ read_func - :func:`anndata.experimental.read_elem` function to call to read the current element given the :param:`iospec`. + :func:`anndata.experimental.read_elem` function to call to read the current element given the ``iospec``. elem_name The key to read in from the group. elem @@ -286,6 +277,15 @@ def __call__( The element read from the store. """ + def __call__( + self, + /, + read_func: Callable[[StorageType, Reader], InMemoryType], + elem_name: str, + elem: StorageType, + iospec: IOSpec, + ) -> InMemoryType: ... + class Reader: def __init__( @@ -314,27 +314,13 @@ def read_elem( class write_callback(Protocol): - def __call__( - self, - /, - write_func: Callable[ - [GroupStorageType, str, InMemoryReadElem, Writer, MappingProxyType], None - ], - store: GroupStorageType, - elem_name: str, - elem: InMemoryReadElem, - *, - iospec: IOSpec, - dataset_kwargs: MappingProxyType, - ) -> None: ... - """ Callback used in :func:`anndata.experimental.write_dispatched` to customize writing an element to a store. Params ------ write_func - :func:`anndata.experimental.write_elem` function to call to read the current element given the :param:`iospec`. + :func:`anndata.experimental.write_elem` function to call to read the current element given the ``iospec``. store The store to which `elem` should be written. elem_name @@ -344,9 +330,23 @@ def __call__( iospec Internal AnnData encoding specification for the element. dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :mod:`zarr`. + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. """ + def __call__( + self, + /, + write_func: Callable[ + [GroupStorageType, str, InMemoryReadElem, Writer, MappingProxyType], None + ], + store: GroupStorageType, + elem_name: str, + elem: InMemoryReadElem, + *, + iospec: IOSpec, + dataset_kwargs: MappingProxyType, + ) -> None: ... + class Writer: def __init__(self, registry: IORegistry, callback: write_callback | None = None): From daae3e548e4e7fe1da06777fe260da403684f24d Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Tue, 9 Jul 2024 11:11:14 +0200 Subject: [PATCH 095/348] Almost works --- docs/api.md | 4 ++ docs/conf.py | 5 ++ pyproject.toml | 2 +- src/anndata/_io/specs/registry.py | 88 ++++++++++++++-------------- src/anndata/_types.py | 10 +++- src/anndata/experimental/__init__.py | 13 +++- 6 files changed, 73 insertions(+), 49 deletions(-) diff --git a/docs/api.md b/docs/api.md index 9eb57a0a9..c05efb71b 100644 --- a/docs/api.md +++ b/docs/api.md @@ -131,6 +131,10 @@ Utilities for customizing the IO process: experimental.read_dispatched experimental.write_dispatched experimental.IOSpec + experimental.InMemoryReadElem + experimental.InMemoryArrayOrScalarType + experimental.Reader + experimental.Writer experimental.read_callback experimental.write_callback experimental.StorageType diff --git a/docs/conf.py b/docs/conf.py index aba358a71..0f4d24f0a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -70,6 +70,11 @@ # Generate the API documentation when building autosummary_generate = True autodoc_member_order = "bysource" +autodoc_type_aliases = dict( + InMemoryReadElem="anndata.experimental.InMemoryReadElem", + InMemoryType="anndata.experimental.InMemoryArrayOrScalarType", + InMemoryArrayOrScalarType="anndata.experimental.InMemoryArrayOrScalarType", +) issues_github_path = "scverse/anndata" # autodoc_default_flags = ['members'] napoleon_google_docstring = False diff --git a/pyproject.toml b/pyproject.toml index 813ccee62..310f57fd2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,7 @@ dev = [ doc = [ "sphinx>=4.4", "sphinx-book-theme>=1.1.0", - "sphinx-autodoc-typehints>=1.11.0", + "sphinx-autodoc-typehints>=2.2.0", "sphinx-issues", "sphinx-copybutton", "sphinx-toolbox", diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index d3b26fb99..4c324a005 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from functools import singledispatch, wraps from types import MappingProxyType -from typing import TYPE_CHECKING, Protocol, TypeVar, Union +from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, Union import pandas as pd @@ -15,12 +15,12 @@ if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable - from typing import Any + from typing import Any, TypeAlias from anndata._core.storage import StorageType from anndata._types import GroupStorageType -InMemoryReadElem = Union[ +InMemoryReadElem: TypeAlias = Union[ dict[str, InMemoryArrayOrScalarType], InMemoryArrayOrScalarType, AnnData, @@ -257,26 +257,7 @@ def _iter_patterns( InMemoryType = TypeVar("InMemoryType", bound=InMemoryReadElem) -class read_callback(Protocol): - """ - Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. - - Params - ------ - read_func - :func:`anndata.experimental.read_elem` function to call to read the current element given the ``iospec``. - elem_name - The key to read in from the group. - elem - The element to read from. - iospec - Internal AnnData encoding specification for the element. - - Returns - ------- - The element read from the store. - """ - +class read_callback(Protocol, Generic[InMemoryType]): def __call__( self, /, @@ -284,7 +265,26 @@ def __call__( elem_name: str, elem: StorageType, iospec: IOSpec, - ) -> InMemoryType: ... + ) -> InMemoryType: + """ + Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. 
+ + Params + ------ + read_func + :func:`anndata.experimental.read_elem` function to call to read the current element given the ``iospec``. + elem_name + The key to read in from the group. + elem + The element to read from. + iospec + Internal AnnData encoding specification for the element. + + Returns + ------- + The element read from the store. + """ + ... class Reader: @@ -314,25 +314,6 @@ def read_elem( class write_callback(Protocol): - """ - Callback used in :func:`anndata.experimental.write_dispatched` to customize writing an element to a store. - - Params - ------ - write_func - :func:`anndata.experimental.write_elem` function to call to read the current element given the ``iospec``. - store - The store to which `elem` should be written. - elem_name - The key to read in from the group. - elem - The element to write out. - iospec - Internal AnnData encoding specification for the element. - dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. - """ - def __call__( self, /, @@ -345,7 +326,26 @@ def __call__( *, iospec: IOSpec, dataset_kwargs: MappingProxyType, - ) -> None: ... + ) -> None: + """ + Callback used in :func:`anndata.experimental.write_dispatched` to customize writing an element to a store. + + Params + ------ + write_func + :func:`anndata.experimental.write_elem` function to call to read the current element given the ``iospec``. + store + The store to which `elem` should be written. + elem_name + The key to read in from the group. + elem + The element to write out. + iospec + Internal AnnData encoding specification for the element. + dataset_kwargs + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. + """ + ... class Writer: diff --git a/src/anndata/_types.py b/src/anndata/_types.py index c08477e2b..5827f5b6b 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -4,10 +4,11 @@ from __future__ import annotations -from typing import Union +from typing import TYPE_CHECKING, Union import numpy as np import pandas as pd +from numpy.typing import NDArray from scipy import sparse from anndata._core.sparse_dataset import BaseCompressedSparseDataset @@ -24,14 +25,17 @@ ZarrGroup, ) +if TYPE_CHECKING: + from typing import TypeAlias + __all__ = [ "ArrayStorageType", "GroupStorageType", "StorageType", ] -InMemoryArrayOrScalarType = Union[ - np.typing.NDArray, +InMemoryArrayOrScalarType: TypeAlias = Union[ + NDArray, np.ma.MaskedArray, sparse.spmatrix, SpArray, diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index e042d5e96..af21c8e15 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -4,7 +4,14 @@ from anndata._io.specs import IOSpec, read_elem, write_elem from .._core.storage import StorageType -from .._io.specs.registry import read_callback, write_callback +from .._io.specs.registry import ( + InMemoryArrayOrScalarType, + InMemoryReadElem, + Reader, + Writer, + read_callback, + write_callback, +) from ._dispatch_io import read_dispatched, write_dispatched from .merge import concat_on_disk from .multi_files import AnnCollection @@ -22,6 +29,10 @@ "sparse_dataset", "CSRDataset", "CSCDataset", + "InMemoryReadElem", + "InMemoryArrayOrScalarType", + "Reader", + "Writer", "read_callback", "write_callback", "StorageType", From c415ae4f916f81d86c0109649d479d6ab730954f Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 9 Jul 2024 11:34:53 +0200 Subject: [PATCH 096/348] works! 
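
For reference, the callback protocol these docs now render can be used
like so — a minimal sketch, assuming a consolidated zarr store at a
hypothetical path, with a hypothetical rule that skips X:

    import zarr

    from anndata.experimental import read_dispatched

    def callback(read_func, elem_name, elem, iospec):
        # Skip X here (e.g. to attach it lazily later); defer every
        # other element to the default reader.
        if elem_name.endswith("/X"):
            return None
        return read_func(elem)

    z = zarr.open("example.zarr")  # hypothetical store
    adata = read_dispatched(z, callback=callback)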
--- docs/conf.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 0f4d24f0a..48a9ee4a9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -70,11 +70,6 @@ # Generate the API documentation when building autosummary_generate = True autodoc_member_order = "bysource" -autodoc_type_aliases = dict( - InMemoryReadElem="anndata.experimental.InMemoryReadElem", - InMemoryType="anndata.experimental.InMemoryArrayOrScalarType", - InMemoryArrayOrScalarType="anndata.experimental.InMemoryArrayOrScalarType", -) issues_github_path = "scverse/anndata" # autodoc_default_flags = ['members'] napoleon_google_docstring = False @@ -108,6 +103,7 @@ ("py:class", "anndata.compat.CupySparseMatrix"), ("py:class", "awkward.highlevel.Array"), ("py:class", "anndata._core.sparse_dataset.BaseCompressedSparseDataset"), + ("py:obj", "numpy._typing._array_like._ScalarType_co"), ] suppress_warnings = [ "ref.citation", @@ -140,6 +136,12 @@ def setup(app: Sphinx): "anndata._io.specs.registry.read_callback": "anndata.experimental.read_callback", "anndata._io.specs.registry.write_callback": "anndata.experimental.write_callback", } +autodoc_type_aliases = dict( + NDArray=":data:`~numpy.typing.NDArray`", + InMemoryReadElem=":data:`~anndata.experimental.InMemoryReadElem`", + InMemoryType=":data:`~anndata.experimental.InMemoryArrayOrScalarType`", + InMemoryArrayOrScalarType=":data:`~anndata.experimental.InMemoryArrayOrScalarType`", +) # -- Social cards --------------------------------------------------------- From 00010b8b0dc09a249554f9ef241df35041413af0 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 11:45:19 +0200 Subject: [PATCH 097/348] (chore): use pascal-case --- docs/conf.py | 4 +-- src/anndata/_io/specs/registry.py | 8 +++--- src/anndata/experimental/__init__.py | 8 +++--- src/anndata/experimental/_dispatch_io.py | 33 ++++-------------------- 4 files changed, 15 insertions(+), 38 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 48a9ee4a9..e018d0602 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -133,8 +133,8 @@ def setup(app: Sphinx): "h5py._hl.files.File": "h5py.File", "h5py._hl.dataset.Dataset": "h5py.Dataset", "anndata._core.anndata.AnnData": "anndata.AnnData", - "anndata._io.specs.registry.read_callback": "anndata.experimental.read_callback", - "anndata._io.specs.registry.write_callback": "anndata.experimental.write_callback", + "anndata._io.specs.registry.ReadCallback": "anndata.experimental.ReadCallback", + "anndata._io.specs.registry.WriteCallback": "anndata.experimental.WriteCallback", } autodoc_type_aliases = dict( NDArray=":data:`~numpy.typing.NDArray`", diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 4c324a005..d4ef3b91c 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -257,7 +257,7 @@ def _iter_patterns( InMemoryType = TypeVar("InMemoryType", bound=InMemoryReadElem) -class read_callback(Protocol, Generic[InMemoryType]): +class ReadCallback(Protocol, Generic[InMemoryType]): def __call__( self, /, @@ -289,7 +289,7 @@ def __call__( class Reader: def __init__( - self, registry: IORegistry, callback: read_callback | None = None + self, registry: IORegistry, callback: ReadCallback | None = None ) -> None: self.registry = registry self.callback = callback @@ -313,7 +313,7 @@ def read_elem( return self.callback(read_func, elem.name, elem, iospec=iospec) -class write_callback(Protocol): +class WriteCallback(Protocol): def __call__( self, /, @@ -349,7 +349,7 
@@ def __call__( class Writer: - def __init__(self, registry: IORegistry, callback: write_callback | None = None): + def __init__(self, registry: IORegistry, callback: WriteCallback | None = None): self.registry = registry self.callback = callback diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index af21c8e15..4b2101ffe 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -7,10 +7,10 @@ from .._io.specs.registry import ( InMemoryArrayOrScalarType, InMemoryReadElem, + ReadCallback, Reader, + WriteCallback, Writer, - read_callback, - write_callback, ) from ._dispatch_io import read_dispatched, write_dispatched from .merge import concat_on_disk @@ -33,7 +33,7 @@ "InMemoryArrayOrScalarType", "Reader", "Writer", - "read_callback", - "write_callback", + "ReadCallback", + "WriteCallback", "StorageType", ] diff --git a/src/anndata/experimental/_dispatch_io.py b/src/anndata/experimental/_dispatch_io.py index 549ca85c4..b48c09bb8 100644 --- a/src/anndata/experimental/_dispatch_io.py +++ b/src/anndata/experimental/_dispatch_io.py @@ -7,13 +7,13 @@ from collections.abc import Mapping from typing import Any - from anndata._io.specs.registry import read_callback, write_callback + from anndata._io.specs.registry import ReadCallback, WriteCallback from anndata._types import GroupStorageType, StorageType def read_dispatched( elem: StorageType, - callback: read_callback, + callback: ReadCallback, ) -> Any: """ Read elem, calling the callback at each sub-element. @@ -24,19 +24,7 @@ def read_dispatched( Storage container (e.g. `h5py.Group`, `zarr.Group`). This must have anndata element specifications. callback - Function to call at each anndata encoded element. See details below for - signature. - - - The callback has the following signature: - - * `read_func` (`Callable`): A callable which takes the encoded element and returns it's decoded value. - This is the default decoding function, and what to call if you don't want to modify the decoding. - It will call this callback again at the next element encoding it sees. - * `key` (`str`): They absolute key of the element in the store. This will be an absolute key. - * `elem` (`StorageType`): The encoded element. - * `iospec` (`IOSpec`): The specification of the element. This is passed as a keyword argument. - + Function to call at each anndata encoded element. See Also -------- @@ -53,7 +41,7 @@ def write_dispatched( store: GroupStorageType, key: str, elem: Any, - callback: write_callback, + callback: WriteCallback, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> None: @@ -69,22 +57,11 @@ def write_dispatched( elem The element to write. Probably an AnnData. callback - Function called when writing each element. See below for signature. + Function called when writing each element. dataset_kwargs Keyword arguments to pass to the dataset creation function. - The callback has the following signature: - - * `write_func` (`Callable`): A callable which takes the in memory element and writes it to the store. - This is the default encoding function, and what to call if you don't want to change behaviour at this level. - * `store` (`GroupStorageType`): The store to write to. - * `key` (`str`): The key to write elem into store at. This will be an absolute key. - * `elem` (`Any`): The element to write. - * `dataset_kwargs` (`dict`): Keyword arguments to pass to the dataset creation function. This is passed as a keyword argument. 
- * `iospec` (`IOSpec`): The specification of the element. This is passed as a keyword argument. - - See Also -------- From 0bd87fcf23c16189648add7f638ae39d0e47b357 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 11:45:33 +0200 Subject: [PATCH 098/348] (feat): type read/write funcs in callback --- docs/api.md | 6 ++++-- src/anndata/_io/specs/registry.py | 23 ++++++++++++++++------- src/anndata/experimental/__init__.py | 4 ++++ 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/docs/api.md b/docs/api.md index c05efb71b..b0a1dfc61 100644 --- a/docs/api.md +++ b/docs/api.md @@ -134,9 +134,11 @@ Utilities for customizing the IO process: experimental.InMemoryReadElem experimental.InMemoryArrayOrScalarType experimental.Reader + experimental.Read experimental.Writer - experimental.read_callback - experimental.write_callback + experimental.Write + experimental.ReadCallback + experimental.WriteCallback experimental.StorageType ``` diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index d4ef3b91c..0fc360715 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -78,7 +78,7 @@ def wrapper(g: GroupStorageType, k: str, *args, **kwargs): return decorator -class reader(Protocol): +class Read(Protocol): def __call__( self, elem: StorageType, @@ -86,12 +86,23 @@ def __call__( ) -> InMemoryReadElem: ... +class Write(Protocol): + def __call__( + self, + f: GroupStorageType, + k: str, + v: InMemoryReadElem, + _writer: Writer, + dataset_kwargs: MappingProxyType, + ) -> None: ... + + class IORegistry: def __init__(self): - self.read: dict[tuple[type, IOSpec, frozenset[str]], reader] = {} + self.read: dict[tuple[type, IOSpec, frozenset[str]], Read] = {} self.read_partial: dict[tuple[type, IOSpec, frozenset[str]], Callable] = {} self.write: dict[ - tuple[type, type | tuple[type, str], frozenset[str]], Callable + tuple[type, type | tuple[type, str], frozenset[str]], Write ] = {} self.write_specs: dict[type | tuple[type, str], IOSpec] = {} @@ -261,7 +272,7 @@ class ReadCallback(Protocol, Generic[InMemoryType]): def __call__( self, /, - read_func: Callable[[StorageType, Reader], InMemoryType], + read_func: Read, elem_name: str, elem: StorageType, iospec: IOSpec, @@ -317,9 +328,7 @@ class WriteCallback(Protocol): def __call__( self, /, - write_func: Callable[ - [GroupStorageType, str, InMemoryReadElem, Writer, MappingProxyType], None - ], + write_func: Write, store: GroupStorageType, elem_name: str, elem: InMemoryReadElem, diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 4b2101ffe..78726490e 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -7,8 +7,10 @@ from .._io.specs.registry import ( InMemoryArrayOrScalarType, InMemoryReadElem, + Read, ReadCallback, Reader, + Write, WriteCallback, Writer, ) @@ -32,7 +34,9 @@ "InMemoryReadElem", "InMemoryArrayOrScalarType", "Reader", + "Read", "Writer", + "Write", "ReadCallback", "WriteCallback", "StorageType", From 5997678dc2776d1e24a65b9580356f339f10598b Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 11:52:02 +0200 Subject: [PATCH 099/348] (fix): use generic for `Read` as well. 
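
For illustration, a concrete reader that satisfies the now-generic
protocol — a sketch only; `read_dense` is hypothetical, not one of the
registered read functions:

    import numpy as np

    from anndata.experimental import Read

    def read_dense(elem, _reader) -> np.ndarray:
        # Matches Read[np.ndarray]: storage element in, decoded array out.
        return elem[()]

    reader: Read[np.ndarray] = read_dense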
--- src/anndata/_io/specs/registry.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 0fc360715..c6068f65f 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -78,7 +78,10 @@ def wrapper(g: GroupStorageType, k: str, *args, **kwargs): return decorator -class Read(Protocol): +InMemoryType = TypeVar("InMemoryType", bound=InMemoryReadElem, covariant=True) + + +class Read(Protocol, Generic[InMemoryType]): def __call__( self, elem: StorageType, @@ -265,14 +268,11 @@ def _iter_patterns( yield t -InMemoryType = TypeVar("InMemoryType", bound=InMemoryReadElem) - - class ReadCallback(Protocol, Generic[InMemoryType]): def __call__( self, /, - read_func: Read, + read_func: Read[InMemoryType], elem_name: str, elem: StorageType, iospec: IOSpec, From f20833201c7197c0e311e489027071bbad5aa7b1 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 11:55:44 +0200 Subject: [PATCH 100/348] (fix): need more aliases --- docs/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index e018d0602..f5e54f8ea 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -135,6 +135,8 @@ def setup(app: Sphinx): "anndata._core.anndata.AnnData": "anndata.AnnData", "anndata._io.specs.registry.ReadCallback": "anndata.experimental.ReadCallback", "anndata._io.specs.registry.WriteCallback": "anndata.experimental.WriteCallback", + "anndata._io.specs.registry.Read": "anndata.experimental.Read", + "anndata._io.specs.registry.Write": "anndata.experimental.Write", } autodoc_type_aliases = dict( NDArray=":data:`~numpy.typing.NDArray`", From eb69fcba70d8d2816350bc20141c5c1e6237c9a2 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 9 Jul 2024 13:07:42 +0200 Subject: [PATCH 101/348] Split table, format --- docs/api.md | 13 +++++++++++-- src/anndata/experimental/_dispatch_io.py | 8 +++----- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/docs/api.md b/docs/api.md index b0a1dfc61..a6d92211c 100644 --- a/docs/api.md +++ b/docs/api.md @@ -82,7 +82,8 @@ Writing to other formats. API's in the experimental module are currently in development and subject to change at any time. ``` -Two classes for working with batched access to collections of many `AnnData` objects or `h5ad` files. In paritcular, for pytorch-based models. +Two classes for working with batched access to collections of many `AnnData` objects or `h5ad` files. +In particular, for pytorch-based models. ```{eval-rst} .. autosummary:: @@ -112,7 +113,7 @@ Out of core concatenation experimental.concat_on_disk ``` -Low level methods for reading and writing elements of an `` AnnData` `` object to a store: +Low level methods for reading and writing elements of an `AnnData` object to a store: ```{eval-rst} .. autosummary:: @@ -130,6 +131,14 @@ Utilities for customizing the IO process: experimental.read_dispatched experimental.write_dispatched +``` + +Types used by the former: + +```{eval-rst} +.. autosummary:: + :toctree: generated/ + experimental.IOSpec experimental.InMemoryReadElem experimental.InMemoryArrayOrScalarType diff --git a/src/anndata/experimental/_dispatch_io.py b/src/anndata/experimental/_dispatch_io.py index b48c09bb8..22b684cf5 100644 --- a/src/anndata/experimental/_dispatch_io.py +++ b/src/anndata/experimental/_dispatch_io.py @@ -21,13 +21,13 @@ def read_dispatched( Params ------ elem - Storage container (e.g. `h5py.Group`, `zarr.Group`). 
This must have anndata - element specifications. + Storage container (e.g. `h5py.Group`, `zarr.Group`). + This must have anndata element specifications. callback Function to call at each anndata encoded element. + See Also -------- - :doc:`/tutorials/notebooks/{read,write}_dispatched` """ from anndata._io.specs import _REGISTRY, Reader @@ -61,10 +61,8 @@ def write_dispatched( dataset_kwargs Keyword arguments to pass to the dataset creation function. - See Also -------- - :doc:`/tutorials/notebooks/{read,write}_dispatched` """ from anndata._io.specs import _REGISTRY, Writer From 477bbefc50507cbc9c62cef2f49f6b1d9f98fbbd Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 13:52:36 +0200 Subject: [PATCH 102/348] (refactor): move to `_types` file --- docs/conf.py | 8 +- src/anndata/_io/specs/registry.py | 100 +++-------------- src/anndata/_types.py | 131 ++++++++++++++++++++++- src/anndata/experimental/__init__.py | 6 +- src/anndata/experimental/_dispatch_io.py | 8 +- 5 files changed, 159 insertions(+), 94 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index f5e54f8ea..96d94fa58 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -133,10 +133,10 @@ def setup(app: Sphinx): "h5py._hl.files.File": "h5py.File", "h5py._hl.dataset.Dataset": "h5py.Dataset", "anndata._core.anndata.AnnData": "anndata.AnnData", - "anndata._io.specs.registry.ReadCallback": "anndata.experimental.ReadCallback", - "anndata._io.specs.registry.WriteCallback": "anndata.experimental.WriteCallback", - "anndata._io.specs.registry.Read": "anndata.experimental.Read", - "anndata._io.specs.registry.Write": "anndata.experimental.Write", + "anndata._types.ReadCallback": "anndata.experimental.ReadCallback", + "anndata._types.WriteCallback": "anndata.experimental.WriteCallback", + "anndata._types.Read": "anndata.experimental.Read", + "anndata._types.Write": "anndata.experimental.Write", } autodoc_type_aliases = dict( NDArray=":data:`~numpy.typing.NDArray`", diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index c6068f65f..fbebcfc06 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from functools import singledispatch, wraps from types import MappingProxyType -from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, Union +from typing import TYPE_CHECKING, Union import pandas as pd @@ -18,7 +18,13 @@ from typing import Any, TypeAlias from anndata._core.storage import StorageType - from anndata._types import GroupStorageType + from anndata._types import ( + GroupStorageType, + Read, + ReadCallback, + Write, + WriteCallback, + ) InMemoryReadElem: TypeAlias = Union[ dict[str, InMemoryArrayOrScalarType], @@ -78,28 +84,6 @@ def wrapper(g: GroupStorageType, k: str, *args, **kwargs): return decorator -InMemoryType = TypeVar("InMemoryType", bound=InMemoryReadElem, covariant=True) - - -class Read(Protocol, Generic[InMemoryType]): - def __call__( - self, - elem: StorageType, - _reader: Reader, - ) -> InMemoryReadElem: ... - - -class Write(Protocol): - def __call__( - self, - f: GroupStorageType, - k: str, - v: InMemoryReadElem, - _writer: Writer, - dataset_kwargs: MappingProxyType, - ) -> None: ... 
- - class IORegistry: def __init__(self): self.read: dict[tuple[type, IOSpec, frozenset[str]], Read] = {} @@ -268,36 +252,6 @@ def _iter_patterns( yield t -class ReadCallback(Protocol, Generic[InMemoryType]): - def __call__( - self, - /, - read_func: Read[InMemoryType], - elem_name: str, - elem: StorageType, - iospec: IOSpec, - ) -> InMemoryType: - """ - Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. - - Params - ------ - read_func - :func:`anndata.experimental.read_elem` function to call to read the current element given the ``iospec``. - elem_name - The key to read in from the group. - elem - The element to read from. - iospec - Internal AnnData encoding specification for the element. - - Returns - ------- - The element read from the store. - """ - ... - - class Reader: def __init__( self, registry: IORegistry, callback: ReadCallback | None = None @@ -324,37 +278,13 @@ def read_elem( return self.callback(read_func, elem.name, elem, iospec=iospec) -class WriteCallback(Protocol): - def __call__( - self, - /, - write_func: Write, - store: GroupStorageType, - elem_name: str, - elem: InMemoryReadElem, - *, - iospec: IOSpec, - dataset_kwargs: MappingProxyType, - ) -> None: - """ - Callback used in :func:`anndata.experimental.write_dispatched` to customize writing an element to a store. - - Params - ------ - write_func - :func:`anndata.experimental.write_elem` function to call to read the current element given the ``iospec``. - store - The store to which `elem` should be written. - elem_name - The key to read in from the group. - elem - The element to write out. - iospec - Internal AnnData encoding specification for the element. - dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. - """ - ... +InMemoryReadElem: TypeAlias = Union[ + dict[str, InMemoryArrayOrScalarType], + InMemoryArrayOrScalarType, + AnnData, + pd.Categorical, + pd.api.extensions.ExtensionArray, +] class Writer: diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 5827f5b6b..1cfaedbdf 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING, Protocol, TypeVar, Union import numpy as np import pandas as pd @@ -26,8 +26,11 @@ ) if TYPE_CHECKING: + from types import MappingProxyType from typing import TypeAlias + from anndata._io.specs.registry import IOSpec, Reader, Writer + __all__ = [ "ArrayStorageType", "GroupStorageType", @@ -55,3 +58,129 @@ ArrayStorageType = Union[ZarrArray, H5Array] GroupStorageType = Union[ZarrGroup, H5Group] StorageType = Union[ArrayStorageType, GroupStorageType] + +ContravariantInMemoryType = TypeVar( + "ContravariantInMemoryType", + bound="InMemoryReadElem", # noqa: F821 + contravariant=True, +) +CovariantInMemoryType = TypeVar( + "CovariantInMemoryType", + bound="InMemoryReadElem", # noqa: F821 + covariant=True, +) +InvariantInMemoryType = TypeVar("InvariantInMemoryType", bound="InMemoryReadElem") # noqa: F821 + + +class Read(Protocol[CovariantInMemoryType]): + def __call__( + self, + elem: StorageType, + _reader: Reader, + ) -> CovariantInMemoryType: + """Low-level reading function for an element. + + Parameters + ---------- + elem + The element to read from. + _reader + The :class:`anndata.experimental.Reader` instance. + + Returns + ------- + The element read from the store. + """ + + ... 
+
+
+class Write(Protocol[ContravariantInMemoryType]):
+ def __call__(
+ self,
+ f: GroupStorageType,
+ k: str,
+ v: ContravariantInMemoryType,
+ _writer: Writer,
+ dataset_kwargs: MappingProxyType,
+ ) -> None:
+ """Low-level writing function for an element.
+
+ Parameters
+ ----------
+ f
+ The store to which `v` should be written.
+ k
+ The key to write to within the group.
+ v
+ The element to write out.
+ _writer
+ The :class:`anndata.experimental.Writer` instance.
+ dataset_kwargs
+ Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`.
+ """
+ ...
+
+
+class ReadCallback(Protocol[InvariantInMemoryType]):
+ def __call__(
+ self,
+ /,
+ read_func: Read[InvariantInMemoryType],
+ elem_name: str,
+ elem: StorageType,
+ iospec: IOSpec,
+ ) -> InvariantInMemoryType:
+ """
+ Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store.
+
+ Params
+ ------
+ read_func
+ :func:`anndata.experimental.read_elem` function to call to read the current element given the ``iospec``.
+ elem_name
+ The key to read in from the group.
+ elem
+ The element to read from.
+ iospec
+ Internal AnnData encoding specification for the element.
+
+ Returns
+ -------
+ The element read from the store.
+ """
+ ...
+
+
+class WriteCallback(Protocol[InvariantInMemoryType]):
+ def __call__(
+ self,
+ /,
+ write_func: Write[InvariantInMemoryType],
+ store: GroupStorageType,
+ elem_name: str,
+ elem: InvariantInMemoryType,
+ *,
+ iospec: IOSpec,
+ dataset_kwargs: MappingProxyType,
+ ) -> None:
+ """
+ Callback used in :func:`anndata.experimental.write_dispatched` to customize writing an element to a store.
+
+ Params
+ ------
+ write_func
+ :func:`anndata.experimental.write_elem` function to call to write the current element given the ``iospec``.
+ store
+ The store to which `elem` should be written.
+ elem_name
+ The key to write to within the group.
+ elem
+ The element to write out.
+ iospec
+ Internal AnnData encoding specification for the element.
+ dataset_kwargs
+ Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`.
+ """
+ ... diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 78726490e..9e3f91191 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -7,12 +7,14 @@ from .._io.specs.registry import ( InMemoryArrayOrScalarType, InMemoryReadElem, + Reader, + Writer, +) +from .._types import ( Read, ReadCallback, - Reader, Write, WriteCallback, - Writer, ) from ._dispatch_io import read_dispatched, write_dispatched from .merge import concat_on_disk diff --git a/src/anndata/experimental/_dispatch_io.py b/src/anndata/experimental/_dispatch_io.py index 549ca85c4..789f158af 100644 --- a/src/anndata/experimental/_dispatch_io.py +++ b/src/anndata/experimental/_dispatch_io.py @@ -7,8 +7,12 @@ from collections.abc import Mapping from typing import Any - from anndata._io.specs.registry import ReadCallback, WriteCallback - from anndata._types import GroupStorageType, StorageType + from anndata._types import ( + GroupStorageType, + ReadCallback, + StorageType, + WriteCallback, + ) From 8d23f6f443a21895474d888726f29fa193d6f0d9 Mon Sep 17 00:00:00 2001 From: "Philipp A."
Date: Tue, 9 Jul 2024 14:45:56 +0200 Subject: [PATCH 103/348] bump scanpydoc --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 310f57fd2..17898ec55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ doc = [ "sphinx-toolbox", "sphinxext.opengraph", "nbsphinx", - "scanpydoc[theme,typehints] >=0.13.4", + "scanpydoc[theme,typehints] >=0.13.5", "zarr", "awkward>=2.0.7", "IPython", # For syntax highlighting in notebooks From 9b647c2838b7722b831e2dc3aa1b6148678950f6 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 9 Jul 2024 15:18:59 +0200 Subject: [PATCH 104/348] Some basic syntax fixes --- src/anndata/_io/specs/registry.py | 9 --------- src/anndata/_types.py | 12 +++++------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index fbebcfc06..f61fd9ee3 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -278,15 +278,6 @@ def read_elem( return self.callback(read_func, elem.name, elem, iospec=iospec) -InMemoryReadElem: TypeAlias = Union[ - dict[str, InMemoryArrayOrScalarType], - InMemoryArrayOrScalarType, - AnnData, - pd.Categorical, - pd.api.extensions.ExtensionArray, -] - - class Writer: def __init__(self, registry: IORegistry, callback: WriteCallback | None = None): self.registry = registry diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 1cfaedbdf..0ddce2e5d 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -11,8 +11,8 @@ from numpy.typing import NDArray from scipy import sparse -from anndata._core.sparse_dataset import BaseCompressedSparseDataset -from anndata.compat import ( +from ._core.sparse_dataset import BaseCompressedSparseDataset +from .compat import ( AwkArray, CupyArray, CupySparseMatrix, @@ -29,7 +29,7 @@ from types import MappingProxyType from typing import TypeAlias - from anndata._io.specs.registry import IOSpec, Reader, Writer + from ._io.specs.registry import IOSpec, Reader, Writer __all__ = [ "ArrayStorageType", @@ -91,8 +91,7 @@ def __call__( ------- The element read from the store. """ - - ... + ... class Write(Protocol[ContravariantInMemoryType]): @@ -119,8 +118,7 @@ def __call__( dataset_kwargs Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. """ - - ... + ... class ReadCallback(Protocol[InvariantInMemoryType]): From 5ef93e1b3fa5edab43830d827f6f244cc2addaf2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 15:52:34 +0200 Subject: [PATCH 105/348] (fix): change `Read{Callback}` type for kwargs --- src/anndata/_types.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 0ddce2e5d..1f97673cf 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -77,6 +77,8 @@ def __call__( self, elem: StorageType, _reader: Reader, + *, + dataset_kwargs: MappingProxyType, ) -> CovariantInMemoryType: """Low-level reading function for an element. @@ -86,6 +88,8 @@ def __call__( The element to read from. _reader The :class:`anndata.experimental.Reader` instance. + dataset_kwargs + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask.from_zarr`. 
Returns ------- @@ -129,6 +133,8 @@ def __call__( elem_name: str, elem: StorageType, iospec: IOSpec, + *, + dataset_kwargs: MappingProxyType, ) -> InvariantInMemoryType: """ Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. @@ -143,6 +149,8 @@ def __call__( The element to read from. iospec Internal AnnData encoding specification for the element. + dataset_kwargs + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask.from_zarr`. Returns ------- From 9cfe9086ce6a97e9d6cf7833b9e496728881e00a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 15:55:52 +0200 Subject: [PATCH 106/348] (chore): test `chunks `argument --- tests/test_io_elementwise.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index 1d7d01241..31149a1ae 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -282,6 +282,7 @@ def test_read_lazy_h5_chunk_kwargs(arr_type, chunks, tmp_path): else: arr_store = create_sparse_store(arr_type, store) X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) + assert X_dask_from_disk.chunksize == chunks X_from_disk = read_elem(arr_store["X"]) file.close() with ( From 99fc6db05a6b075b0c01760ea9d8e6f1f8e9ec35 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 15:57:32 +0200 Subject: [PATCH 107/348] (fix): type `read_recarray` --- src/anndata/_io/specs/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 581d5d90a..9bc882c57 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -527,7 +527,7 @@ def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray: @_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0")) -def read_recarray(d, _reader) -> np.recarray | npt.NDArray: +def read_recarray(d: StorageType, _reader: Reader) -> np.recarray | npt.NDArray: value = d[()] dtype = value.dtype value = _from_fixed_length_strings(value) From b5bccc3818e9bed4f610bf31a367b79ee3a3531e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 15:59:50 +0200 Subject: [PATCH 108/348] (fix): `GroupyStorageType` not `StorageType` --- src/anndata/_io/specs/methods.py | 33 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 9bc882c57..a3e6662c4 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -41,7 +41,6 @@ from numpy import typing as npt - from anndata._core.storage import StorageType from anndata._types import GroupStorageType, InMemoryArrayOrScalarType from anndata.compat import SpArray @@ -119,7 +118,7 @@ def wrapper( @_REGISTRY.register_read(H5Group, IOSpec("", "")) @_REGISTRY.register_read(H5Array, IOSpec("", "")) def read_basic( - elem: StorageType, _reader: Reader + elem: GroupStorageType, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import h5ad @@ -141,7 +140,7 @@ def read_basic( @_REGISTRY.register_read(ZarrGroup, IOSpec("", "")) @_REGISTRY.register_read(ZarrArray, IOSpec("", "")) def read_basic_zarr( - elem: StorageType, _reader: Reader + elem: GroupStorageType, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | 
SpArray: from anndata._io import zarr @@ -284,7 +283,7 @@ def write_anndata( @_REGISTRY.register_read(H5File, IOSpec("raw", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("raw", "0.1.0")) -def read_anndata(elem: StorageType, _reader: Reader) -> AnnData: +def read_anndata(elem: GroupStorageType, _reader: Reader) -> AnnData: d = {} for k in [ "X", @@ -326,7 +325,7 @@ def write_raw( @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) def read_mapping( - elem: StorageType, _reader: Reader + elem: GroupStorageType, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType]: return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -431,7 +430,7 @@ def write_basic_dask_h5( @_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) -def read_array(elem: StorageType, _reader: Reader) -> npt.NDArray: +def read_array(elem: GroupStorageType, _reader: Reader) -> npt.NDArray: return elem[()] @@ -527,7 +526,7 @@ def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray: @_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0")) -def read_recarray(d: StorageType, _reader: Reader) -> np.recarray | npt.NDArray: +def read_recarray(d: GroupStorageType, _reader: Reader) -> np.recarray | npt.NDArray: value = d[()] dtype = value.dtype value = _from_fixed_length_strings(value) @@ -711,7 +710,7 @@ def chunk_slice(start: int, stop: int) -> tuple[slice | None, slice | None]: @_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) -def read_sparse(elem: StorageType, _reader: Reader) -> sparse.spmatrix | SpArray: +def read_sparse(elem: GroupStorageType, _reader: Reader) -> sparse.spmatrix | SpArray: return sparse_dataset(elem).to_memory() @@ -755,7 +754,7 @@ def write_awkward( @_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0")) -def read_awkward(elem: StorageType, _reader: Reader) -> AwkArray: +def read_awkward(elem: GroupStorageType, _reader: Reader) -> AwkArray: from anndata.compat import awkward as ak form = _read_attr(elem.attrs, "form") @@ -823,7 +822,7 @@ def write_dataframe( @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) -def read_dataframe(elem: StorageType, _reader: Reader) -> pd.DataFrame: +def read_dataframe(elem: GroupStorageType, _reader: Reader) -> pd.DataFrame: columns = list(_read_attr(elem.attrs, "column-order")) idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -864,7 +863,7 @@ def read_dataframe_partial( @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.1.0")) -def read_dataframe_0_1_0(elem: StorageType, _reader: Reader) -> pd.DataFrame: +def read_dataframe_0_1_0(elem: GroupStorageType, _reader: Reader) -> pd.DataFrame: columns = _read_attr(elem.attrs, "column-order") idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -934,7 +933,7 @@ def write_categorical( @_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) 
@_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) -def read_categorical(elem: StorageType, _reader: Reader) -> pd.Categorical: +def read_categorical(elem: GroupStorageType, _reader: Reader) -> pd.Categorical: return pd.Categorical.from_codes( codes=_reader.read_elem(elem["codes"]), categories=_reader.read_elem(elem["categories"]), @@ -985,7 +984,7 @@ def write_nullable_integer( @_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0")) def read_nullable_integer( - elem: StorageType, _reader: Reader + elem: GroupStorageType, _reader: Reader ) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.IntegerArray( @@ -998,7 +997,7 @@ def read_nullable_integer( @_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0")) def read_nullable_boolean( - elem: StorageType, _reader: Reader + elem: GroupStorageType, _reader: Reader ) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.BooleanArray( @@ -1015,7 +1014,7 @@ def read_nullable_boolean( @_REGISTRY.register_read(H5Array, IOSpec("numeric-scalar", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("numeric-scalar", "0.2.0")) -def read_scalar(elem: StorageType, _reader: Reader) -> np.number: +def read_scalar(elem: GroupStorageType, _reader: Reader) -> np.number: return elem[()] @@ -1060,12 +1059,12 @@ def write_hdf5_scalar( @_REGISTRY.register_read(H5Array, IOSpec("string", "0.2.0")) -def read_hdf5_string(elem: StorageType, _reader: Reader) -> str: +def read_hdf5_string(elem: GroupStorageType, _reader: Reader) -> str: return elem.asstr()[()] @_REGISTRY.register_read(ZarrArray, IOSpec("string", "0.2.0")) -def read_zarr_string(elem: StorageType, _reader: Reader) -> str: +def read_zarr_string(elem: GroupStorageType, _reader: Reader) -> str: return str(elem[()]) From e5ea2b0520ead9675df7ca5c35ff6157f1fcee14 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 16:04:47 +0200 Subject: [PATCH 109/348] (fix): little type fixes --- src/anndata/_io/specs/methods.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index a3e6662c4..6f58093a8 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -41,7 +41,11 @@ from numpy import typing as npt - from anndata._types import GroupStorageType, InMemoryArrayOrScalarType + from anndata._types import ( + ArrayStorageType, + GroupStorageType, + InMemoryArrayOrScalarType, + ) from anndata.compat import SpArray from .registry import Reader, Writer @@ -118,7 +122,7 @@ def wrapper( @_REGISTRY.register_read(H5Group, IOSpec("", "")) @_REGISTRY.register_read(H5Array, IOSpec("", "")) def read_basic( - elem: GroupStorageType, _reader: Reader + elem: H5File | H5Group | H5Array, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import h5ad @@ -140,7 +144,7 @@ def read_basic( @_REGISTRY.register_read(ZarrGroup, IOSpec("", "")) @_REGISTRY.register_read(ZarrArray, IOSpec("", "")) def read_basic_zarr( - elem: GroupStorageType, _reader: Reader + elem: ZarrGroup | ZarrArray, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import zarr @@ -283,7 +287,7 @@ def write_anndata( @_REGISTRY.register_read(H5File, IOSpec("raw", 
"0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("raw", "0.1.0")) -def read_anndata(elem: GroupStorageType, _reader: Reader) -> AnnData: +def read_anndata(elem: GroupStorageType | H5File, _reader: Reader) -> AnnData: d = {} for k in [ "X", @@ -430,7 +434,7 @@ def write_basic_dask_h5( @_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) -def read_array(elem: GroupStorageType, _reader: Reader) -> npt.NDArray: +def read_array(elem: ArrayStorageType, _reader: Reader) -> npt.NDArray: return elem[()] @@ -447,7 +451,7 @@ def read_zarr_array_partial(elem, *, items=None, indices=(slice(None, None))): # arrays of strings @_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) -def read_string_array(d, _reader): +def read_string_array(d: H5Array, _reader: Reader): return read_array(d.asstr(), _reader=_reader) @@ -526,7 +530,7 @@ def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray: @_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0")) -def read_recarray(d: GroupStorageType, _reader: Reader) -> np.recarray | npt.NDArray: +def read_recarray(d: ArrayStorageType, _reader: Reader) -> np.recarray | npt.NDArray: value = d[()] dtype = value.dtype value = _from_fixed_length_strings(value) @@ -1014,7 +1018,7 @@ def read_nullable_boolean( @_REGISTRY.register_read(H5Array, IOSpec("numeric-scalar", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("numeric-scalar", "0.2.0")) -def read_scalar(elem: GroupStorageType, _reader: Reader) -> np.number: +def read_scalar(elem: ArrayStorageType, _reader: Reader) -> np.number: return elem[()] @@ -1059,12 +1063,12 @@ def write_hdf5_scalar( @_REGISTRY.register_read(H5Array, IOSpec("string", "0.2.0")) -def read_hdf5_string(elem: GroupStorageType, _reader: Reader) -> str: +def read_hdf5_string(elem: H5Array, _reader: Reader) -> str: return elem.asstr()[()] @_REGISTRY.register_read(ZarrArray, IOSpec("string", "0.2.0")) -def read_zarr_string(elem: GroupStorageType, _reader: Reader) -> str: +def read_zarr_string(elem: ZarrArray, _reader: Reader) -> str: return str(elem[()]) @@ -1075,7 +1079,7 @@ def read_zarr_string(elem: GroupStorageType, _reader: Reader) -> str: @_REGISTRY.register_write(H5Group, np.str_, IOSpec("string", "0.2.0")) @_REGISTRY.register_write(H5Group, str, IOSpec("string", "0.2.0")) def write_string( - f: GroupStorageType, + f: H5Group, k: str, v: np.str_ | str, _writer: Writer, From 6ac72d63d8e9b92c40df12354aec0c79651b8ccb Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 16:22:48 +0200 Subject: [PATCH 110/348] (fix): clarify `H5File` typing --- src/anndata/_io/specs/methods.py | 8 +++----- src/anndata/_types.py | 9 ++++++--- src/anndata/compat/__init__.py | 1 + 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 6f58093a8..3f4e73358 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -26,6 +26,9 @@ CupyCSCMatrix, CupyCSRMatrix, DaskArray, + H5Array, + H5File, + H5Group, ZarrArray, ZarrGroup, _decode_structured_array, @@ -50,11 +53,6 @@ from .registry import Reader, Writer -H5Array = h5py.Dataset -H5Group = h5py.Group -H5File = h5py.File - - #################### # Dask utils # #################### diff --git a/src/anndata/_types.py 
b/src/anndata/_types.py index 0ddce2e5d..b9b0065fd 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -30,6 +30,9 @@ from typing import TypeAlias from ._io.specs.registry import IOSpec, Reader, Writer + from .compat import ( + H5File, + ) __all__ = [ "ArrayStorageType", @@ -75,7 +78,7 @@ class Read(Protocol[CovariantInMemoryType]): def __call__( self, - elem: StorageType, + elem: StorageType | H5File, _reader: Reader, ) -> CovariantInMemoryType: """Low-level reading function for an element. @@ -97,7 +100,7 @@ def __call__( class Write(Protocol[ContravariantInMemoryType]): def __call__( self, - f: GroupStorageType, + f: StorageType, k: str, v: ContravariantInMemoryType, _writer: Writer, @@ -156,7 +159,7 @@ def __call__( self, /, write_func: Write[InvariantInMemoryType], - store: GroupStorageType, + store: StorageType, elem_name: str, elem: InvariantInMemoryType, *, diff --git a/src/anndata/compat/__init__.py b/src/anndata/compat/__init__.py index da67141b1..6edc7e2c9 100644 --- a/src/anndata/compat/__init__.py +++ b/src/anndata/compat/__init__.py @@ -49,6 +49,7 @@ class Empty: Index = Union[Index1D, tuple[Index1D, Index1D], scipy.sparse.spmatrix, SpArray] H5Group = h5py.Group H5Array = h5py.Dataset +H5File = h5py.File ############################# From 989dc6546275d99b04520f60a9e230f06d586600 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 16:44:15 +0200 Subject: [PATCH 111/348] (fix): dask doc --- src/anndata/_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 1f97673cf..742089ce6 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -150,7 +150,7 @@ def __call__( iospec Internal AnnData encoding specification for the element. dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask.from_zarr`. + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask:index`. Returns ------- From 36b0207e799f220f8aead955274a360bb58917a3 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 16:52:47 +0200 Subject: [PATCH 112/348] (fix): dask docs --- docs/conf.py | 1 + src/anndata/_types.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 96d94fa58..fe6833d8d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -126,6 +126,7 @@ def setup(app: Sphinx): scipy=("https://docs.scipy.org/doc/scipy/", None), sklearn=("https://scikit-learn.org/stable/", None), zarr=("https://zarr.readthedocs.io/en/stable/", None), + dask=("https://docs.dask.org/en/stable/", None), xarray=("https://xarray.pydata.org/en/stable/", None), ) qualname_overrides = { diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 742089ce6..b63ce83ed 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -89,7 +89,7 @@ def __call__( _reader The :class:`anndata.experimental.Reader` instance. dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask.from_zarr`. + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask:index`. Returns ------- @@ -120,7 +120,7 @@ def __call__( _writer The :class:`anndata.experimental.Writer` instance. dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask:index`. """ ... 
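The patches up to this point wire a `dataset_kwargs` mapping from `read_elem_as_dask` down into the per-element readers, so a caller can choose the dask chunking of an on-disk element without materializing it. A minimal sketch of the usage this enables, assuming the registry-level import path used in this series (the eventual public re-export may differ) and an HDF5 store:

```python
# Hedged sketch of the API exercised by PATCH 106's test; `read_elem_as_dask`
# is imported from the registry module as in this series.
import h5py
import numpy as np

from anndata._io.specs import read_elem, write_elem
from anndata._io.specs.registry import read_elem_as_dask

with h5py.File("example.h5", "w") as f:
    write_elem(f, "X", np.random.random((1000, 100)))

with h5py.File("example.h5", "r") as f:
    x_lazy = read_elem_as_dask(f["X"], chunks=(250, 100))  # no data read yet
    assert x_lazy.chunksize == (250, 100)
    total = x_lazy.sum().compute()  # compute while the file is still open
    x_eager = read_elem(f["X"])     # eager counterpart, for comparison
```

Leaving `chunks` unset falls back to the on-disk chunking for zarr and a fixed `_DEFAULT_STRIDE` for HDF5; PATCH 114 below extends that fallback to sparse elements and `chunks=None`.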
From ca6cf6629103b5e67e2c48aca249714a38ad07de Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 17:15:43 +0200 Subject: [PATCH 113/348] (fix): typing --- src/anndata/_io/specs/lazy_methods.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index a65fbc91b..b4f9ed982 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -19,6 +19,8 @@ from collections.abc import Mapping from typing import Any, Literal + from .registry import Reader + @overload def make_block_indexer( @@ -96,7 +98,7 @@ def _(x): @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse_as_dask( elem: H5Group | ZarrGroup, - _reader, + _reader: Reader, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): import dask.array as da @@ -145,9 +147,8 @@ def make_dask_chunk(block_id: tuple[int, int]): @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( - elem, - _reader, - chunks: tuple[int] | None = None, + elem: H5Array, + _reader: Reader, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): import dask.array as da From eabaf3512ffba12abff52d8657f696a8e25ae924 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 17:20:20 +0200 Subject: [PATCH 114/348] (fix): handle case when `chunks` is `None` --- src/anndata/_io/specs/lazy_methods.py | 7 +++++-- src/anndata/_io/specs/registry.py | 4 +++- tests/test_io_elementwise.py | 26 ++++++++++++++++++++++---- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index b4f9ed982..c0597a49f 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -184,8 +184,11 @@ def make_dask_chunk(block_id: tuple[int, int]): @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( - elem, _reader, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}) + elem: ZarrArray, + _reader: Reader, + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): + chunks: tuple[int, ...] 
= dataset_kwargs.get("chunks", elem.chunks) import dask.array as da - return da.from_zarr(elem) + return da.from_zarr(elem, chunks=chunks) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 1a4d2913f..067728d41 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -387,7 +387,9 @@ def read_elem_as_dask( ------- DaskArray """ - return Reader(_LAZY_REGISTRY).read_elem(elem, dataset_kwargs={"chunks": chunks}) + return Reader(_LAZY_REGISTRY).read_elem( + elem, dataset_kwargs={"chunks": chunks} if chunks is not None else {} + ) def write_elem( diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index 31149a1ae..fa867cac6 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -59,7 +59,7 @@ def store(request, tmp_path) -> H5Group | ZarrGroup: sparse_formats = ["csr", "csc"] -SIZE = 1000 +SIZE = 2500 @pytest.fixture(params=sparse_formats) @@ -235,7 +235,15 @@ def test_read_lazy_2d_dask(sparse_format, store): @pytest.mark.parametrize( ("n_dims", "chunks"), - [(1, (100,)), (1, (400,)), (2, (100, 100)), (2, (400, 400)), (2, (200, 400))], + [ + (1, (100,)), + (1, (400,)), + (2, (100, 100)), + (2, (400, 400)), + (2, (200, 400)), + (1, None), + (2, None), + ], ) def test_read_lazy_nd_dask(store, n_dims, chunks): arr_store = create_dense_store(store, n_dims) @@ -269,7 +277,13 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path): @pytest.mark.parametrize( ("arr_type", "chunks"), - [("dense", (100, 100)), ("csc", (SIZE, 10)), ("csr", (10, SIZE))], + [ + ("dense", (100, 100)), + ("csc", (SIZE, 10)), + ("csr", (10, SIZE)), + ("csc", None), + ("csr", None), + ], ) def test_read_lazy_h5_chunk_kwargs(arr_type, chunks, tmp_path): import dask.distributed as dd @@ -282,7 +296,11 @@ def test_read_lazy_h5_chunk_kwargs(arr_type, chunks, tmp_path): else: arr_store = create_sparse_store(arr_type, store) X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) - assert X_dask_from_disk.chunksize == chunks + if chunks is not None: + assert X_dask_from_disk.chunksize == chunks + else: + # assert that sparse chunks are set correctly by default + assert X_dask_from_disk.chunksize[bool(arr_type == "csr")] == SIZE X_from_disk = read_elem(arr_store["X"]) file.close() with ( From d2b39f4611a022eceee4dfc072a8dbefb057ef8a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 17:25:44 +0200 Subject: [PATCH 115/348] (feat): use `read_elem_as_dask` --- src/anndata/experimental/backed/_io.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index 2aa752e92..ea8bef345 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -6,15 +6,15 @@ TYPE_CHECKING, ) +from anndata._io.specs.registry import read_elem_as_dask + if TYPE_CHECKING: from collections.abc import MutableMapping -import dask.array as da import h5py import zarr from ..._core.anndata import AnnData -from ..._core.sparse_dataset import sparse_dataset from ...compat import DaskArray from .. 
import read_dispatched from ._compat import xr @@ -122,20 +122,13 @@ def callback(func, elem_name: str, elem, iospec): mask=elem["mask"] if "mask" in elem else None, dtype_str=iospec.encoding_type, ) - elif iospec.encoding_type in {"array", "string-array"}: - if is_h5: - if iospec.encoding_type == "string-array": - if ( - "read_dataset" not in dir() - ): # avoid circular dependency, not sure what caused this all of a sudden after merging https://github.com/scverse/anndata/pull/949/commits/dc9f12fcbca977841e967c8414b9f1032e069250 - from ..._io.h5ad import read_dataset - elem = read_dataset(elem) - if not hasattr(elem, "chunks") or elem.chunks is None: - return da.from_array(elem, chunks=(1000,) * len(elem.shape)) - return da.from_array(elem) - return da.from_zarr(elem) - elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: - return sparse_dataset(elem) + elif iospec.encoding_type in { + "csr_matrix", + "csc_matrix", + "array", + "string-array", + }: + return read_elem_as_dask(elem) elif iospec.encoding_type in {"awkward-array"}: return read_dispatched(elem, None) return func(elem) From 4c398c3f2329ab3c0212670652ea1936255375e5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 17:26:59 +0200 Subject: [PATCH 116/348] (feat): add string-array reading --- src/anndata/_io/specs/lazy_methods.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index c0597a49f..a4aefc07e 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -145,6 +145,7 @@ def make_dask_chunk(block_id: tuple[int, int]): return da_mtx +@_LAZY_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( elem: H5Array, @@ -182,6 +183,7 @@ def make_dask_chunk(block_id: tuple[int, int]): ) +@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( elem: ZarrArray, From 23b53d675c431f415885a7ec0e8c2aba820173c3 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 17:39:17 +0200 Subject: [PATCH 117/348] (fix): string array handling --- src/anndata/_io/specs/lazy_methods.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index a4aefc07e..abbad85f4 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -146,6 +146,21 @@ def make_dask_chunk(block_id: tuple[int, int]): @_LAZY_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) +def read_h5_string_array( + elem: H5Array, + _reader: Reader, + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), +): + import dask.array as da + + from anndata._io.h5ad import read_dataset + + return da.from_array( + read_dataset(elem), + chunks=dataset_kwargs.get("chunks", (_DEFAULT_STRIDE,) * len(elem.shape)), + ) + + @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( elem: H5Array, From d6fc8a47934d2a424bad95c0c43ad6898da6f17a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 17:40:03 +0200 Subject: [PATCH 118/348] (fix): remove `string-array` because it is not tested --- src/anndata/_io/specs/lazy_methods.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index a4aefc07e..c0597a49f 100644 --- 
a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -145,7 +145,6 @@ def make_dask_chunk(block_id: tuple[int, int]): return da_mtx -@_LAZY_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( elem: H5Array, @@ -183,7 +182,6 @@ def make_dask_chunk(block_id: tuple[int, int]): ) -@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( elem: ZarrArray, From 1fe1de4325805289a81a32f1efc40b63d78206a3 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 17:44:05 +0200 Subject: [PATCH 119/348] (fix): add back `string-array` for `zarr` --- src/anndata/_io/specs/lazy_methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 7ef970f66..abbad85f4 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -198,6 +198,7 @@ def make_dask_chunk(block_id: tuple[int, int]): ) +@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( elem: ZarrArray, From d969b1811f755f72a246c95789d7d18e3714df07 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 10 Jul 2024 10:09:36 +0200 Subject: [PATCH 120/348] (fix): add `dataset_kwargs` --- src/anndata/experimental/backed/_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index ea8bef345..612614f76 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -54,7 +54,7 @@ def read_backed( else: f = h5py.File(store, mode="r") - def callback(func, elem_name: str, elem, iospec): + def callback(func, elem_name: str, elem, iospec, dataset_kwargs): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): cols = [ "obs", From 02b1b1d94a64c9ee6e2a090742a6e3ca1a9c2d8f Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 10 Jul 2024 10:10:59 +0200 Subject: [PATCH 121/348] (fix): `compat` `zarr` import --- src/anndata/experimental/backed/_io.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index 612614f76..8e729ecb4 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -6,13 +6,10 @@ TYPE_CHECKING, ) -from anndata._io.specs.registry import read_elem_as_dask - -if TYPE_CHECKING: - from collections.abc import MutableMapping - import h5py -import zarr + +from anndata._io import zarr +from anndata._io.specs.registry import read_elem_as_dask from ..._core.anndata import AnnData from ...compat import DaskArray @@ -21,6 +18,9 @@ from ._lazy_arrays import CategoricalArray, MaskedArray from ._xarray import Dataset2D +if TYPE_CHECKING: + from collections.abc import MutableMapping + def read_backed( store: str | Path | MutableMapping | zarr.Group | h5py.Dataset, From 2cadd7123b08411fa13f0a166734448001687bab Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 10 Jul 2024 10:50:01 +0200 Subject: [PATCH 122/348] (fix): `zarr` import --- src/anndata/_core/merge.py | 1 + src/anndata/experimental/backed/_io.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/anndata/_core/merge.py 
b/src/anndata/_core/merge.py
index 85d5b31ca..ba82d34f6 100644
--- a/src/anndata/_core/merge.py
+++ b/src/anndata/_core/merge.py
@@ -757,6 +757,7 @@ def np_bool_to_pd_bool_array(df: pd.DataFrame):
     return df

+# TODO: concat for xarray
 def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None):
     arrays = list(arrays)
     if fill_value is None:
diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py
index 8e729ecb4..9e2aff3fc 100644
--- a/src/anndata/experimental/backed/_io.py
+++ b/src/anndata/experimental/backed/_io.py
@@ -8,7 +8,6 @@

 import h5py

-from anndata._io import zarr
 from anndata._io.specs.registry import read_elem_as_dask

 from ..._core.anndata import AnnData
@@ -21,19 +20,23 @@
 if TYPE_CHECKING:
     from collections.abc import MutableMapping

+    from ...compat import ZarrGroup
+

 def read_backed(
-    store: str | Path | MutableMapping | zarr.Group | h5py.Dataset,
+    store: str | Path | MutableMapping | ZarrGroup | h5py.Dataset,
 ) -> AnnData:
     """Lazily read in on-disk/in-cloud AnnData stores, including `obs` and `var`.
-    No array data should need to be read into memory, with exception of non-obs/var dataframes and Awkward Arrays.
+    No array data should need to be read into memory with the exception of Awkward Arrays and some older-encoding string arrays.

-    Args:
-        store (Union[str, Path, MutableMapping, zarr.Group, h5py.Dataset]): A store-like object to be read in. If `zarr`, it is best
+    Params
+    ------
+    store: A store-like object to be read in. If :doc:`zarr:index`, it is best
         for it to be consolidated.

-    Returns:
-        AnnData: A lazily read-in AnnData object.
+    Returns
+    -------
+    A lazily read-in AnnData object.
     """
     is_h5 = False
     if isinstance(store, Path) or isinstance(store, str):
@@ -43,6 +46,8 @@ def read_backed(
     has_keys = True  # true if consolidated or h5ad
     if not is_h5:
+        import zarr
+
         try:
             f = zarr.open_consolidated(store, mode="r")
         except KeyError:

From 33aebb26c6100e74e24124d483b2f47f4b198480 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 10 Jul 2024 11:33:12 +0200
Subject: [PATCH 123/348] (refactor): clean up tests

---
 tests/test_io_elementwise.py | 17 ++++------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index fa867cac6..750dfa66b 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -245,7 +245,7 @@ def test_read_lazy_2d_dask(sparse_format, store):
         (2, None),
     ],
 )
-def test_read_lazy_nd_dask(store, n_dims, chunks):
+def test_read_lazy_subsets_nd_dask(store, n_dims, chunks):
     arr_store = create_dense_store(store, n_dims)
     X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks)
     X_from_disk = read_elem(arr_store["X"])
@@ -285,11 +285,7 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path):
         ("csr", None),
     ],
 )
-def test_read_lazy_h5_chunk_kwargs(arr_type, chunks, tmp_path):
-    import dask.distributed as dd
-
-    file = h5py.File(tmp_path / "test.h5", "w")
-    store = file["/"]
+def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks):
     if arr_type == "dense":
         arr_store = create_dense_store(store)
         X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks)
@@ -302,15 +298,10 @@ def test_read_lazy_h5_chunk_kwargs(arr_type, chunks, tmp_path):
         # assert that sparse chunks are set correctly by default
         assert X_dask_from_disk.chunksize[bool(arr_type == "csr")] == SIZE
     X_from_disk = read_elem(arr_store["X"])
-    file.close()
-    with (
-        dd.LocalCluster(n_workers=1, threads_per_worker=1) as cluster,
-
dd.Client(cluster) as _client, - ): - assert_equal(X_from_disk, X_dask_from_disk) + assert_equal(X_from_disk, X_dask_from_disk) -def test_read_lazy_h5_bad_chunk_kwargs(tmp_path): +def test_read_lazy_bad_chunk_kwargs(tmp_path): arr_type = "csr" file = h5py.File(tmp_path / "test.h5", "w") store = file["/"] From 701cd8527f7a3d465ab045461f37db644cf4f894 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 10 Jul 2024 12:54:26 +0200 Subject: [PATCH 124/348] (fix): overfetching problem --- src/anndata/_io/specs/lazy_methods.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index c0597a49f..52d29eb5e 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -45,7 +45,7 @@ def make_block_indexer( ) -> tuple[slice, slice] | tuple[slice]: index1d = slice( block_id[is_csc] * stride, - min((block_id[is_csc] * stride) + stride, shape[0]), + min((block_id[is_csc] * stride) + stride, shape[is_csc]), ) if is_csc: return (slice(None), index1d) @@ -105,7 +105,7 @@ def read_sparse_as_dask( path_or_group = Path(elem.file.filename) if isinstance(elem, H5Group) else elem elem_name = get_elem_name(elem) - shape: tuple[int, int] = elem.attrs["shape"] + shape: tuple[int, int] = tuple(elem.attrs["shape"]) dtype = elem["data"].dtype is_csc: bool = elem.attrs["encoding-type"] == "csc_matrix" @@ -155,7 +155,7 @@ def read_h5_array( path = Path(elem.file.filename) elem_name = elem.name - shape = elem.shape + shape = tuple(elem.shape) dtype = elem.dtype chunks: tuple[int, ...] = dataset_kwargs.get( "chunks", (_DEFAULT_STRIDE,) * len(shape) From ad62f8c9f5a1d7439cbb4d36f2751b0163927bbe Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 10 Jul 2024 13:23:17 +0200 Subject: [PATCH 125/348] (fix): don't read in `index` multiple times --- src/anndata/experimental/backed/_io.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index 9e2aff3fc..582f64eb7 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -86,11 +86,14 @@ def callback(func, elem_name: str, elem, iospec, dataset_kwargs): d = {k: read_dispatched(v, callback) for k, v in iter_object} d_with_xr = {} index_label = f'{elem_name.replace("/", "")}_names' + index = d[ + elem.attrs["_index"] + ] # no sense in reading this in multiple times for k in d: v = d[k] if type(v) == DaskArray and k != elem.attrs["_index"]: d_with_xr[k] = xr.DataArray( - v, coords=[d[elem.attrs["_index"]]], dims=[index_label], name=k + v, coords=[index], dims=[index_label], name=k ) elif ( type(v) == CategoricalArray or type(v) == MaskedArray @@ -100,7 +103,7 @@ def callback(func, elem_name: str, elem, iospec, dataset_kwargs): ) d_with_xr[k] = xr.DataArray( variable, - coords=[d[elem.attrs["_index"]]], + coords=[index], dims=[index_label], name=k, ) From d79a6735dd3f05a6208d6bf62510ad4766ffe5a8 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 10 Jul 2024 16:14:32 +0200 Subject: [PATCH 126/348] (fix): don't overfetch --- src/anndata/experimental/backed/_io.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index 582f64eb7..a850014a9 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -136,7 +136,12 @@ def callback(func, elem_name: str, elem, iospec, 
dataset_kwargs): "array", "string-array", }: - return read_elem_as_dask(elem) + chunks = None + if "csr_matrix" == iospec.encoding_type: + chunks = (1, elem.attrs["shape"][1]) + elif iospec.encoding_type == "csc_matrix": + chunks = (elem.attrs["shape"][0], 1) + return read_elem_as_dask(elem, chunks=chunks) elif iospec.encoding_type in {"awkward-array"}: return read_dispatched(elem, None) return func(elem) From 43b21a21385465d54fd74fc85ecd03a6e0b15227 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 11 Jul 2024 09:02:14 +0200 Subject: [PATCH 127/348] Fix circular import --- src/anndata/_types.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index b9b0065fd..4853c8f15 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -30,9 +30,7 @@ from typing import TypeAlias from ._io.specs.registry import IOSpec, Reader, Writer - from .compat import ( - H5File, - ) + from .compat import H5File __all__ = [ "ArrayStorageType", @@ -63,16 +61,12 @@ StorageType = Union[ArrayStorageType, GroupStorageType] ContravariantInMemoryType = TypeVar( - "ContravariantInMemoryType", - bound="InMemoryReadElem", # noqa: F821 - contravariant=True, + "ContravariantInMemoryType", bound="InMemoryReadElem", contravariant=True ) CovariantInMemoryType = TypeVar( - "CovariantInMemoryType", - bound="InMemoryReadElem", # noqa: F821 - covariant=True, + "CovariantInMemoryType", bound="InMemoryReadElem", covariant=True ) -InvariantInMemoryType = TypeVar("InvariantInMemoryType", bound="InMemoryReadElem") # noqa: F821 +InvariantInMemoryType = TypeVar("InvariantInMemoryType", bound="InMemoryReadElem") class Read(Protocol[CovariantInMemoryType]): @@ -185,3 +179,8 @@ def __call__( Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. """ ... + + +if TYPE_CHECKING: + # Needs to be at the end because Sphinx’s type import suffers from circular imports + from ._io.specs.registry import InMemoryReadElem From 0e22449573ba2ac88d6c9ffe109b47d28d8bc2fb Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Thu, 11 Jul 2024 09:18:01 +0200 Subject: [PATCH 128/348] add some typing --- src/anndata/_io/specs/registry.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index f61fd9ee3..181848d41 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -15,7 +15,7 @@ if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable - from typing import Any, TypeAlias + from typing import Any, TypeAlias, TypeVar from anndata._core.storage import StorageType from anndata._types import ( @@ -26,6 +26,9 @@ WriteCallback, ) + T = TypeVar("T") + W = TypeVar("W", bound=Write) + InMemoryReadElem: TypeAlias = Union[ dict[str, InMemoryArrayOrScalarType], InMemoryArrayOrScalarType, @@ -47,7 +50,7 @@ class IOSpec: class IORegistryError(Exception): @classmethod def _from_write_parts( - cls, dest_type: type, typ: type, modifiers: frozenset[str] + cls, dest_type: type | tuple[type, str], typ: type, modifiers: frozenset[str] ) -> IORegistryError: msg = f"No method registered for writing {typ} into {dest_type}" if modifiers: @@ -71,7 +74,7 @@ def _from_read_parts( def write_spec(spec: IOSpec): - def decorator(func: Callable): + def decorator(func: W) -> W: @wraps(func) def wrapper(g: GroupStorageType, k: str, *args, **kwargs): result = func(g, k, *args, **kwargs) @@ -99,7 +102,7 @@ def register_write( src_type: type | tuple[type, str], spec: IOSpec | Mapping[str, str], modifiers: Iterable[str] = frozenset(), - ): + ) -> Callable[[Write[T]], Write[T]]: spec = proc_spec(spec) modifiers = frozenset(modifiers) @@ -125,7 +128,7 @@ def get_writer( dest_type: type, src_type: type | tuple[type, str], modifiers: frozenset[str] = frozenset(), - ): + ) -> Write: import h5py if dest_type is h5py.File: @@ -141,7 +144,7 @@ def has_writer( dest_type: type, src_type: type | tuple[type, str], modifiers: frozenset[str], - ): + ) -> bool: return (dest_type, src_type, modifiers) in self.write def register_read( @@ -149,7 +152,7 @@ def register_read( src_type: type, spec: IOSpec | Mapping[str, str], modifiers: Iterable[str] = frozenset(), - ): + ) -> Callable[[Read[T]], Read[T]]: spec = proc_spec(spec) modifiers = frozenset(modifiers) @@ -161,7 +164,7 @@ def _register(func): def get_reader( self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset() - ): + ) -> Read: if (src_type, spec, modifiers) in self.read: return self.read[(src_type, spec, modifiers)] else: @@ -171,7 +174,7 @@ def get_reader( def has_reader( self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset() - ): + ) -> bool: return (src_type, spec, modifiers) in self.read def register_read_partial( From ec546f451067fb88407179f7acb81ae9f3bc56af Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Thu, 11 Jul 2024 10:05:27 +0200 Subject: [PATCH 129/348] fix mapping types --- src/anndata/_io/h5ad.py | 2 +- src/anndata/_io/specs/methods.py | 109 +++++++++++++++++++------------ src/anndata/_types.py | 9 +-- 3 files changed, 73 insertions(+), 47 deletions(-) diff --git a/src/anndata/_io/h5ad.py b/src/anndata/_io/h5ad.py index 142acc77d..2cd2fca48 100644 --- a/src/anndata/_io/h5ad.py +++ b/src/anndata/_io/h5ad.py @@ -47,7 +47,7 @@ def write_h5ad( adata: AnnData, *, as_dense: Sequence[str] = (), - dataset_kwargs: Mapping = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), **kwargs, ) -> None: if isinstance(as_dense, str): diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 3f4e73358..8107d88dd 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -40,7 +40,7 @@ if TYPE_CHECKING: from os import PathLike - from typing import Literal + from typing import Any, Literal from numpy import typing as npt @@ -100,11 +100,13 @@ def wrapper( f, k, cupy_val: CupyArray | CupyCSCMatrix | CupyCSRMatrix, - _writer, *, + _writer: Writer, dataset_kwargs=MappingProxyType, ): - return write_func(f, k, cupy_val.get(), _writer, dataset_kwargs=dataset_kwargs) + return write_func( + f, k, cupy_val.get(), _writer=_writer, dataset_kwargs=dataset_kwargs + ) return wrapper @@ -120,7 +122,7 @@ def wrapper( @_REGISTRY.register_read(H5Group, IOSpec("", "")) @_REGISTRY.register_read(H5Array, IOSpec("", "")) def read_basic( - elem: H5File | H5Group | H5Array, _reader: Reader + elem: H5File | H5Group | H5Array, *, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import h5ad @@ -142,7 +144,7 @@ def read_basic( @_REGISTRY.register_read(ZarrGroup, IOSpec("", "")) @_REGISTRY.register_read(ZarrArray, IOSpec("", "")) def read_basic_zarr( - elem: ZarrGroup | ZarrArray, _reader: Reader + elem: ZarrGroup | ZarrArray, *, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import zarr @@ -263,8 +265,9 @@ def write_anndata( f: GroupStorageType, k: str, adata: AnnData, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): g = f.require_group(k) _writer.write_elem(g, "X", adata.X, dataset_kwargs=dataset_kwargs) @@ -285,7 +288,7 @@ def write_anndata( @_REGISTRY.register_read(H5File, IOSpec("raw", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("raw", "0.1.0")) -def read_anndata(elem: GroupStorageType | H5File, _reader: Reader) -> AnnData: +def read_anndata(elem: GroupStorageType | H5File, *, _reader: Reader) -> AnnData: d = {} for k in [ "X", @@ -310,8 +313,9 @@ def write_raw( f: GroupStorageType, k: str, raw: Raw, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): g = f.require_group(k) _writer.write_elem(g, "X", raw.X, dataset_kwargs=dataset_kwargs) @@ -327,7 +331,7 @@ def write_raw( @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) def read_mapping( - elem: GroupStorageType, _reader: Reader + elem: GroupStorageType, *, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType]: return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -338,8 +342,9 @@ def write_mapping( f: 
GroupStorageType, k: str, v: dict, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): g = f.require_group(k) for sub_k, sub_v in v.items(): @@ -357,8 +362,9 @@ def write_list( f: GroupStorageType, k: str, elem: list, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): _writer.write_elem(f, k, np.array(elem), dataset_kwargs=dataset_kwargs) @@ -378,8 +384,9 @@ def write_basic( f: GroupStorageType, k: str, elem: views.ArrayView | np.ndarray | h5py.Dataset | np.ma.MaskedArray | ZarrArray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): """Write methods which underlying library handles natively.""" f.create_dataset(k, data=elem, **dataset_kwargs) @@ -398,8 +405,9 @@ def write_basic_dask_zarr( f: ZarrGroup, k: str, elem: DaskArray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): import dask.array as da @@ -414,8 +422,9 @@ def write_basic_dask_h5( f: H5Group, k: str, elem: DaskArray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): import dask.array as da import dask.config as dc @@ -432,7 +441,7 @@ def write_basic_dask_h5( @_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) -def read_array(elem: ArrayStorageType, _reader: Reader) -> npt.NDArray: +def read_array(elem: ArrayStorageType, *, _reader: Reader) -> npt.NDArray: return elem[()] @@ -449,7 +458,7 @@ def read_zarr_array_partial(elem, *, items=None, indices=(slice(None, None))): # arrays of strings @_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) -def read_string_array(d: H5Array, _reader: Reader): +def read_string_array(d: H5Array, *, _reader: Reader): return read_array(d.asstr(), _reader=_reader) @@ -470,8 +479,9 @@ def write_vlen_string_array( f: H5Group, k: str, elem: np.ndarray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): """Write methods which underlying library handles nativley.""" str_dtype = h5py.special_dtype(vlen=str) @@ -490,8 +500,9 @@ def write_vlen_string_array_zarr( f: ZarrGroup, k: str, elem: np.ndarray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): import numcodecs @@ -528,7 +539,7 @@ def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray: @_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0")) -def read_recarray(d: ArrayStorageType, _reader: Reader) -> np.recarray | npt.NDArray: +def read_recarray(d: ArrayStorageType, *, _reader: Reader) -> np.recarray | npt.NDArray: value = d[()] dtype = value.dtype value = _from_fixed_length_strings(value) @@ -543,8 +554,9 @@ def write_recarray( f: H5Group, k: str, elem: np.ndarray | np.recarray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): f.create_dataset(k, data=_to_hdf5_vlen_strings(elem), 
**dataset_kwargs) @@ -555,8 +567,9 @@ def write_recarray_zarr( f: ZarrGroup, k: str, elem: np.ndarray | np.recarray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): from anndata.compat import _to_fixed_length_strings @@ -572,6 +585,7 @@ def write_sparse_compressed( f: GroupStorageType, key: str, value: sparse.spmatrix | SpArray, + *, _writer: Writer, fmt: Literal["csr", "csc"], dataset_kwargs=MappingProxyType({}), @@ -632,14 +646,15 @@ def write_sparse_dataset( f: GroupStorageType, k: str, elem: CSCDataset | CSRDataset, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): write_sparse_compressed( f, k, elem._to_backed(), - _writer, + _writer=_writer, fmt=elem.format, dataset_kwargs=dataset_kwargs, ) @@ -664,8 +679,9 @@ def write_dask_sparse( f: GroupStorageType, k: str, elem: DaskArray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): sparse_format = elem._meta.format @@ -712,7 +728,9 @@ def chunk_slice(start: int, stop: int) -> tuple[slice | None, slice | None]: @_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) -def read_sparse(elem: GroupStorageType, _reader: Reader) -> sparse.spmatrix | SpArray: +def read_sparse( + elem: GroupStorageType, *, _reader: Reader +) -> sparse.spmatrix | SpArray: return sparse_dataset(elem).to_memory() @@ -741,8 +759,9 @@ def write_awkward( f: GroupStorageType, k: str, v: views.AwkwardArrayView | AwkArray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): from anndata.compat import awkward as ak @@ -756,7 +775,7 @@ def write_awkward( @_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0")) -def read_awkward(elem: GroupStorageType, _reader: Reader) -> AwkArray: +def read_awkward(elem: GroupStorageType, *, _reader: Reader) -> AwkArray: from anndata.compat import awkward as ak form = _read_attr(elem.attrs, "form") @@ -779,8 +798,9 @@ def write_dataframe( f: GroupStorageType, key: str, df: views.DataFrameView | pd.DataFrame, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): # Check arguments for reserved in ("_index",): @@ -824,7 +844,7 @@ def write_dataframe( @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) -def read_dataframe(elem: GroupStorageType, _reader: Reader) -> pd.DataFrame: +def read_dataframe(elem: GroupStorageType, *, _reader: Reader) -> pd.DataFrame: columns = list(_read_attr(elem.attrs, "column-order")) idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -865,7 +885,7 @@ def read_dataframe_partial( @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.1.0")) -def read_dataframe_0_1_0(elem: GroupStorageType, _reader: Reader) -> pd.DataFrame: +def read_dataframe_0_1_0(elem: GroupStorageType, *, _reader: Reader) -> pd.DataFrame: columns = _read_attr(elem.attrs, "column-order") idx_key = 
_read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -921,8 +941,9 @@ def write_categorical( f: GroupStorageType, k: str, v: pd.Categorical, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): g = f.require_group(k) g.attrs["ordered"] = bool(v.ordered) @@ -935,7 +956,7 @@ def write_categorical( @_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) -def read_categorical(elem: GroupStorageType, _reader: Reader) -> pd.Categorical: +def read_categorical(elem: GroupStorageType, *, _reader: Reader) -> pd.Categorical: return pd.Categorical.from_codes( codes=_reader.read_elem(elem["codes"]), categories=_reader.read_elem(elem["categories"]), @@ -974,8 +995,9 @@ def write_nullable_integer( f: GroupStorageType, k: str, v: pd.arrays.IntegerArray | pd.arrays.BooleanArray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): g = f.require_group(k) if v._mask is not None: @@ -986,7 +1008,7 @@ def write_nullable_integer( @_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0")) def read_nullable_integer( - elem: GroupStorageType, _reader: Reader + elem: GroupStorageType, *, _reader: Reader ) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.IntegerArray( @@ -999,7 +1021,7 @@ def read_nullable_integer( @_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0")) def read_nullable_boolean( - elem: GroupStorageType, _reader: Reader + elem: GroupStorageType, *, _reader: Reader ) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.BooleanArray( @@ -1016,7 +1038,7 @@ def read_nullable_boolean( @_REGISTRY.register_read(H5Array, IOSpec("numeric-scalar", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("numeric-scalar", "0.2.0")) -def read_scalar(elem: ArrayStorageType, _reader: Reader) -> np.number: +def read_scalar(elem: ArrayStorageType, *, _reader: Reader) -> np.number: return elem[()] @@ -1024,8 +1046,9 @@ def write_scalar( f: GroupStorageType, key: str, value, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): return f.create_dataset(key, data=np.array(value), **dataset_kwargs) @@ -1034,11 +1057,12 @@ def write_hdf5_scalar( f: H5Group, key: str, value, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): # Can’t compress scalars, error is thrown - dataset_kwargs = dataset_kwargs.copy() + dataset_kwargs = dict(dataset_kwargs) dataset_kwargs.pop("compression", None) dataset_kwargs.pop("compression_opts", None) f.create_dataset(key, data=np.array(value), **dataset_kwargs) @@ -1061,12 +1085,12 @@ def write_hdf5_scalar( @_REGISTRY.register_read(H5Array, IOSpec("string", "0.2.0")) -def read_hdf5_string(elem: H5Array, _reader: Reader) -> str: +def read_hdf5_string(elem: H5Array, *, _reader: Reader) -> str: return elem.asstr()[()] @_REGISTRY.register_read(ZarrArray, IOSpec("string", "0.2.0")) -def read_zarr_string(elem: ZarrArray, _reader: Reader) -> str: +def read_zarr_string(elem: ZarrArray, *, _reader: Reader) -> str: return str(elem[()]) @@ -1080,8 +1104,9 @@ 
def write_string( f: H5Group, k: str, v: np.str_ | str, + *, _writer: Writer, - dataset_kwargs: MappingProxyType, + dataset_kwargs: Mapping[str, Any], ): dataset_kwargs = dataset_kwargs.copy() dataset_kwargs.pop("compression", None) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 4853c8f15..f53d644dc 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -26,8 +26,8 @@ ) if TYPE_CHECKING: - from types import MappingProxyType - from typing import TypeAlias + from collections.abc import Mapping + from typing import Any, TypeAlias from ._io.specs.registry import IOSpec, Reader, Writer from .compat import H5File @@ -97,8 +97,9 @@ def __call__( f: StorageType, k: str, v: ContravariantInMemoryType, + *, _writer: Writer, - dataset_kwargs: MappingProxyType, + dataset_kwargs: Mapping[str, Any], ) -> None: """Low-level writing function for an element. @@ -158,7 +159,7 @@ def __call__( elem: InvariantInMemoryType, *, iospec: IOSpec, - dataset_kwargs: MappingProxyType, + dataset_kwargs: Mapping[str, Any], ) -> None: """ Callback used in :func:`anndata.experimental.write_dispatched` to customize writing an element to a store. From 7c2e4da9d01f60da91d0e3bcbac6e3899a3a120f Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 11 Jul 2024 10:07:35 +0200 Subject: [PATCH 130/348] Fix Read/Write --- docs/api.md | 2 - src/anndata/_io/specs/registry.py | 65 ++++++++++++++++------------ src/anndata/_types.py | 27 +++++++++--- src/anndata/experimental/__init__.py | 9 +--- 4 files changed, 59 insertions(+), 44 deletions(-) diff --git a/docs/api.md b/docs/api.md index a6d92211c..a605afcc3 100644 --- a/docs/api.md +++ b/docs/api.md @@ -142,9 +142,7 @@ Types used by the former: experimental.IOSpec experimental.InMemoryReadElem experimental.InMemoryArrayOrScalarType - experimental.Reader experimental.Read - experimental.Writer experimental.Write experimental.ReadCallback experimental.WriteCallback diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 181848d41..d8ece4ef9 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -2,7 +2,7 @@ from collections.abc import Mapping from dataclasses import dataclass -from functools import singledispatch, wraps +from functools import partial, singledispatch, wraps from types import MappingProxyType from typing import TYPE_CHECKING, Union @@ -24,10 +24,12 @@ ReadCallback, Write, WriteCallback, + _ReadInternal, + _WriteInternal, ) T = TypeVar("T") - W = TypeVar("W", bound=Write) + W = TypeVar("W", bound=_WriteInternal) InMemoryReadElem: TypeAlias = Union[ dict[str, InMemoryArrayOrScalarType], @@ -50,7 +52,7 @@ class IOSpec: class IORegistryError(Exception): @classmethod def _from_write_parts( - cls, dest_type: type | tuple[type, str], typ: type, modifiers: frozenset[str] + cls, dest_type: type, typ: type | tuple[type, str], modifiers: frozenset[str] ) -> IORegistryError: msg = f"No method registered for writing {typ} into {dest_type}" if modifiers: @@ -89,10 +91,10 @@ def wrapper(g: GroupStorageType, k: str, *args, **kwargs): class IORegistry: def __init__(self): - self.read: dict[tuple[type, IOSpec, frozenset[str]], Read] = {} + self.read: dict[tuple[type, IOSpec, frozenset[str]], _ReadInternal] = {} self.read_partial: dict[tuple[type, IOSpec, frozenset[str]], Callable] = {} self.write: dict[ - tuple[type, type | tuple[type, str], frozenset[str]], Write + tuple[type, type | tuple[type, str], frozenset[str]], _WriteInternal ] = {} self.write_specs: dict[type | tuple[type, str], 
IOSpec] = {} @@ -102,7 +104,7 @@ def register_write( src_type: type | tuple[type, str], spec: IOSpec | Mapping[str, str], modifiers: Iterable[str] = frozenset(), - ) -> Callable[[Write[T]], Write[T]]: + ) -> Callable[[_WriteInternal[T]], _WriteInternal[T]]: spec = proc_spec(spec) modifiers = frozenset(modifiers) @@ -123,21 +125,23 @@ def _register(func): return _register - def get_writer( + def get_write_func( self, dest_type: type, src_type: type | tuple[type, str], modifiers: frozenset[str] = frozenset(), + *, + writer: Writer, ) -> Write: import h5py if dest_type is h5py.File: dest_type = h5py.Group - if (dest_type, src_type, modifiers) in self.write: - return self.write[(dest_type, src_type, modifiers)] - else: + if (dest_type, src_type, modifiers) not in self.write: raise IORegistryError._from_write_parts(dest_type, src_type, modifiers) + internal = self.write[(dest_type, src_type, modifiers)] + return partial(internal, _writer=writer) def has_writer( self, @@ -152,7 +156,7 @@ def register_read( src_type: type, spec: IOSpec | Mapping[str, str], modifiers: Iterable[str] = frozenset(), - ) -> Callable[[Read[T]], Read[T]]: + ) -> Callable[[_ReadInternal[T]], _ReadInternal[T]]: spec = proc_spec(spec) modifiers = frozenset(modifiers) @@ -162,15 +166,20 @@ def _register(func): return _register - def get_reader( - self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset() + def get_read_func( + self, + src_type: type, + spec: IOSpec, + modifiers: frozenset[str] = frozenset(), + *, + reader: Reader, ) -> Read: - if (src_type, spec, modifiers) in self.read: - return self.read[(src_type, spec, modifiers)] - else: + if (src_type, spec, modifiers) not in self.read: raise IORegistryError._from_read_parts( "read", _REGISTRY.read, src_type, spec ) + internal = self.read[(src_type, spec, modifiers)] + return partial(internal, _reader=reader) def has_reader( self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset() @@ -269,12 +278,10 @@ def read_elem( modifiers: frozenset[str] = frozenset(), ) -> InMemoryReadElem: """Read an element from a store. 
See exported function for more details.""" - from functools import partial iospec = get_spec(elem) - read_func = partial( - self.registry.get_reader(type(elem), iospec, modifiers), - _reader=self, + read_func = self.registry.get_read_func( + type(elem), iospec, modifiers, reader=self ) if self.callback is None: return read_func(elem) @@ -286,12 +293,18 @@ def __init__(self, registry: IORegistry, callback: WriteCallback | None = None): self.registry = registry self.callback = callback - def find_writer(self, dest_type: type, elem, modifiers: frozenset[str]): + def find_write_func( + self, dest_type: type, elem: Any, modifiers: frozenset[str] + ) -> Write: for pattern in _iter_patterns(elem): if self.registry.has_writer(dest_type, pattern, modifiers): - return self.registry.get_writer(dest_type, pattern, modifiers) + return self.registry.get_write_func( + dest_type, pattern, modifiers, writer=self + ) # Raises IORegistryError - return self.registry.get_writer(dest_type, type(elem), modifiers) + return self.registry.get_write_func( + dest_type, type(elem), modifiers, writer=self + ) @report_write_key_on_error def write_elem( @@ -303,7 +316,6 @@ def write_elem( dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), modifiers: frozenset[str] = frozenset(), ): - from functools import partial from pathlib import PurePosixPath import h5py @@ -325,10 +337,7 @@ def write_elem( elif k in store: del store[k] - write_func = partial( - self.find_writer(dest_type, elem, modifiers), - _writer=self, - ) + write_func = self.find_write_func(dest_type, elem, modifiers) if self.callback is None: return write_func(store, k, elem, dataset_kwargs=dataset_kwargs) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index f53d644dc..cdd6c98ef 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -69,11 +69,19 @@ InvariantInMemoryType = TypeVar("InvariantInMemoryType", bound="InMemoryReadElem") -class Read(Protocol[CovariantInMemoryType]): +class _ReadInternal(Protocol[CovariantInMemoryType]): def __call__( self, elem: StorageType | H5File, + *, _reader: Reader, + ) -> CovariantInMemoryType: ... + + +class Read(Protocol[CovariantInMemoryType]): + def __call__( + self, + elem: StorageType | H5File, ) -> CovariantInMemoryType: """Low-level reading function for an element. @@ -81,8 +89,6 @@ def __call__( ---------- elem The element to read from. - _reader - The :class:`anndata.experimental.Reader` instance. Returns ------- @@ -91,7 +97,7 @@ def __call__( ... -class Write(Protocol[ContravariantInMemoryType]): +class _WriteInternal(Protocol[ContravariantInMemoryType]): def __call__( self, f: StorageType, @@ -100,6 +106,17 @@ def __call__( *, _writer: Writer, dataset_kwargs: Mapping[str, Any], + ) -> None: ... + + +class Write(Protocol[ContravariantInMemoryType]): + def __call__( + self, + f: StorageType, + k: str, + v: ContravariantInMemoryType, + *, + dataset_kwargs: Mapping[str, Any], ) -> None: """Low-level writing function for an element. @@ -111,8 +128,6 @@ def __call__( The key to read in from the group. v The element to write out. - _writer - The :class:`anndata.experimental.Writer` instance. dataset_kwargs Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. 
""" diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 9e3f91191..93bcf54d8 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -4,12 +4,7 @@ from anndata._io.specs import IOSpec, read_elem, write_elem from .._core.storage import StorageType -from .._io.specs.registry import ( - InMemoryArrayOrScalarType, - InMemoryReadElem, - Reader, - Writer, -) +from .._io.specs.registry import InMemoryArrayOrScalarType, InMemoryReadElem from .._types import ( Read, ReadCallback, @@ -35,9 +30,7 @@ "CSCDataset", "InMemoryReadElem", "InMemoryArrayOrScalarType", - "Reader", "Read", - "Writer", "Write", "ReadCallback", "WriteCallback", From 1ba5b99eb6f28128d46b689dc75bd8dd98ba9818 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 11 Jul 2024 10:13:05 +0200 Subject: [PATCH 131/348] Fix one more --- src/anndata/_io/specs/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 8107d88dd..a0a840154 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -102,7 +102,7 @@ def wrapper( cupy_val: CupyArray | CupyCSCMatrix | CupyCSRMatrix, *, _writer: Writer, - dataset_kwargs=MappingProxyType, + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): return write_func( f, k, cupy_val.get(), _writer=_writer, dataset_kwargs=dataset_kwargs From 49c0d490456abfe2a66ea5e98b8ba6b0e11c255c Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 11 Jul 2024 10:19:01 +0200 Subject: [PATCH 132/348] unify names --- src/anndata/_io/specs/registry.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index d8ece4ef9..72ac18a0b 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -125,7 +125,7 @@ def _register(func): return _register - def get_write_func( + def get_write( self, dest_type: type, src_type: type | tuple[type, str], @@ -143,7 +143,7 @@ def get_write_func( internal = self.write[(dest_type, src_type, modifiers)] return partial(internal, _writer=writer) - def has_writer( + def has_write( self, dest_type: type, src_type: type | tuple[type, str], @@ -166,7 +166,7 @@ def _register(func): return _register - def get_read_func( + def get_read( self, src_type: type, spec: IOSpec, @@ -181,7 +181,7 @@ def get_read_func( internal = self.read[(src_type, spec, modifiers)] return partial(internal, _reader=reader) - def has_reader( + def has_read( self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset() ) -> bool: return (src_type, spec, modifiers) in self.read @@ -201,7 +201,7 @@ def _register(func): return _register - def get_partial_reader( + def get_partial_read( self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset() ): if (src_type, spec, modifiers) in self.read_partial: @@ -280,9 +280,7 @@ def read_elem( """Read an element from a store. 
See exported function for more details."""

         iospec = get_spec(elem)
-        read_func = self.registry.get_read_func(
-            type(elem), iospec, modifiers, reader=self
-        )
+        read_func = self.registry.get_read(type(elem), iospec, modifiers, reader=self)
         if self.callback is None:
             return read_func(elem)
         return self.callback(read_func, elem.name, elem, iospec=iospec)
@@ -297,14 +295,12 @@ def find_write_func(
         self, dest_type: type, elem: Any, modifiers: frozenset[str]
     ) -> Write:
         for pattern in _iter_patterns(elem):
-            if self.registry.has_writer(dest_type, pattern, modifiers):
-                return self.registry.get_write_func(
+            if self.registry.has_write(dest_type, pattern, modifiers):
+                return self.registry.get_write(
                     dest_type, pattern, modifiers, writer=self
                 )
         # Raises IORegistryError
-        return self.registry.get_write_func(
-            dest_type, type(elem), modifiers, writer=self
-        )
+        return self.registry.get_write(dest_type, type(elem), modifiers, writer=self)

@@ -402,9 +398,10 @@ def read_elem_partial(
     modifiers: frozenset[str] = frozenset(),
 ):
     """Read part of an element from an on disk store."""
-    return _REGISTRY.get_partial_reader(
+    read_partial = _REGISTRY.get_partial_read(
         type(elem), get_spec(elem), frozenset(modifiers)
-    )(elem, items=items, indices=indices)
+    )
+    return read_partial(elem, items=items, indices=indices)


 @singledispatch

From 36667358f4f1af07ea3c87e3628bb7c310683fba Mon Sep 17 00:00:00 2001
From: "Philipp A."
Date: Thu, 11 Jul 2024 10:47:59 +0200
Subject: [PATCH 133/348] clarify ReadCallback signature

---
 src/anndata/_types.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/anndata/_types.py b/src/anndata/_types.py
index cdd6c98ef..64cc54837 100644
--- a/src/anndata/_types.py
+++ b/src/anndata/_types.py
@@ -141,6 +141,7 @@ def __call__(
         read_func: Read[InvariantInMemoryType],
         elem_name: str,
         elem: StorageType,
+        *,
         iospec: IOSpec,
     ) -> InvariantInMemoryType:
         """

From 3a332ade227b5d945125b60d69dd34e15a0f7bd9 Mon Sep 17 00:00:00 2001
From: "Philipp A."
Date: Thu, 11 Jul 2024 10:59:43 +0200
Subject: [PATCH 134/348] Fix type aliases

---
 docs/conf.py          | 7 +++++--
 src/anndata/_types.py | 1 +
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index 96d94fa58..1d0fc00be 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -140,9 +140,12 @@ def setup(app: Sphinx):
 }
 autodoc_type_aliases = dict(
     NDArray=":data:`~numpy.typing.NDArray`",
-    InMemoryReadElem=":data:`~anndata.experimental.InMemoryReadElem`",
-    InMemoryType=":data:`~anndata.experimental.InMemoryArrayOrScalarType`",
     InMemoryArrayOrScalarType=":data:`~anndata.experimental.InMemoryArrayOrScalarType`",
+    InMemoryReadElem=":data:`~anndata.experimental.InMemoryReadElem`",
+    **{
+        f"{v}variantInMemoryType": ":data:`~anndata.experimental.InMemoryReadElem`"
+        for v in ["In", "Co", "Contra"]
+    },
 )

 # -- Social cards ---------------------------------------------------------
diff --git a/src/anndata/_types.py b/src/anndata/_types.py
index 64cc54837..c2eff0d4e 100644
--- a/src/anndata/_types.py
+++ b/src/anndata/_types.py
@@ -60,6 +60,7 @@
 GroupStorageType = Union[ZarrGroup, H5Group]
 StorageType = Union[ArrayStorageType, GroupStorageType]

+# NOTE: If you change these, be sure to update `autodoc_type_aliases` in docs/conf.py!
ContravariantInMemoryType = TypeVar( "ContravariantInMemoryType", bound="InMemoryReadElem", contravariant=True ) From d0f4d13c4189261d59bd15ceeb62c71a1e79a2ae Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Jul 2024 12:47:39 +0200 Subject: [PATCH 135/348] (fix): clean up typing to use `RWAble` --- docs/api.md | 6 +++-- docs/conf.py | 7 +++--- src/anndata/_io/specs/methods.py | 10 ++++----- src/anndata/_io/specs/registry.py | 25 ++++++--------------- src/anndata/_types.py | 28 ++++++++++++++---------- src/anndata/experimental/__init__.py | 11 +++++++--- src/anndata/experimental/_dispatch_io.py | 5 +++-- 7 files changed, 48 insertions(+), 44 deletions(-) diff --git a/docs/api.md b/docs/api.md index a605afcc3..fa76d5119 100644 --- a/docs/api.md +++ b/docs/api.md @@ -140,8 +140,10 @@ Types used by the former: :toctree: generated/ experimental.IOSpec - experimental.InMemoryReadElem - experimental.InMemoryArrayOrScalarType + experimental.InMemoryElem + experimental.RWAbleDict + experimental.RWAbleList + experimental.RWAble experimental.Read experimental.Write experimental.ReadCallback diff --git a/docs/conf.py b/docs/conf.py index 1d0fc00be..8b91035dd 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -140,10 +140,11 @@ def setup(app: Sphinx): } autodoc_type_aliases = dict( NDArray=":data:`~numpy.typing.NDArray`", - InMemoryArrayOrScalarType=":data:`~anndata.experimental.InMemoryArrayOrScalarType`", - InMemoryReadElem=":data:`~anndata.experimental.InMemoryReadElem`", + RWAble=":data:`~anndata.experimental.RWAble`", + RWAbleDict=":data:`~anndata.experimental.RWAbleDict`", + RWAbleList=":data:`~anndata.experimental.RWAbleList`", **{ - f"{v}variantInMemoryType": ":data:`~anndata.experimental.InMemoryReadElem`" + f"{v}variantInMemoryType": ":data:`~anndata.experimental.InMemoryElem`" for v in ["In", "Co", "Contra"] }, ) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index a0a840154..855c6b89f 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -48,6 +48,8 @@ ArrayStorageType, GroupStorageType, InMemoryArrayOrScalarType, + RWAbleDict, + RWAbleList, ) from anndata.compat import SpArray @@ -330,9 +332,7 @@ def write_raw( @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) -def read_mapping( - elem: GroupStorageType, *, _reader: Reader -) -> dict[str, InMemoryArrayOrScalarType]: +def read_mapping(elem: GroupStorageType, *, _reader: Reader) -> RWAbleDict: return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -341,7 +341,7 @@ def read_mapping( def write_mapping( f: GroupStorageType, k: str, - v: dict, + v: RWAbleDict, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -361,7 +361,7 @@ def write_mapping( def write_list( f: GroupStorageType, k: str, - elem: list, + elem: RWAbleList, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 72ac18a0b..4e840fab5 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -4,22 +4,19 @@ from dataclasses import dataclass from functools import partial, singledispatch, wraps from types import MappingProxyType -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING -import pandas as pd - -from anndata._core.anndata import AnnData from anndata._io.utils import report_read_key_on_error, report_write_key_on_error -from anndata._types 
import InMemoryArrayOrScalarType from anndata.compat import _read_attr if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable - from typing import Any, TypeAlias, TypeVar + from typing import Any, TypeVar from anndata._core.storage import StorageType from anndata._types import ( GroupStorageType, + InMemoryElem, Read, ReadCallback, Write, @@ -31,14 +28,6 @@ T = TypeVar("T") W = TypeVar("W", bound=_WriteInternal) -InMemoryReadElem: TypeAlias = Union[ - dict[str, InMemoryArrayOrScalarType], - InMemoryArrayOrScalarType, - AnnData, - pd.Categorical, - pd.api.extensions.ExtensionArray, -] - # TODO: This probably should be replaced by a hashable Mapping due to conversion b/w "_" and "-" # TODO: Should filetype be included in the IOSpec if it changes the encoding? Or does the intent that these things be "the same" overrule that? @@ -276,7 +265,7 @@ def read_elem( self, elem: StorageType, modifiers: frozenset[str] = frozenset(), - ) -> InMemoryReadElem: + ) -> InMemoryElem: """Read an element from a store. See exported function for more details.""" iospec = get_spec(elem) @@ -307,7 +296,7 @@ def write_elem( self, store: GroupStorageType, k: str, - elem: Any, + elem: InMemoryElem, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), modifiers: frozenset[str] = frozenset(), @@ -347,7 +336,7 @@ def write_elem( ) -def read_elem(elem: StorageType) -> Any: +def read_elem(elem: StorageType) -> InMemoryElem: """ Read an element from a store. @@ -365,7 +354,7 @@ def read_elem(elem: StorageType) -> Any: def write_elem( store: GroupStorageType, k: str, - elem: Any, + elem: InMemoryElem, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> None: diff --git a/src/anndata/_types.py b/src/anndata/_types.py index c2eff0d4e..03afe7a5b 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -11,6 +11,8 @@ from numpy.typing import NDArray from scipy import sparse +from anndata._core.anndata import AnnData + from ._core.sparse_dataset import BaseCompressedSparseDataset from .compat import ( AwkArray, @@ -55,19 +57,28 @@ np.number, str, ] +RWAble: TypeAlias = InMemoryArrayOrScalarType | "RWAbleDict" | "RWAbleList" # noqa: TCH010 +RWAbleDict: TypeAlias = dict[str, RWAble] +RWAbleList: TypeAlias = list[RWAble] +InMemoryElem: TypeAlias = Union[ + RWAble, + AnnData, + pd.Categorical, + pd.api.extensions.ExtensionArray, +] -ArrayStorageType = Union[ZarrArray, H5Array] -GroupStorageType = Union[ZarrGroup, H5Group] -StorageType = Union[ArrayStorageType, GroupStorageType] +ArrayStorageType: TypeAlias = Union[ZarrArray, H5Array] +GroupStorageType: TypeAlias = Union[ZarrGroup, H5Group] +StorageType: TypeAlias = Union[ArrayStorageType, GroupStorageType] # NOTE: If you change these, be sure to update `autodoc_type_aliases` in docs/conf.py! ContravariantInMemoryType = TypeVar( - "ContravariantInMemoryType", bound="InMemoryReadElem", contravariant=True + "ContravariantInMemoryType", bound="InMemoryElem", contravariant=True ) CovariantInMemoryType = TypeVar( - "CovariantInMemoryType", bound="InMemoryReadElem", covariant=True + "CovariantInMemoryType", bound="InMemoryElem", covariant=True ) -InvariantInMemoryType = TypeVar("InvariantInMemoryType", bound="InMemoryReadElem") +InvariantInMemoryType = TypeVar("InvariantInMemoryType", bound="InMemoryElem") class _ReadInternal(Protocol[CovariantInMemoryType]): @@ -197,8 +208,3 @@ def __call__( Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. """ ... 
- - -if TYPE_CHECKING: - # Needs to be at the end because Sphinx’s type import suffers from circular imports - from ._io.specs.registry import InMemoryReadElem diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 93bcf54d8..fc6b8331b 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -4,10 +4,13 @@ from anndata._io.specs import IOSpec, read_elem, write_elem from .._core.storage import StorageType -from .._io.specs.registry import InMemoryArrayOrScalarType, InMemoryReadElem from .._types import ( + InMemoryElem, Read, ReadCallback, + RWAble, + RWAbleDict, + RWAbleList, Write, WriteCallback, ) @@ -28,9 +31,11 @@ "sparse_dataset", "CSRDataset", "CSCDataset", - "InMemoryReadElem", - "InMemoryArrayOrScalarType", + "InMemoryElem", "Read", + "RWAbleDict", + "RWAbleList", + "RWAble", "Write", "ReadCallback", "WriteCallback", diff --git a/src/anndata/experimental/_dispatch_io.py b/src/anndata/experimental/_dispatch_io.py index b8950cf77..20b47baeb 100644 --- a/src/anndata/experimental/_dispatch_io.py +++ b/src/anndata/experimental/_dispatch_io.py @@ -9,6 +9,7 @@ from anndata._types import ( GroupStorageType, + InMemoryElem, ReadCallback, StorageType, WriteCallback, @@ -18,7 +19,7 @@ def read_dispatched( elem: StorageType, callback: ReadCallback, -) -> Any: +) -> InMemoryElem: """ Read elem, calling the callback at each sub-element. @@ -44,7 +45,7 @@ def read_dispatched( def write_dispatched( store: GroupStorageType, key: str, - elem: Any, + elem: InMemoryElem, callback: WriteCallback, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), From ea29cfa28aa9a2bf2e42d18bf6cf88a771fcf941 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Jul 2024 14:53:43 +0200 Subject: [PATCH 136/348] (fix): use `Union` --- src/anndata/_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 03afe7a5b..9863b43ef 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -57,7 +57,7 @@ np.number, str, ] -RWAble: TypeAlias = InMemoryArrayOrScalarType | "RWAbleDict" | "RWAbleList" # noqa: TCH010 +RWAble: TypeAlias = Union[InMemoryArrayOrScalarType, "RWAbleDict", "RWAbleList"] # noqa: TCH010 RWAbleDict: TypeAlias = dict[str, RWAble] RWAbleList: TypeAlias = list[RWAble] InMemoryElem: TypeAlias = Union[ From f4ff2368554d0c8d179f0cc1add6f413d56ba39d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Jul 2024 15:11:47 +0200 Subject: [PATCH 137/348] (fix): add qualname override --- docs/conf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 8b91035dd..7bd05af9f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -137,6 +137,9 @@ def setup(app: Sphinx): "anndata._types.WriteCallback": "anndata.experimental.WriteCallback", "anndata._types.Read": "anndata.experimental.Read", "anndata._types.Write": "anndata.experimental.Write", + "anndata._types.RWAble": "anndata.experimental.RWAble", + "anndata._types.RWAbleDict": "anndata.experimental.RWAbleDict", + "anndata._types.RWAbleList": "anndata.experimental.RWAbleList", } autodoc_type_aliases = dict( NDArray=":data:`~numpy.typing.NDArray`", From f50b286459a672048c4eec51d5b2f1765bbefc96 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Jul 2024 15:18:51 +0200 Subject: [PATCH 138/348] (fix): ignore dask and masked array --- docs/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 7bd05af9f..89727fd6d 100644 --- a/docs/conf.py +++ 
b/docs/conf.py @@ -101,6 +101,8 @@ ("py:class", "anndata.compat.DaskArray"), ("py:class", "anndata.compat.CupyArray"), ("py:class", "anndata.compat.CupySparseMatrix"), + ("py:class", "numpy.ma.core.MaskedArray"), + ("py:class", "dask.array.core.Array"), ("py:class", "awkward.highlevel.Array"), ("py:class", "anndata._core.sparse_dataset.BaseCompressedSparseDataset"), ("py:obj", "numpy._typing._array_like._ScalarType_co"), From 712e0856c9c80e3456a111a6ed0de41329adf7f2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Jul 2024 15:24:22 +0200 Subject: [PATCH 139/348] (fix): ignore erroneous class warning --- docs/conf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 89727fd6d..5a47af0f4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -106,6 +106,9 @@ ("py:class", "awkward.highlevel.Array"), ("py:class", "anndata._core.sparse_dataset.BaseCompressedSparseDataset"), ("py:obj", "numpy._typing._array_like._ScalarType_co"), + # Something is picking these up as classes despite being aliases so this just suppresses the warning, but doesn't affect the build + ("py:class", "RWAbleDict"), + ("py:class", "RWAbleList"), ] suppress_warnings = [ "ref.citation", From 24dd18bb8069e1c0632a8fe66e5ffe2eb02c56fa Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Jul 2024 15:35:22 +0200 Subject: [PATCH 140/348] (fix): upgrade `scanpydoc` --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 17898ec55..43e5ab416 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ doc = [ "sphinx-toolbox", "sphinxext.opengraph", "nbsphinx", - "scanpydoc[theme,typehints] >=0.13.5", + "scanpydoc[theme,typehints] >=0.13.6", "zarr", "awkward>=2.0.7", "IPython", # For syntax highlighting in notebooks From 79d3fdc54c775b88f6ac9c65e83fed08049c5484 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Jul 2024 15:57:07 +0200 Subject: [PATCH 141/348] (fix): use `MutableMapping` instead of `dict` due to broken docstring --- src/anndata/_types.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 9863b43ef..9594a2ab8 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -4,6 +4,7 @@ from __future__ import annotations +from collections.abc import MutableMapping from typing import TYPE_CHECKING, Protocol, TypeVar, Union import numpy as np @@ -58,7 +59,8 @@ str, ] RWAble: TypeAlias = Union[InMemoryArrayOrScalarType, "RWAbleDict", "RWAbleList"] # noqa: TCH010 -RWAbleDict: TypeAlias = dict[str, RWAble] +# dict has a broken docstring: https://readthedocs.com/projects/icb-anndata/builds/2342910/ +RWAbleDict: TypeAlias = MutableMapping[str, RWAble] RWAbleList: TypeAlias = list[RWAble] InMemoryElem: TypeAlias = Union[ RWAble, From d3bcddf8d9bd3c7b6a20bcc1fa380548c2dd0522 Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Thu, 11 Jul 2024 16:12:25 +0200 Subject: [PATCH 142/348] Add data docs --- src/anndata/experimental/__init__.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index fc6b8331b..879f39a01 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -4,21 +4,26 @@ from anndata._io.specs import IOSpec, read_elem, write_elem from .._core.storage import StorageType -from .._types import ( - InMemoryElem, - Read, - ReadCallback, - RWAble, - RWAbleDict, - RWAbleList, - Write, - WriteCallback, -) +from .._types import InMemoryElem as _InMemoryElem +from .._types import Read, ReadCallback, Write, WriteCallback +from .._types import RWAble as _RWAble +from .._types import RWAbleDict as _RWAbleDict +from .._types import RWAbleList as _RWAbleList from ._dispatch_io import read_dispatched, write_dispatched from .merge import concat_on_disk from .multi_files import AnnCollection from .pytorch import AnnLoader +# Sphinx can’t find data docstrings when objects are re-exported +InMemoryElem = _InMemoryElem +"""An in-memory element that can be read and written.""" +RWAble = _RWAble +"""A serializable object.""" +RWAbleDict = _RWAbleDict +"""A dict containing serializable objects.""" +RWAbleList = _RWAbleList +"""A list containing serializable objects.""" + __all__ = [ "AnnCollection", "AnnLoader", From 84fdc964bb2ec95bfcd8aee59a4eb4bb36972633 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 11 Jul 2024 16:13:27 +0200 Subject: [PATCH 143/348] Revert "(fix): use `MutableMapping` instead of `dict` due to broken docstring" This reverts commit 79d3fdc54c775b88f6ac9c65e83fed08049c5484. --- src/anndata/_types.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 9594a2ab8..9863b43ef 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -4,7 +4,6 @@ from __future__ import annotations -from collections.abc import MutableMapping from typing import TYPE_CHECKING, Protocol, TypeVar, Union import numpy as np @@ -59,8 +58,7 @@ str, ] RWAble: TypeAlias = Union[InMemoryArrayOrScalarType, "RWAbleDict", "RWAbleList"] # noqa: TCH010 -# dict has a broken docstring: https://readthedocs.com/projects/icb-anndata/builds/2342910/ -RWAbleDict: TypeAlias = MutableMapping[str, RWAble] +RWAbleDict: TypeAlias = dict[str, RWAble] RWAbleList: TypeAlias = list[RWAble] InMemoryElem: TypeAlias = Union[ RWAble, From 2608bc306e4a89662d87a06323ba69103825719e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Jul 2024 16:29:28 +0200 Subject: [PATCH 144/348] (fix): add clarification --- src/anndata/experimental/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 879f39a01..89d462a1f 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -16,13 +16,13 @@ # Sphinx can’t find data docstrings when objects are re-exported InMemoryElem = _InMemoryElem -"""An in-memory element that can be read and written.""" +"""An in-memory element that can be read and written, including an :class:`anndata.AnnData` objects.""" RWAble = _RWAble -"""A serializable object.""" +"""A serializable object, excluding :class:`anndata.AnnData` objects i.e., something that can be stored in `uns` or `obsm`.""" RWAbleDict = _RWAbleDict -"""A dict containing serializable 
objects.""" +"""A dict containing serializable objects, excluding :class:`anndata.AnnData` objects i.e., something that can be stored in `uns` or `obsm`.""" RWAbleList = _RWAbleList -"""A list containing serializable objects.""" +"""A list containing serializable objects, excluding :class:`anndata.AnnData` objects i.e., something that can be stored in `uns`.""" __all__ = [ "AnnCollection", From e551e18e48ca4e367a6005043529a9557bcc376b Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 11 Jul 2024 16:29:09 +0200 Subject: [PATCH 145/348] Simplify --- docs/api.md | 2 -- docs/conf.py | 7 ------- src/anndata/_io/specs/methods.py | 9 ++++----- src/anndata/_types.py | 6 +++--- src/anndata/experimental/__init__.py | 8 -------- 5 files changed, 7 insertions(+), 25 deletions(-) diff --git a/docs/api.md b/docs/api.md index fa76d5119..36ebeac88 100644 --- a/docs/api.md +++ b/docs/api.md @@ -141,8 +141,6 @@ Types used by the former: experimental.IOSpec experimental.InMemoryElem - experimental.RWAbleDict - experimental.RWAbleList experimental.RWAble experimental.Read experimental.Write diff --git a/docs/conf.py b/docs/conf.py index 5a47af0f4..d83861d13 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -106,9 +106,6 @@ ("py:class", "awkward.highlevel.Array"), ("py:class", "anndata._core.sparse_dataset.BaseCompressedSparseDataset"), ("py:obj", "numpy._typing._array_like._ScalarType_co"), - # Something is picking these up as classes despite being aliases so this just suppresses the warning, but doesn't affect the build - ("py:class", "RWAbleDict"), - ("py:class", "RWAbleList"), ] suppress_warnings = [ "ref.citation", @@ -143,14 +140,10 @@ def setup(app: Sphinx): "anndata._types.Read": "anndata.experimental.Read", "anndata._types.Write": "anndata.experimental.Write", "anndata._types.RWAble": "anndata.experimental.RWAble", - "anndata._types.RWAbleDict": "anndata.experimental.RWAbleDict", - "anndata._types.RWAbleList": "anndata.experimental.RWAbleList", } autodoc_type_aliases = dict( NDArray=":data:`~numpy.typing.NDArray`", RWAble=":data:`~anndata.experimental.RWAble`", - RWAbleDict=":data:`~anndata.experimental.RWAbleDict`", - RWAbleList=":data:`~anndata.experimental.RWAbleList`", **{ f"{v}variantInMemoryType": ":data:`~anndata.experimental.InMemoryElem`" for v in ["In", "Co", "Contra"] diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 855c6b89f..48106b85d 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -48,8 +48,7 @@ ArrayStorageType, GroupStorageType, InMemoryArrayOrScalarType, - RWAbleDict, - RWAbleList, + RWAble, ) from anndata.compat import SpArray @@ -332,7 +331,7 @@ def write_raw( @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) -def read_mapping(elem: GroupStorageType, *, _reader: Reader) -> RWAbleDict: +def read_mapping(elem: GroupStorageType, *, _reader: Reader) -> dict[str, RWAble]: return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -341,7 +340,7 @@ def read_mapping(elem: GroupStorageType, *, _reader: Reader) -> RWAbleDict: def write_mapping( f: GroupStorageType, k: str, - v: RWAbleDict, + v: dict[str, RWAble], *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -361,7 +360,7 @@ def write_mapping( def write_list( f: GroupStorageType, k: str, - elem: RWAbleList, + elem: list[RWAble], *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), diff --git a/src/anndata/_types.py 
b/src/anndata/_types.py
index 9863b43ef..e0b663f16 100644
--- a/src/anndata/_types.py
+++ b/src/anndata/_types.py
@@ -57,9 +57,9 @@
     np.number,
     str,
 ]
-RWAble: TypeAlias = Union[InMemoryArrayOrScalarType, "RWAbleDict", "RWAbleList"]  # noqa: TCH010
-RWAbleDict: TypeAlias = dict[str, RWAble]
-RWAbleList: TypeAlias = list[RWAble]
+RWAble: TypeAlias = Union[
+    InMemoryArrayOrScalarType, dict[str, "RWAble"], list["RWAble"]
+]  # noqa: TCH010
 InMemoryElem: TypeAlias = Union[
     RWAble,
     AnnData,
diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py
index 89d462a1f..904dd5807 100644
--- a/src/anndata/experimental/__init__.py
+++ b/src/anndata/experimental/__init__.py
@@ -7,8 +7,6 @@
 from .._types import InMemoryElem as _InMemoryElem
 from .._types import Read, ReadCallback, Write, WriteCallback
 from .._types import RWAble as _RWAble
-from .._types import RWAbleDict as _RWAbleDict
-from .._types import RWAbleList as _RWAbleList
 from ._dispatch_io import read_dispatched, write_dispatched
 from .merge import concat_on_disk
 from .multi_files import AnnCollection
@@ -19,10 +17,6 @@
 """An in-memory element that can be read and written, including :class:`anndata.AnnData` objects."""
 RWAble = _RWAble
 """A serializable object, excluding :class:`anndata.AnnData` objects, i.e., something that can be stored in `uns` or `obsm`."""
-RWAbleDict = _RWAbleDict
-"""A dict containing serializable objects, excluding :class:`anndata.AnnData` objects, i.e., something that can be stored in `uns` or `obsm`."""
-RWAbleList = _RWAbleList
-"""A list containing serializable objects, excluding :class:`anndata.AnnData` objects, i.e., something that can be stored in `uns`."""

 __all__ = [
     "AnnCollection",
@@ -38,8 +32,6 @@
     "CSCDataset",
     "InMemoryElem",
     "Read",
-    "RWAbleDict",
-    "RWAbleList",
     "RWAble",
     "Write",
     "ReadCallback",

From 1ffe43ecb72bea57e4bbd48c97d64f9eaa3c2540 Mon Sep 17 00:00:00 2001
From: Ilan Gold
Date: Fri, 12 Jul 2024 17:51:43 +0200
Subject: [PATCH 146/348] (fix): remove double `dask` intersphinx

---
 docs/conf.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/conf.py b/docs/conf.py
index ae1e19cf9..f943fbb60 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -128,7 +128,6 @@ def setup(app: Sphinx):
     scipy=("https://docs.scipy.org/doc/scipy/", None),
     sklearn=("https://scikit-learn.org/stable/", None),
     zarr=("https://zarr.readthedocs.io/en/stable/", None),
-    dask=("https://docs.dask.org/en/stable/", None),
     xarray=("https://xarray.pydata.org/en/stable/", None),
     dask=("https://docs.dask.org/en/stable/", None),
 )

From f9df5bc60acdaf5a28e4596b0f200d3c884fb1ba Mon Sep 17 00:00:00 2001
From: Ilan Gold
Date: Fri, 12 Jul 2024 17:54:23 +0200
Subject: [PATCH 147/348] (fix): remove `_types.DaskArray` from type checking
 block

---
 src/anndata/_io/specs/registry.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py
index e23804919..8c5913850 100644
--- a/src/anndata/_io/specs/registry.py
+++ b/src/anndata/_io/specs/registry.py
@@ -17,7 +17,6 @@

     from anndata._core.storage import StorageType
     from anndata._types import (
-        DaskArray,
         GroupStorageType,
         InMemoryElem,
         Read,

From a85da39adbe1f783d3d6a1c08497437d8626d4b2 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 15 Jul 2024 11:30:52 +0200
Subject: [PATCH 148/348] (refactor): use `block_info` for resolving fetch
 location

---
 src/anndata/_io/specs/lazy_methods.py | 52 +++++++++------------------
 1 file changed, 17 insertions(+), 35 deletions(-)

diff --git
a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 52d29eb5e..2af600823 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -4,7 +4,7 @@ from functools import singledispatch from pathlib import Path, PurePosixPath from types import MappingProxyType -from typing import TYPE_CHECKING, overload +from typing import TYPE_CHECKING import h5py import numpy as np @@ -17,41 +17,11 @@ if TYPE_CHECKING: from collections.abc import Mapping - from typing import Any, Literal + from typing import Any, Literal, Union from .registry import Reader -@overload -def make_block_indexer( - *, - is_csc: Literal[True], - stride: int, - shape: tuple[int, int], - block_id: tuple[int, int], -) -> tuple[slice, slice]: ... -@overload -def make_block_indexer( - *, - is_csc: Literal[False], - stride: int, - shape: tuple[int, int], - block_id: tuple[int, int], -) -> tuple[slice]: ... - - -def make_block_indexer( - *, is_csc: bool, stride: int, shape: tuple[int, int], block_id: tuple[int, int] -) -> tuple[slice, slice] | tuple[slice]: - index1d = slice( - block_id[is_csc] * stride, - min((block_id[is_csc] * stride) + stride, shape[is_csc]), - ) - if is_csc: - return (slice(None), index1d) - return (index1d,) - - @contextmanager def maybe_open_h5(path_or_group: Path | ZarrGroup, elem_name: str): if not isinstance(path_or_group, Path): @@ -118,13 +88,25 @@ def read_sparse_as_dask( raise ValueError("Only the major axis can be chunked") stride = chunks[int(is_csc)] - def make_dask_chunk(block_id: tuple[int, int]): + def make_dask_chunk( + block_info: Union[ # noqa: UP007 + dict[ + Literal[None], + dict[str, Union[tuple[int, ...], list[tuple[int, ...]]]], # noqa: UP007 + ], + None, + ] = None, + ): # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` # https://github.com/scverse/anndata/issues/1105 + if block_info is None: + raise ValueError("Block info is required") with maybe_open_h5(path_or_group, elem_name) as f: mtx = ad.experimental.sparse_dataset(f) - index = make_block_indexer( - is_csc=is_csc, stride=stride, shape=shape, block_id=block_id + array_location = block_info[None]["array-location"] + index = ( + slice(array_location[0][0], array_location[0][1]), + slice(array_location[1][0], array_location[1][1]), ) chunk = mtx[index] return chunk From eb841767861b54aa55553f4daedc9038045a2087 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 15 Jul 2024 13:19:05 +0200 Subject: [PATCH 149/348] (fix): don't set chunk sizes manually --- src/anndata/experimental/backed/_io.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index a850014a9..582f64eb7 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -136,12 +136,7 @@ def callback(func, elem_name: str, elem, iospec, dataset_kwargs): "array", "string-array", }: - chunks = None - if "csr_matrix" == iospec.encoding_type: - chunks = (1, elem.attrs["shape"][1]) - elif iospec.encoding_type == "csc_matrix": - chunks = (elem.attrs["shape"][0], 1) - return read_elem_as_dask(elem, chunks=chunks) + return read_elem_as_dask(elem) elif iospec.encoding_type in {"awkward-array"}: return read_dispatched(elem, None) return func(elem) From 899184f70013c3c43a5970e5c4d078828f344589 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 15 Jul 2024 14:08:20 +0200 Subject: [PATCH 150/348] (fix): dtype for reading --- 
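Note on the `block_info` pattern adopted in PATCH 148 and whose `meta` dtype this patch corrects: when a
function handed to `dask.array.map_blocks` accepts a `block_info` keyword, dask fills it with per-block
metadata, and `block_info[None]["array-location"]` holds the half-open (start, stop) index range of the
block along each axis of the output array. That is what lets each task slice the on-disk element itself
instead of deriving indices from `block_id`. A minimal, self-contained sketch of the contract follows;
a toy in-memory array stands in for the store, and this is illustrative rather than the anndata code:

    import dask.array as da
    import numpy as np

    src = np.arange(100).reshape(10, 10)  # stand-in for an on-disk element

    def load_chunk(block_info=None):
        # dask injects per-block metadata; the None key describes the output array
        (r0, r1), (c0, c1) = block_info[None]["array-location"]
        return src[r0:r1, c0:c1]

    arr = da.map_blocks(
        load_chunk,
        dtype=src.dtype,
        chunks=((5, 5), (10,)),  # two row blocks, one column block
        meta=np.array([], dtype=src.dtype),  # meta dtype should match, as fixed below
    )
    assert (arr.compute() == src).all()
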
src/anndata/_io/specs/lazy_methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 2af600823..9349b491c 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -122,7 +122,7 @@ def make_dask_chunk(
         make_dask_chunk,
         dtype=dtype,
         chunks=chunk_layout,
-        meta=memory_format((0, 0), dtype=np.float32),
+        meta=memory_format((0, 0), dtype=dtype),
     )
     return da_mtx

From efb70ec893d01250e6e527d7fa0bc16a83131f27 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 16 Jul 2024 14:52:37 +0200
Subject: [PATCH 151/348] (fix): ignore import cycle problem (why??)

---
 docs/conf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/conf.py b/docs/conf.py
index f943fbb60..f59e67f4f 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -110,6 +110,7 @@
 suppress_warnings = [
     "ref.citation",
     "myst.header",  # https://github.com/executablebooks/MyST-Parser/issues/262
+    "autosummary.import_cycle",  # https://readthedocs.com/projects/icb-anndata/builds/2349021/
 ]

From 118f43c2fe4f948cb6067c14850ecdce9efe94d7 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 16 Jul 2024 14:59:42 +0200
Subject: [PATCH 152/348] (fix): add issue

---
 docs/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/conf.py b/docs/conf.py
index f59e67f4f..f7fe9d1be 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -110,7 +110,7 @@
 suppress_warnings = [
     "ref.citation",
     "myst.header",  # https://github.com/executablebooks/MyST-Parser/issues/262
-    "autosummary.import_cycle",  # https://readthedocs.com/projects/icb-anndata/builds/2349021/
+    "autosummary.import_cycle",  # https://github.com/sphinx-doc/sphinx/issues/12589
 ]

From f742a0a8cbce3cc75f517c0a809964ea02dd834c Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 18 Jul 2024 10:00:02 +0200
Subject: [PATCH 153/348] (fix): subclass `Reader` to remove `dataset_kwargs`

---
 src/anndata/_io/h5ad.py               |  2 +-
 src/anndata/_io/specs/lazy_methods.py | 23 ++++++--------------
 src/anndata/_io/specs/methods.py      | 17 ---------------
 src/anndata/_io/specs/registry.py     | 31 +++++++++++++++++----------
 src/anndata/_io/zarr.py               |  2 +-
 src/anndata/experimental/merge.py     |  2 +-
 tests/test_backed_sparse.py           |  2 +-
 tests/test_io_dispatched.py           | 26 +++++-----------------
 8 files changed, 36 insertions(+), 69 deletions(-)

diff --git a/src/anndata/_io/h5ad.py b/src/anndata/_io/h5ad.py
index 653b96f54..2cd2fca48 100644
--- a/src/anndata/_io/h5ad.py
+++ b/src/anndata/_io/h5ad.py
@@ -233,7 +233,7 @@ def read_h5ad(

     with h5py.File(filename, "r") as f:

-        def callback(func, elem_name: str, elem, dataset_kwargs, iospec):
+        def callback(func, elem_name: str, elem, iospec):
             if iospec.encoding_type == "anndata" or elem_name.endswith("/"):
                 return AnnData(
                     **{
diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 9349b491c..f5bb2173c 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -3,7 +3,6 @@
 from contextlib import contextmanager
 from functools import singledispatch
 from pathlib import Path, PurePosixPath
-from types import MappingProxyType
 from typing import TYPE_CHECKING

 import h5py
@@ -16,8 +15,7 @@
 from .registry import _LAZY_REGISTRY, IOSpec

 if TYPE_CHECKING:
-    from collections.abc import Mapping
-    from typing import Any, Literal, Union
+    from typing import Literal, Union

     from .registry import Reader

@@ -67,9 +65,7 @@ def _(x):
 @_LAZY_REGISTRY.register_read(ZarrGroup,
IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse_as_dask( - elem: H5Group | ZarrGroup, - _reader: Reader, - dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), + elem: H5Group | ZarrGroup, _reader: Reader, chunks: tuple[int, ...] | None = None ): import dask.array as da @@ -79,7 +75,6 @@ def read_sparse_as_dask( dtype = elem["data"].dtype is_csc: bool = elem.attrs["encoding-type"] == "csc_matrix" - chunks = dataset_kwargs.get("chunks", None) stride: int = _DEFAULT_STRIDE if chunks is not None: if len(chunks) != 2: @@ -129,9 +124,7 @@ def make_dask_chunk( @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( - elem: H5Array, - _reader: Reader, - dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), + elem: H5Array, _reader: Reader, chunks: tuple[int, ...] | None = None ): import dask.array as da @@ -139,8 +132,8 @@ def read_h5_array( elem_name = elem.name shape = tuple(elem.shape) dtype = elem.dtype - chunks: tuple[int, ...] = dataset_kwargs.get( - "chunks", (_DEFAULT_STRIDE,) * len(shape) + chunks: tuple[int, ...] = ( + chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(shape) ) def make_dask_chunk(block_id: tuple[int, int]): @@ -166,11 +159,9 @@ def make_dask_chunk(block_id: tuple[int, int]): @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( - elem: ZarrArray, - _reader: Reader, - dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), + elem: ZarrArray, _reader: Reader, chunks: tuple[int, ...] | None = None ): - chunks: tuple[int, ...] = dataset_kwargs.get("chunks", elem.chunks) + chunks: tuple[int, ...] = chunks if chunks is not None else elem.chunks import dask.array as da return da.from_zarr(elem, chunks=chunks) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index e04534c71..719c9975d 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -126,7 +126,6 @@ def read_basic( elem: H5File | H5Group | H5Array, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import h5ad @@ -151,7 +150,6 @@ def read_basic_zarr( elem: ZarrGroup | ZarrArray, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import zarr @@ -299,7 +297,6 @@ def read_anndata( elem: GroupStorageType | H5File, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> AnnData: d = {} for k in [ @@ -346,7 +343,6 @@ def read_mapping( elem: GroupStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> dict[str, RWAble]: return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -460,7 +456,6 @@ def read_array( elem: ArrayStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> npt.NDArray: return elem[()] @@ -482,7 +477,6 @@ def read_string_array( d: H5Array, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ): return read_array(d.asstr(), _reader=_reader) @@ -568,7 +562,6 @@ def read_recarray( d: ArrayStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> np.recarray | npt.NDArray: value = d[()] dtype = value.dtype @@ -785,7 +778,6 @@ def read_sparse( elem: GroupStorageType, *, 
_reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> sparse.spmatrix | SpArray: return sparse_dataset(elem).to_memory() @@ -835,7 +827,6 @@ def read_awkward( elem: GroupStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> AwkArray: from anndata.compat import awkward as ak @@ -909,7 +900,6 @@ def read_dataframe( elem: GroupStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> pd.DataFrame: columns = list(_read_attr(elem.attrs, "column-order")) idx_key = _read_attr(elem.attrs, "_index") @@ -955,7 +945,6 @@ def read_dataframe_0_1_0( elem: GroupStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> pd.DataFrame: columns = _read_attr(elem.attrs, "column-order") idx_key = _read_attr(elem.attrs, "_index") @@ -1031,7 +1020,6 @@ def read_categorical( elem: GroupStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> pd.Categorical: return pd.Categorical.from_codes( codes=_reader.read_elem(elem["codes"]), @@ -1087,7 +1075,6 @@ def read_nullable_integer( elem: GroupStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.IntegerArray( @@ -1103,7 +1090,6 @@ def read_nullable_boolean( elem: GroupStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.BooleanArray( @@ -1124,7 +1110,6 @@ def read_scalar( elem: ArrayStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> np.number: return elem[()] @@ -1176,7 +1161,6 @@ def read_hdf5_string( elem: H5Array, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> str: return elem.asstr()[()] @@ -1186,7 +1170,6 @@ def read_zarr_string( elem: ZarrArray, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> str: return str(elem[()]) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 8c5913850..62ba5564d 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -1,6 +1,5 @@ from __future__ import annotations -import inspect import warnings from collections.abc import Mapping from dataclasses import dataclass @@ -275,16 +274,28 @@ def read_elem( iospec = get_spec(elem) read_func = self.registry.get_read(type(elem), iospec, modifiers, reader=self) if self.callback is None: - return read_func(elem, dataset_kwargs=dataset_kwargs) - if "dataset_kwargs" not in inspect.getfullargspec(self.callback)[0]: + return read_func(elem) + return self.callback(read_func, elem.name, elem, iospec=iospec) + + +class DaskReader(Reader): + @report_read_key_on_error + def read_elem( + self, + elem: StorageType, + modifiers: frozenset[str] = frozenset(), + chunks: tuple[int, ...] | None = None, + ) -> InMemoryElem: + """Read an element from a store. See exported function for more details.""" + + iospec = get_spec(elem) + read_func = self.registry.get_read(type(elem), iospec, modifiers, reader=self) + if self.callback is None: warnings.warn( - "Callback does not accept dataset_kwargs. Ignoring dataset_kwargs.", + "Dask reading does not use a callback. 
Ignoring callback.", stacklevel=2, ) - return self.callback(read_func, elem.name, elem, iospec=iospec) - return self.callback( - read_func, elem.name, elem, dataset_kwargs=dataset_kwargs, iospec=iospec - ) + return read_func(elem, chunks=chunks) class Writer: @@ -385,9 +396,7 @@ def read_elem_as_dask( ------- DaskArray """ - return Reader(_LAZY_REGISTRY).read_elem( - elem, dataset_kwargs={"chunks": chunks} if chunks is not None else {} - ) + return DaskReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks) def write_elem( diff --git a/src/anndata/_io/zarr.py b/src/anndata/_io/zarr.py index 9d6f759ff..0e015244a 100644 --- a/src/anndata/_io/zarr.py +++ b/src/anndata/_io/zarr.py @@ -66,7 +66,7 @@ def read_zarr(store: str | Path | MutableMapping | zarr.Group) -> AnnData: f = zarr.open(store, mode="r") # Read with handling for backwards compat - def callback(func, elem_name: str, elem, dataset_kwargs, iospec): + def callback(func, elem_name: str, elem, iospec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): return AnnData( **{ diff --git a/src/anndata/experimental/merge.py b/src/anndata/experimental/merge.py index 8882f3c63..9690420ec 100644 --- a/src/anndata/experimental/merge.py +++ b/src/anndata/experimental/merge.py @@ -134,7 +134,7 @@ def read_as_backed(group: ZarrGroup | H5Group): BaseCompressedSparseDataset, Array or EAGER_TYPES are encountered. """ - def callback(func, elem_name: str, elem, dataset_kwargs, iospec): + def callback(func, elem_name: str, elem, iospec): if iospec.encoding_type in SPARSE_MATRIX: return sparse_dataset(elem) elif iospec.encoding_type in EAGER_TYPES: diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index f5e593273..4e5f5445d 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -70,7 +70,7 @@ def read_zarr_backed(path): f = zarr.open(path, mode="r") # Read with handling for backwards compat - def callback(func, elem_name, elem, iospec, dataset_kwargs): + def callback(func, elem_name, elem, iospec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): return AnnData( **{k: read_dispatched(v, callback) for k, v in elem.items()} diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py index 395e942c3..76e17ad2d 100644 --- a/tests/test_io_dispatched.py +++ b/tests/test_io_dispatched.py @@ -3,7 +3,6 @@ import re import h5py -import pytest import zarr from scipy import sparse @@ -19,7 +18,7 @@ def test_read_dispatched_w_regex(): - def read_only_axis_dfs(func, elem_name: str, elem, iospec, dataset_kwargs): + def read_only_axis_dfs(func, elem_name: str, elem, iospec): if iospec.encoding_type == "anndata": return func(elem) elif re.match(r"^/((obs)|(var))?(/.*)?$", elem_name): @@ -41,7 +40,7 @@ def read_only_axis_dfs(func, elem_name: str, elem, iospec, dataset_kwargs): def test_read_dispatched_dask(): import dask.array as da - def read_as_dask_array(func, elem_name: str, elem, iospec, dataset_kwargs): + def read_as_dask_array(func, elem_name: str, elem, iospec): if iospec.encoding_type in { "dataframe", "csr_matrix", @@ -78,7 +77,7 @@ def test_read_dispatched_null_case(): expected = read_elem(z) - def callback(read_func, elem_name, x, dataset_kwargs, iospec): + def callback(read_func, elem_name, x, iospec): return read_elem(x) actual = read_dispatched(z, callback) @@ -86,21 +85,6 @@ def callback(read_func, elem_name, x, dataset_kwargs, iospec): assert_equal(expected, actual) -def test_read_dispatched_warns_with_no_dataset_kwargs(): - adata = gen_adata((100, 100)) - z = zarr.group() 
- write_elem(z, "/", adata) - - def callback(read_func, elem_name, x, iospec): - return read_elem(x) - - with pytest.warns( - UserWarning, - match="Callback does not accept dataset_kwargs. Ignoring dataset_kwargs.", - ): - read_dispatched(z, callback) - - def test_write_dispatched_chunks(): from itertools import chain, repeat @@ -182,11 +166,11 @@ def zarr_writer(func, store, k, elem, dataset_kwargs, iospec): zarr_write_keys.append(k) func(store, k, elem, dataset_kwargs=dataset_kwargs) - def h5ad_reader(func, elem_name: str, elem, dataset_kwargs, iospec): + def h5ad_reader(func, elem_name: str, elem, iospec): h5ad_read_keys.append(elem_name) return func(elem) - def zarr_reader(func, elem_name: str, elem, dataset_kwargs, iospec): + def zarr_reader(func, elem_name: str, elem, iospec): zarr_read_keys.append(elem_name) return func(elem) From ae68731385759feae5510dfe66b82688977f5ea2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 18 Jul 2024 10:00:31 +0200 Subject: [PATCH 154/348] (fix): add message tp errpr --- src/anndata/_io/specs/lazy_methods.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index f5bb2173c..8fc251b98 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -80,7 +80,10 @@ def read_sparse_as_dask( if len(chunks) != 2: raise ValueError("`chunks` must be a tuple of two integers") if chunks[int(not is_csc)] != shape[int(not is_csc)]: - raise ValueError("Only the major axis can be chunked") + raise ValueError( + "Only the major axis can be chunked. " + f"Try setting chunks to {((-1, _DEFAULT_STRIDE) if is_csc else (_DEFAULT_STRIDE, -1))}" + ) stride = chunks[int(is_csc)] def make_dask_chunk( From f5e7760aba721c17b952f031c20e8b8a40a4a045 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Thu, 18 Jul 2024 10:01:03 +0200 Subject: [PATCH 155/348] Update tests/test_io_elementwise.py Co-authored-by: Isaac Virshup --- tests/test_io_elementwise.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index ac67dd215..80da79014 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -272,12 +272,11 @@ def test_read_lazy_subsets_nd_dask(store, n_dims, chunks): def test_read_lazy_h5_cluster(sparse_format, tmp_path): import dask.distributed as dd - file = h5py.File(tmp_path / "test.h5", "w") - store = file["/"] - arr_store = create_sparse_store(sparse_format, store) - X_dask_from_disk = read_elem_as_dask(arr_store["X"]) - X_from_disk = read_elem(arr_store["X"]) - file.close() + with h5py.File(tmp_path / "test.h5", "w") as file: + store = file["/"] + arr_store = create_sparse_store(sparse_format, store) + X_dask_from_disk = read_elem_as_dask(arr_store["X"]) + X_from_disk = read_elem(arr_store["X"]) with ( dd.LocalCluster(n_workers=1, threads_per_worker=1) as cluster, dd.Client(cluster) as _client, From 96b13a34645f249348df93aea1fc91b02e8365a2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 18 Jul 2024 13:00:11 +0200 Subject: [PATCH 156/348] (fix): correct `self.callback` check --- src/anndata/_io/specs/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 62ba5564d..7ad43fe3c 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -290,7 +290,7 @@ def read_elem( iospec = get_spec(elem) read_func = 
self.registry.get_read(type(elem), iospec, modifiers, reader=self) - if self.callback is None: + if self.callback is not None: warnings.warn( "Dask reading does not use a callback. Ignoring callback.", stacklevel=2, From 9c68e365414bc4f605db21c38167681fcf3e32b6 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 18 Jul 2024 13:22:07 +0200 Subject: [PATCH 157/348] (fix): erroneous diffs --- src/anndata/_io/specs/methods.py | 92 ++++++-------------------------- 1 file changed, 17 insertions(+), 75 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 719c9975d..1bf2d13a3 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -123,9 +123,7 @@ def wrapper( @_REGISTRY.register_read(H5Group, IOSpec("", "")) @_REGISTRY.register_read(H5Array, IOSpec("", "")) def read_basic( - elem: H5File | H5Group | H5Array, - *, - _reader: Reader, + elem: H5File | H5Group | H5Array, *, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import h5ad @@ -147,9 +145,7 @@ def read_basic( @_REGISTRY.register_read(ZarrGroup, IOSpec("", "")) @_REGISTRY.register_read(ZarrArray, IOSpec("", "")) def read_basic_zarr( - elem: ZarrGroup | ZarrArray, - *, - _reader: Reader, + elem: ZarrGroup | ZarrArray, *, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import zarr @@ -293,11 +289,7 @@ def write_anndata( @_REGISTRY.register_read(H5File, IOSpec("raw", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("raw", "0.1.0")) -def read_anndata( - elem: GroupStorageType | H5File, - *, - _reader: Reader, -) -> AnnData: +def read_anndata(elem: GroupStorageType | H5File, *, _reader: Reader) -> AnnData: d = {} for k in [ "X", @@ -339,11 +331,7 @@ def write_raw( @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) -def read_mapping( - elem: GroupStorageType, - *, - _reader: Reader, -) -> dict[str, RWAble]: +def read_mapping(elem: GroupStorageType, *, _reader: Reader) -> dict[str, RWAble]: return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -452,11 +440,7 @@ def write_basic_dask_h5( @_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) -def read_array( - elem: ArrayStorageType, - *, - _reader: Reader, -) -> npt.NDArray: +def read_array(elem: ArrayStorageType, *, _reader: Reader) -> npt.NDArray: return elem[()] @@ -473,11 +457,7 @@ def read_zarr_array_partial(elem, *, items=None, indices=(slice(None, None))): # arrays of strings @_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) -def read_string_array( - d: H5Array, - *, - _reader: Reader, -): +def read_string_array(d: H5Array, *, _reader: Reader): return read_array(d.asstr(), _reader=_reader) @@ -558,11 +538,7 @@ def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray: @_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0")) -def read_recarray( - d: ArrayStorageType, - *, - _reader: Reader, -) -> np.recarray | npt.NDArray: +def read_recarray(d: ArrayStorageType, *, _reader: Reader) -> np.recarray | npt.NDArray: value = d[()] dtype = value.dtype value = _from_fixed_length_strings(value) @@ -775,9 +751,7 @@ def 
chunk_slice(start: int, stop: int) -> tuple[slice | None, slice | None]: @_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse( - elem: GroupStorageType, - *, - _reader: Reader, + elem: GroupStorageType, *, _reader: Reader ) -> sparse.spmatrix | SpArray: return sparse_dataset(elem).to_memory() @@ -823,11 +797,7 @@ def write_awkward( @_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0")) -def read_awkward( - elem: GroupStorageType, - *, - _reader: Reader, -) -> AwkArray: +def read_awkward(elem: GroupStorageType, *, _reader: Reader) -> AwkArray: from anndata.compat import awkward as ak form = _read_attr(elem.attrs, "form") @@ -896,11 +866,7 @@ def write_dataframe( @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) -def read_dataframe( - elem: GroupStorageType, - *, - _reader: Reader, -) -> pd.DataFrame: +def read_dataframe(elem: GroupStorageType, *, _reader: Reader) -> pd.DataFrame: columns = list(_read_attr(elem.attrs, "column-order")) idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -941,11 +907,7 @@ def read_dataframe_partial( @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.1.0")) -def read_dataframe_0_1_0( - elem: GroupStorageType, - *, - _reader: Reader, -) -> pd.DataFrame: +def read_dataframe_0_1_0(elem: GroupStorageType, *, _reader: Reader) -> pd.DataFrame: columns = _read_attr(elem.attrs, "column-order") idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -1016,11 +978,7 @@ def write_categorical( @_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) -def read_categorical( - elem: GroupStorageType, - *, - _reader: Reader, -) -> pd.Categorical: +def read_categorical(elem: GroupStorageType, *, _reader: Reader) -> pd.Categorical: return pd.Categorical.from_codes( codes=_reader.read_elem(elem["codes"]), categories=_reader.read_elem(elem["categories"]), @@ -1072,9 +1030,7 @@ def write_nullable_integer( @_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0")) def read_nullable_integer( - elem: GroupStorageType, - *, - _reader: Reader, + elem: GroupStorageType, *, _reader: Reader ) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.IntegerArray( @@ -1087,9 +1043,7 @@ def read_nullable_integer( @_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0")) def read_nullable_boolean( - elem: GroupStorageType, - *, - _reader: Reader, + elem: GroupStorageType, *, _reader: Reader ) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.BooleanArray( @@ -1106,11 +1060,7 @@ def read_nullable_boolean( @_REGISTRY.register_read(H5Array, IOSpec("numeric-scalar", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("numeric-scalar", "0.2.0")) -def read_scalar( - elem: ArrayStorageType, - *, - _reader: Reader, -) -> np.number: +def read_scalar(elem: ArrayStorageType, *, _reader: Reader) -> np.number: return elem[()] @@ -1157,20 +1107,12 @@ def write_hdf5_scalar( @_REGISTRY.register_read(H5Array, IOSpec("string", "0.2.0")) -def read_hdf5_string( - elem: H5Array, - *, 
- _reader: Reader, -) -> str: +def read_hdf5_string(elem: H5Array, *, _reader: Reader) -> str: return elem.asstr()[()] @_REGISTRY.register_read(ZarrArray, IOSpec("string", "0.2.0")) -def read_zarr_string( - elem: ZarrArray, - *, - _reader: Reader, -) -> str: +def read_zarr_string(elem: ZarrArray, *, _reader: Reader) -> str: return str(elem[()]) From 410aeda2df1841bfb85bfc74233b6620137ced9e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 18 Jul 2024 13:22:48 +0200 Subject: [PATCH 158/348] (fix): extra `read_elem` `dataset_kwargs` --- src/anndata/_io/specs/registry.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 7ad43fe3c..c749c5d0b 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -267,7 +267,6 @@ def read_elem( self, elem: StorageType, modifiers: frozenset[str] = frozenset(), - dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> InMemoryElem: """Read an element from a store. See exported function for more details.""" From 31a30c4327433bfba354e913fcfed8e859a840eb Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 18 Jul 2024 13:26:20 +0200 Subject: [PATCH 159/348] (fix): remove more `dataset_kwargs` nonsense --- src/anndata/_types.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 4b45e54e7..9fbcf57b2 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -29,7 +29,6 @@ if TYPE_CHECKING: from collections.abc import Mapping - from types import MappingProxyType from typing import Any, TypeAlias from ._io.specs.registry import IOSpec, Reader, Writer @@ -95,8 +94,6 @@ class Read(Protocol[CovariantInMemoryType]): def __call__( self, elem: StorageType | H5File, - *, - dataset_kwargs: MappingProxyType, ) -> CovariantInMemoryType: """Low-level reading function for an element. @@ -104,9 +101,6 @@ def __call__( ---------- elem The element to read from. - dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask:index`. - Returns ------- The element read from the store. @@ -146,7 +140,7 @@ def __call__( v The element to write out. dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask:index`. + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. """ ... @@ -160,7 +154,6 @@ def __call__( elem: StorageType, *, iospec: IOSpec, - dataset_kwargs: MappingProxyType, ) -> InvariantInMemoryType: """ Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. @@ -175,8 +168,6 @@ def __call__( The element to read from. iospec Internal AnnData encoding specification for the element. - dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask:index`. 
Returns ------- From 80fe8cb32c7c487e252cf7338f6b19deeceeb981 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 18 Jul 2024 13:27:48 +0200 Subject: [PATCH 160/348] (chore): add docs --- docs/api.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/api.md b/docs/api.md index 36ebeac88..92139fe06 100644 --- a/docs/api.md +++ b/docs/api.md @@ -121,6 +121,7 @@ Low level methods for reading and writing elements of an `AnnData` object to a s experimental.read_elem experimental.write_elem + experimental.read_elem_as_dask ``` Utilities for customizing the IO process: From b3142487bdd87f579e9be75f2a2aab80b21b4e91 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 18 Jul 2024 13:35:44 +0200 Subject: [PATCH 161/348] (fix): use `block_info` for dense --- src/anndata/_io/specs/lazy_methods.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 8fc251b98..823254bac 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -139,13 +139,22 @@ def read_h5_array( chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(shape) ) - def make_dask_chunk(block_id: tuple[int, int]): + def make_dask_chunk( + block_info: Union[ # noqa: UP007 + dict[ + Literal[None], + dict[str, Union[tuple[int, ...], list[tuple[int, ...]]]], # noqa: UP007 + ], + None, + ] = None, + ): + if block_info is None: + raise ValueError("Block info is required") with maybe_open_h5(path, elem_name) as f: idx = () for i in range(len(shape)): - start = block_id[i] * chunks[i] - stop = min(((block_id[i] * chunks[i]) + chunks[i]), shape[i]) - idx += (slice(start, stop),) + array_location = block_info[None]["array-location"][i] + idx += (slice(array_location[0], array_location[1]),) return f[idx] chunk_layout = tuple( From 02d47352110806d6a605bfdc1599d1243a941cf9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 18 Jul 2024 13:39:06 +0200 Subject: [PATCH 162/348] (fix): more erroneous diffs --- tests/test_io_dispatched.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py index 76e17ad2d..833b23e83 100644 --- a/tests/test_io_dispatched.py +++ b/tests/test_io_dispatched.py @@ -76,11 +76,7 @@ def test_read_dispatched_null_case(): write_elem(z, "/", adata) expected = read_elem(z) - - def callback(read_func, elem_name, x, iospec): - return read_elem(x) - - actual = read_dispatched(z, callback) + actual = read_dispatched(z, lambda _, __, x, **___: read_elem(x)) assert_equal(expected, actual) From 6e5534a639d59c404a434c5d07144fad5184689d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 18 Jul 2024 13:44:07 +0200 Subject: [PATCH 163/348] (fix): use context again --- tests/test_io_elementwise.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index 80da79014..5dd1791d1 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -312,13 +312,15 @@ def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks): def test_read_lazy_bad_chunk_kwargs(tmp_path): arr_type = "csr" - file = h5py.File(tmp_path / "test.h5", "w") - store = file["/"] - arr_store = create_sparse_store(arr_type, store) - with pytest.raises(ValueError, match=r"`chunks` must be a tuple of two integers"): - read_elem_as_dask(arr_store["X"], chunks=(SIZE,)) - with pytest.raises(ValueError, match=r"Only the major axis can be chunked"): 
-        read_elem_as_dask(arr_store["X"], chunks=(SIZE, 10))
+    with h5py.File(tmp_path / "test.h5", "w") as file:
+        store = file["/"]
+        arr_store = create_sparse_store(arr_type, store)
+        with pytest.raises(
+            ValueError, match=r"`chunks` must be a tuple of two integers"
+        ):
+            read_elem_as_dask(arr_store["X"], chunks=(SIZE,))
+        with pytest.raises(ValueError, match=r"Only the major axis can be chunked"):
+            read_elem_as_dask(arr_store["X"], chunks=(SIZE, 10))


 @pytest.mark.parametrize("sparse_format", ["csr", "csc"])

From d26cfe81a91639e261ec9f64048e60717fbb47d7 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 22 Jul 2024 11:32:27 +0200
Subject: [PATCH 164/348] (fix): change size by dimension in tests

---
 tests/test_io_elementwise.py | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index 5dd1791d1..62284a0c9 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -74,13 +74,15 @@ def sparse_format(request):


 def create_dense_store(store, n_dims: int = 2):
-    X = np.random.randn(*((SIZE,) * n_dims))
+    X = np.random.randn(*[SIZE * (i + 1) for i in range(n_dims)])

     write_elem(store, "X", X)
     return store


-def create_sparse_store(sparse_format: Literal["csc", "csr"], store: G) -> G:
+def create_sparse_store(
+    sparse_format: Literal["csc", "csr"], store: G, shape=(SIZE, SIZE * 2)
+) -> G:
     """Returns a store

     Parameters
@@ -95,14 +97,15 @@ def create_sparse_store(sparse_format: Literal["csc", "csr"], store: G) -> G:
     import dask.array as da

     X = sparse.random(
-        SIZE,
-        SIZE,
+        shape[0],
+        shape[1],
         format=sparse_format,
         density=0.01,
         random_state=np.random.default_rng(),
     )
     X_dask = da.from_array(
-        X, chunks=(100 if format == "csr" else SIZE, SIZE if format == "csr" else 100)
+        X,
+        # use the `sparse_format` argument, not the `format` builtin, which would
+        # otherwise shadow the intended comparison and always be falsy here
+        chunks=(100 if sparse_format == "csr" else SIZE, SIZE * 2 if sparse_format == "csr" else 100),
     )

     write_elem(store, "X", X)
@@ -233,11 +236,18 @@ def test_read_lazy_2d_dask(sparse_format, store):
     assert_equal(X_from_disk, X_dask_from_disk)
     random_int_indices = np.random.randint(0, SIZE, (SIZE // 10,))
     random_int_indices.sort()
-    random_bool_mask = np.random.randn(SIZE) > 0
     index_slice = slice(0, SIZE // 10)
-    for index in [random_int_indices, index_slice, random_bool_mask]:
+    for index in [random_int_indices, index_slice]:
         assert_equal(X_from_disk[index, :], X_dask_from_disk[index, :])
         assert_equal(X_from_disk[:, index], X_dask_from_disk[:, index])
+    random_bool_mask = np.random.randn(SIZE) > 0
+    assert_equal(
+        X_from_disk[random_bool_mask, :], X_dask_from_disk[random_bool_mask, :]
+    )
+    random_bool_mask = np.random.randn(SIZE * 2) > 0
+    assert_equal(
+        X_from_disk[:, random_bool_mask], X_dask_from_disk[:, random_bool_mask]
+    )

     assert arr_store["X_dask/indptr"].dtype == np.int64
     assert arr_store["X_dask/indices"].dtype == np.int64
@@ -289,7 +299,7 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path):
     [
         ("dense", (100, 100)),
         ("csc", (SIZE, 10)),
-        ("csr", (10, SIZE)),
+        ("csr", (10, SIZE * 2)),
         ("csc", None),
         ("csr", None),
     ],
@@ -304,8 +314,9 @@ def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks):
     if chunks is not None:
        assert X_dask_from_disk.chunksize == chunks
     else:
+        minor_index = int(arr_type == "csr")
         # assert that sparse chunks are set correctly by default
-        assert X_dask_from_disk.chunksize[bool(arr_type == "csr")] == SIZE
+        assert X_dask_from_disk.chunksize[minor_index] == SIZE * (1 + minor_index)

     X_from_disk = read_elem(arr_store["X"])
     assert_equal(X_from_disk, X_dask_from_disk)

From 
94e43a33bff09af3a4ef01d09a7ef9f287934d4e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 22 Jul 2024 12:23:24 +0200 Subject: [PATCH 165/348] (refactor): clean up `get_elem_name` --- src/anndata/_core/file_backing.py | 17 ++++++++++++++++- src/anndata/_io/specs/lazy_methods.py | 21 +++------------------ src/anndata/_io/specs/registry.py | 18 ------------------ 3 files changed, 19 insertions(+), 37 deletions(-) diff --git a/src/anndata/_core/file_backing.py b/src/anndata/_core/file_backing.py index d283a1dfd..f7dcae8b1 100644 --- a/src/anndata/_core/file_backing.py +++ b/src/anndata/_core/file_backing.py @@ -2,7 +2,7 @@ from collections.abc import Mapping from functools import singledispatch -from pathlib import Path +from pathlib import Path, PurePosixPath from typing import TYPE_CHECKING import h5py @@ -161,3 +161,18 @@ def _(x): @filename.register(ZarrGroup) def _(x): return x.store.path + + +@singledispatch +def get_elem_name(x): + raise NotImplementedError(f"Not implemented for {type(x)}") + + +@get_elem_name.register(h5py.Group) +def _(x): + return x.name + + +@get_elem_name.register(ZarrGroup) +def _(x): + return PurePosixPath(x.path).name diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 823254bac..da457f0ea 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -1,8 +1,7 @@ from __future__ import annotations from contextlib import contextmanager -from functools import singledispatch -from pathlib import Path, PurePosixPath +from pathlib import Path from typing import TYPE_CHECKING import h5py @@ -10,6 +9,7 @@ from scipy import sparse import anndata as ad +from anndata._core.file_backing import filename, get_elem_name from anndata.compat import H5Array, H5Group, ZarrArray, ZarrGroup from .registry import _LAZY_REGISTRY, IOSpec @@ -45,21 +45,6 @@ def compute_chunk_layout_for_axis_shape( return chunk -@singledispatch -def get_elem_name(x): - raise NotImplementedError(f"Not implemented for {type(x)}") - - -@get_elem_name.register(H5Group) -def _(x): - return x.name - - -@get_elem_name.register(ZarrGroup) -def _(x): - return PurePosixPath(x.path).name - - @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @@ -69,7 +54,7 @@ def read_sparse_as_dask( ): import dask.array as da - path_or_group = Path(elem.file.filename) if isinstance(elem, H5Group) else elem + path_or_group = Path(filename(elem)) if isinstance(elem, H5Group) else elem elem_name = get_elem_name(elem) shape: tuple[int, int] = tuple(elem.attrs["shape"]) dtype = elem["data"].dtype diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index c749c5d0b..1ca54b5ce 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -438,21 +438,3 @@ def read_elem_partial( type(elem), get_spec(elem), frozenset(modifiers) ) return read_partial(elem, items=items, indices=indices) - - -@singledispatch -def elem_key(elem) -> str: - return elem.name - - -# raise NotImplementedError() - -# @elem_key.register(ZarrGroup) -# @elem_key.register(ZarrArray) -# def _(elem): -# return elem.name - -# @elem_key.register(H5Array) -# @elem_key.register(H5Group) -# def _(elem): -# re From 51600168693daa98ed33515e5f88c05edbcbd9f8 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 22 Jul 2024 12:28:36 +0200 Subject: [PATCH 166/348] (fix): try new sphinx 
for error --- docs/conf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index f7fe9d1be..f943fbb60 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -110,7 +110,6 @@ suppress_warnings = [ "ref.citation", "myst.header", # https://github.com/executablebooks/MyST-Parser/issues/262 - "autosummary.import_cycle", # https://github.com/sphinx-doc/sphinx/issues/12589 ] From 43da9a3e18e9eb4aed2871f59a47b5b0aa810a46 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 22 Jul 2024 12:32:11 +0200 Subject: [PATCH 167/348] (fix): return type --- src/anndata/_io/specs/lazy_methods.py | 8 +++++--- src/anndata/_io/specs/registry.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index da457f0ea..ba5331de2 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -17,6 +17,8 @@ if TYPE_CHECKING: from typing import Literal, Union + from anndata.compat import DaskArray + from .registry import Reader @@ -51,7 +53,7 @@ def compute_chunk_layout_for_axis_shape( @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse_as_dask( elem: H5Group | ZarrGroup, _reader: Reader, chunks: tuple[int, ...] | None = None -): +) -> DaskArray: import dask.array as da path_or_group = Path(filename(elem)) if isinstance(elem, H5Group) else elem @@ -113,7 +115,7 @@ def make_dask_chunk( @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( elem: H5Array, _reader: Reader, chunks: tuple[int, ...] | None = None -): +) -> DaskArray: import dask.array as da path = Path(elem.file.filename) @@ -157,7 +159,7 @@ def make_dask_chunk( @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( elem: ZarrArray, _reader: Reader, chunks: tuple[int, ...] | None = None -): +) -> DaskArray: chunks: tuple[int, ...] = chunks if chunks is not None else elem.chunks import dask.array as da diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 1ca54b5ce..822c89e4b 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -284,7 +284,7 @@ def read_elem( elem: StorageType, modifiers: frozenset[str] = frozenset(), chunks: tuple[int, ...] | None = None, - ) -> InMemoryElem: + ) -> DaskArray: """Read an element from a store. See exported function for more details.""" iospec = get_spec(elem) From 9735ced2b618472f66174bc122629109ff26e615 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 22 Jul 2024 12:41:24 +0200 Subject: [PATCH 168/348] (fix): protocol for reading --- src/anndata/_io/specs/lazy_methods.py | 10 ++++++---- src/anndata/_io/specs/registry.py | 15 +++++++++++---- src/anndata/_types.py | 27 +++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index ba5331de2..dd99c46ba 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -19,7 +19,7 @@ from anndata.compat import DaskArray - from .registry import Reader + from .registry import DaskReader @contextmanager @@ -52,7 +52,9 @@ def compute_chunk_layout_for_axis_shape( @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse_as_dask( - elem: H5Group | ZarrGroup, _reader: Reader, chunks: tuple[int, ...] 
| None = None + elem: H5Group | ZarrGroup, + _reader: DaskReader, + chunks: tuple[int, ...] | None = None, ) -> DaskArray: import dask.array as da @@ -114,7 +116,7 @@ def make_dask_chunk( @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( - elem: H5Array, _reader: Reader, chunks: tuple[int, ...] | None = None + elem: H5Array, _reader: DaskReader, chunks: tuple[int, ...] | None = None ) -> DaskArray: import dask.array as da @@ -158,7 +160,7 @@ def make_dask_chunk( @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( - elem: ZarrArray, _reader: Reader, chunks: tuple[int, ...] | None = None + elem: ZarrArray, _reader: DaskReader, chunks: tuple[int, ...] | None = None ) -> DaskArray: chunks: tuple[int, ...] = chunks if chunks is not None else elem.chunks import dask.array as da diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 822c89e4b..faf6ff9ba 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -20,6 +20,7 @@ InMemoryElem, Read, ReadCallback, + ReadDask, Write, WriteCallback, _ReadInternal, @@ -81,7 +82,9 @@ def wrapper(g: GroupStorageType, k: str, *args, **kwargs): class IORegistry: def __init__(self): - self.read: dict[tuple[type, IOSpec, frozenset[str]], _ReadInternal] = {} + self.read: dict[ + tuple[type, IOSpec, frozenset[str]], _ReadInternal | ReadDask + ] = {} self.read_partial: dict[tuple[type, IOSpec, frozenset[str]], Callable] = {} self.write: dict[ tuple[type, type | tuple[type, str], frozenset[str]], _WriteInternal @@ -163,7 +166,7 @@ def get_read( modifiers: frozenset[str] = frozenset(), *, reader: Reader, - ) -> Read: + ) -> Read | ReadDask: if (src_type, spec, modifiers) not in self.read: raise IORegistryError._from_read_parts("read", self.read, src_type, spec) internal = self.read[(src_type, spec, modifiers)] @@ -271,7 +274,9 @@ def read_elem( """Read an element from a store. See exported function for more details.""" iospec = get_spec(elem) - read_func = self.registry.get_read(type(elem), iospec, modifiers, reader=self) + read_func: Read = self.registry.get_read( + type(elem), iospec, modifiers, reader=self + ) if self.callback is None: return read_func(elem) return self.callback(read_func, elem.name, elem, iospec=iospec) @@ -288,7 +293,9 @@ def read_elem( """Read an element from a store. See exported function for more details.""" iospec = get_spec(elem) - read_func = self.registry.get_read(type(elem), iospec, modifiers, reader=self) + read_func: ReadDask = self.registry.get_read( + type(elem), iospec, modifiers, reader=self + ) if self.callback is not None: warnings.warn( "Dask reading does not use a callback. Ignoring callback.", diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 9fbcf57b2..f091b701a 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -31,6 +31,8 @@ from collections.abc import Mapping from typing import Any, TypeAlias + from anndata._io.specs.registry import DaskReader + from ._io.specs.registry import IOSpec, Reader, Writer from .compat import H5File @@ -108,6 +110,31 @@ def __call__( ... +class ReadDask(Protocol): + def __call__( + self, + elem: StorageType | H5File, + *, + _reader: DaskReader, + chunks: tuple[int, ...] | None = None, + ) -> DaskArray: + """Low-level reading function for a dask element. + + Parameters + ---------- + elem + The element to read from. + _reader + The parent object that will be used to read the element. + chunks + The chunks size to be used. 
+ Returns + ------- + The dask element read from the store. + """ + ... + + class _WriteInternal(Protocol[ContravariantInMemoryType]): def __call__( self, From f1730c3a9938cfb9f44c53e1573888821885282a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 22 Jul 2024 13:21:46 +0200 Subject: [PATCH 169/348] (fix): bring back ignored warning --- docs/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/conf.py b/docs/conf.py index f943fbb60..f7fe9d1be 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -110,6 +110,7 @@ suppress_warnings = [ "ref.citation", "myst.header", # https://github.com/executablebooks/MyST-Parser/issues/262 + "autosummary.import_cycle", # https://github.com/sphinx-doc/sphinx/issues/12589 ] From 9861b56771af30bd747f39d160dd0beebf77dec5 Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 22 Jul 2024 14:00:03 +0200 Subject: [PATCH 170/348] Fix docs --- docs/_templates/autosummary/class.rst | 4 ++-- docs/conf.py | 3 +-- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/_templates/autosummary/class.rst b/docs/_templates/autosummary/class.rst index b4e7370aa..8fe1d69d0 100644 --- a/docs/_templates/autosummary/class.rst +++ b/docs/_templates/autosummary/class.rst @@ -13,7 +13,7 @@ .. autosummary:: :toctree: . {% for item in attributes %} - ~{{ fullname }}.{{ item }} + ~{{ name }}.{{ item }} {%- endfor %} {% endif %} {% endblock %} @@ -26,7 +26,7 @@ :toctree: . {% for item in methods %} {%- if item != '__init__' %} - ~{{ fullname }}.{{ item }} + ~{{ name }}.{{ item }} {%- endif -%} {%- endfor %} {% endif %} diff --git a/docs/conf.py b/docs/conf.py index f7fe9d1be..5b1b95f30 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -27,7 +27,7 @@ # default settings templates_path = ["_templates"] html_static_path = ["_static"] -source_suffix = [".rst", ".md"] +source_suffix = {".rst": "restructuredtext", ".md": "markdown"} master_doc = "index" default_role = "literal" exclude_patterns = [ @@ -110,7 +110,6 @@ suppress_warnings = [ "ref.citation", "myst.header", # https://github.com/executablebooks/MyST-Parser/issues/262 - "autosummary.import_cycle", # https://github.com/sphinx-doc/sphinx/issues/12589 ] diff --git a/pyproject.toml b/pyproject.toml index 43e5ab416..ef97699f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ dev = [ "pytest-xdist", ] doc = [ - "sphinx>=4.4", + "sphinx>=7.4.6", "sphinx-book-theme>=1.1.0", "sphinx-autodoc-typehints>=2.2.0", "sphinx-issues", From 235096a9fdb2bb983b456d676d5309b0ba560a2c Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 22 Jul 2024 16:03:45 +0200 Subject: [PATCH 171/348] almost fix typing --- src/anndata/_io/specs/lazy_methods.py | 5 ++-- src/anndata/_io/specs/registry.py | 34 +++++++++++------------ src/anndata/_types.py | 40 ++++++++++++--------------- 3 files changed, 36 insertions(+), 43 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index dd99c46ba..0b9def583 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -53,6 +53,7 @@ def compute_chunk_layout_for_axis_shape( @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse_as_dask( elem: H5Group | ZarrGroup, + *, _reader: DaskReader, chunks: tuple[int, ...] | None = None, ) -> DaskArray: @@ -116,7 +117,7 @@ def make_dask_chunk( @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( - elem: H5Array, _reader: DaskReader, chunks: tuple[int, ...] 
| None = None + elem: H5Array, *, _reader: DaskReader, chunks: tuple[int, ...] | None = None ) -> DaskArray: import dask.array as da @@ -160,7 +161,7 @@ def make_dask_chunk( @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( - elem: ZarrArray, _reader: DaskReader, chunks: tuple[int, ...] | None = None + elem: ZarrArray, *, _reader: DaskReader, chunks: tuple[int, ...] | None = None ) -> DaskArray: chunks: tuple[int, ...] = chunks if chunks is not None else elem.chunks import dask.array as da diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index faf6ff9ba..e3003cc52 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -5,25 +5,23 @@ from dataclasses import dataclass from functools import partial, singledispatch, wraps from types import MappingProxyType -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Generic, TypeVar from anndata._io.utils import report_read_key_on_error, report_write_key_on_error +from anndata._types import Read, ReadDask, _ReadDaskInternal, _ReadInternal from anndata.compat import DaskArray, _read_attr if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable - from typing import Any, TypeVar + from typing import Any from anndata._core.storage import StorageType from anndata._types import ( GroupStorageType, InMemoryElem, - Read, ReadCallback, - ReadDask, Write, WriteCallback, - _ReadInternal, _WriteInternal, ) @@ -80,11 +78,13 @@ def wrapper(g: GroupStorageType, k: str, *args, **kwargs): return decorator -class IORegistry: +_R = TypeVar("_R", _ReadInternal, _ReadDaskInternal) +R = TypeVar("R", Read, ReadDask) + + +class IORegistry(Generic[_R, R]): def __init__(self): - self.read: dict[ - tuple[type, IOSpec, frozenset[str]], _ReadInternal | ReadDask - ] = {} + self.read: dict[tuple[type, IOSpec, frozenset[str]], _R] = {} self.read_partial: dict[tuple[type, IOSpec, frozenset[str]], Callable] = {} self.write: dict[ tuple[type, type | tuple[type, str], frozenset[str]], _WriteInternal @@ -149,7 +149,7 @@ def register_read( src_type: type, spec: IOSpec | Mapping[str, str], modifiers: Iterable[str] = frozenset(), - ) -> Callable[[_ReadInternal[T]], _ReadInternal[T]]: + ) -> Callable[[_R], _R]: spec = proc_spec(spec) modifiers = frozenset(modifiers) @@ -166,7 +166,7 @@ def get_read( modifiers: frozenset[str] = frozenset(), *, reader: Reader, - ) -> Read | ReadDask: + ) -> R: if (src_type, spec, modifiers) not in self.read: raise IORegistryError._from_read_parts("read", self.read, src_type, spec) internal = self.read[(src_type, spec, modifiers)] @@ -212,8 +212,8 @@ def get_spec(self, elem: Any) -> IOSpec: return self.write_specs[type(elem)] -_REGISTRY = IORegistry() -_LAZY_REGISTRY = IORegistry() +_REGISTRY: IORegistry[_ReadInternal, Read] = IORegistry() +_LAZY_REGISTRY: IORegistry[_ReadDaskInternal, ReadDask] = IORegistry() @singledispatch @@ -290,17 +290,15 @@ def read_elem( modifiers: frozenset[str] = frozenset(), chunks: tuple[int, ...] | None = None, ) -> DaskArray: - """Read an element from a store. See exported function for more details.""" + """Read a dask element from a store. See exported function for more details.""" iospec = get_spec(elem) read_func: ReadDask = self.registry.get_read( type(elem), iospec, modifiers, reader=self ) if self.callback is not None: - warnings.warn( - "Dask reading does not use a callback. Ignoring callback.", - stacklevel=2, - ) + msg = "Dask reading does not use a callback. Ignoring callback." 
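+            # the lazy registry path never invokes the callback, so warn
+            # rather than silently dropping it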
+ warnings.warn(msg, stacklevel=2) return read_func(elem, chunks=chunks) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index f091b701a..3549152f5 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -34,7 +34,6 @@ from anndata._io.specs.registry import DaskReader from ._io.specs.registry import IOSpec, Reader, Writer - from .compat import H5File __all__ = [ "ArrayStorageType", @@ -82,21 +81,22 @@ ) InvariantInMemoryType = TypeVar("InvariantInMemoryType", bound="InMemoryElem") +SCo = TypeVar("SCo", covariant=True, bound=StorageType) +SCon = TypeVar("SCon", contravariant=True, bound=StorageType) -class _ReadInternal(Protocol[CovariantInMemoryType]): - def __call__( - self, - elem: StorageType | H5File, - *, - _reader: Reader, - ) -> CovariantInMemoryType: ... +class _ReadInternal(Protocol[SCon, CovariantInMemoryType]): + def __call__(self, elem: SCon, *, _reader: Reader) -> CovariantInMemoryType: ... -class Read(Protocol[CovariantInMemoryType]): + +class _ReadDaskInternal(Protocol[SCon]): def __call__( - self, - elem: StorageType | H5File, - ) -> CovariantInMemoryType: + self, elem: SCon, *, _reader: DaskReader, chunks: tuple[int, ...] | None = None + ) -> DaskArray: ... + + +class Read(Protocol[SCon, CovariantInMemoryType]): + def __call__(self, elem: SCon) -> CovariantInMemoryType: """Low-level reading function for an element. Parameters @@ -110,13 +110,9 @@ def __call__( ... -class ReadDask(Protocol): +class ReadDask(Protocol[SCon]): def __call__( - self, - elem: StorageType | H5File, - *, - _reader: DaskReader, - chunks: tuple[int, ...] | None = None, + self, elem: SCon, *, chunks: tuple[int, ...] | None = None ) -> DaskArray: """Low-level reading function for a dask element. @@ -124,10 +120,8 @@ def __call__( ---------- elem The element to read from. - _reader - The parent object that will be used to read the element. chunks - The chunks size to be used. + The chunk size to be used. Returns ------- The dask element read from the store. @@ -172,11 +166,11 @@ def __call__( ... 
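The variance split introduced above (`SCo`/`SCon`) is the subtle part of this typing change: a reader *consumes* a storage type, so its storage parameter must be contravariant. A minimal, self-contained sketch of why, outside the anndata codebase (all names below are illustrative):

```python
from typing import Protocol, TypeVar

class H5Array: ...
class ZarrArray: ...

S_contra = TypeVar("S_contra", contravariant=True)

class Read(Protocol[S_contra]):
    def __call__(self, elem: S_contra) -> object: ...

def read_any(elem: H5Array | ZarrArray) -> object:
    # a reader that can handle every storage type
    return elem

# contravariance lets the general reader stand in wherever a reader
# of the narrower storage type is expected
h5_reader: Read[H5Array] = read_any
```

(`SCo` is the covariant counterpart, used by the callback protocol below.)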
-class ReadCallback(Protocol[InvariantInMemoryType]): +class ReadCallback(Protocol[SCo, InvariantInMemoryType]): def __call__( self, /, - read_func: Read[InvariantInMemoryType], + read_func: Read[SCo, InvariantInMemoryType], elem_name: str, elem: StorageType, *, From dce9f07a271316a47498c63b3f4c11ca12e2810b Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 22 Jul 2024 16:26:42 +0200 Subject: [PATCH 172/348] add wrapper --- src/anndata/_io/specs/lazy_methods.py | 61 ++++++++++++++------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 0b9def583..33c7aba6b 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -1,8 +1,9 @@ from __future__ import annotations from contextlib import contextmanager +from functools import wraps from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, ParamSpec, TypeVar, Union import h5py import numpy as np @@ -15,7 +16,8 @@ from .registry import _LAZY_REGISTRY, IOSpec if TYPE_CHECKING: - from typing import Literal, Union + from collections.abc import Callable + from typing import Concatenate from anndata.compat import DaskArray @@ -47,6 +49,29 @@ def compute_chunk_layout_for_axis_shape( return chunk +P = ParamSpec("P") +R = TypeVar("R") +BlockInfo = dict[ + Literal[None], + dict[str, Union[tuple[int, ...], list[tuple[int, ...]]]], +] + + +def require_block_info( + f: Callable[Concatenate[BlockInfo, P], R], +) -> Callable[Concatenate[BlockInfo | None, P], R]: + @wraps(f) + def wrapper( + block_info: BlockInfo | None = None, *args: P.args, **kwargs: P.kwargs + ) -> R: + if block_info is None: + msg = "Block info is required" + raise ValueError(msg) + return f(block_info, *args, **kwargs) + + return wrapper + + @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @@ -76,19 +101,10 @@ def read_sparse_as_dask( ) stride = chunks[int(is_csc)] - def make_dask_chunk( - block_info: Union[ # noqa: UP007 - dict[ - Literal[None], - dict[str, Union[tuple[int, ...], list[tuple[int, ...]]]], # noqa: UP007 - ], - None, - ] = None, - ): + @require_block_info + def make_dask_chunk(block_info: BlockInfo): # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` # https://github.com/scverse/anndata/issues/1105 - if block_info is None: - raise ValueError("Block info is required") with maybe_open_h5(path_or_group, elem_name) as f: mtx = ad.experimental.sparse_dataset(f) array_location = block_info[None]["array-location"] @@ -129,17 +145,8 @@ def read_h5_array( chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(shape) ) - def make_dask_chunk( - block_info: Union[ # noqa: UP007 - dict[ - Literal[None], - dict[str, Union[tuple[int, ...], list[tuple[int, ...]]]], # noqa: UP007 - ], - None, - ] = None, - ): - if block_info is None: - raise ValueError("Block info is required") + @require_block_info + def make_dask_chunk(block_info: BlockInfo): with maybe_open_h5(path, elem_name) as f: idx = () for i in range(len(shape)): @@ -152,11 +159,7 @@ def make_dask_chunk( for i in range(len(shape)) ) - return da.map_blocks( - make_dask_chunk, - dtype=dtype, - chunks=chunk_layout, - ) + return da.map_blocks(make_dask_chunk, dtype=dtype, chunks=chunk_layout) 
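For reference, the `block_info` contract that `require_block_info` guards: `dask.array.map_blocks` passes each task a mapping whose `None` key describes the output block being produced, including `"array-location"`, a `(start, stop)` pair per axis. A standalone example of that contract (plain dask, no anndata):

```python
import dask.array as da
import numpy as np

def load_block(block_info=None):
    # block_info[None] describes the chunk this task must produce
    (r0, r1), (c0, c1) = block_info[None]["array-location"]
    # stand-in for "open the file and read exactly this slice"
    return np.full((r1 - r0, c1 - c0), fill_value=r0, dtype=float)

arr = da.map_blocks(load_block, dtype=float, chunks=((2, 2), (3,)))
arr.compute()  # rows 0-1 hold 0.0, rows 2-3 hold 2.0
```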
@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0"))

From 2725ef2a1462abe7cb7096bcb4c50486d916618a Mon Sep 17 00:00:00 2001
From: Phil Schaf
Date: Mon, 22 Jul 2024 16:29:58 +0200
Subject: [PATCH 173/348] move into type checking

---
 src/anndata/_io/specs/lazy_methods.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 33c7aba6b..85e6fc25b 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -3,7 +3,7 @@
 from contextlib import contextmanager
 from functools import wraps
 from pathlib import Path
-from typing import TYPE_CHECKING, Literal, ParamSpec, TypeVar, Union
+from typing import TYPE_CHECKING

 import h5py
 import numpy as np
@@ -17,12 +17,20 @@

 if TYPE_CHECKING:
     from collections.abc import Callable
-    from typing import Concatenate
+    from typing import Concatenate, Literal, ParamSpec, TypeVar

     from anndata.compat import DaskArray

     from .registry import DaskReader

+    BlockInfo = dict[
+        Literal[None],
+        dict[str, tuple[int, ...] | list[tuple[int, ...]]],
+    ]
+
+    P = ParamSpec("P")
+    R = TypeVar("R")
+

 @contextmanager
 def maybe_open_h5(path_or_group: Path | ZarrGroup, elem_name: str):
@@ -49,14 +57,6 @@ def compute_chunk_layout_for_axis_shape(
     return chunk


-P = ParamSpec("P")
-R = TypeVar("R")
-BlockInfo = dict[
-    Literal[None],
-    dict[str, Union[tuple[int, ...], list[tuple[int, ...]]]],
-]
-
-
 def require_block_info(
     f: Callable[Concatenate[BlockInfo, P], R],
 ) -> Callable[Concatenate[BlockInfo | None, P], R]:

From ffe89f0b5518a51dd9506e11a2308bff5ec940c7 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 22 Jul 2024 16:38:10 +0200
Subject: [PATCH 174/348] (fix): small type fixes

---
 src/anndata/_io/specs/lazy_methods.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 85e6fc25b..f51c0f684 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -80,7 +80,7 @@ def read_sparse_as_dask(
     elem: H5Group | ZarrGroup,
     *,
     _reader: DaskReader,
-    chunks: tuple[int, ...] | None = None,
+    chunks: tuple[int, int] | None = None,
 ) -> DaskArray:
     import dask.array as da

@@ -91,15 +91,16 @@ def read_sparse_as_dask(
     is_csc: bool = elem.attrs["encoding-type"] == "csc_matrix"

     stride: int = _DEFAULT_STRIDE
+    major_dim, minor_dim = (1, 0) if is_csc else (0, 1)
     if chunks is not None:
         if len(chunks) != 2:
             raise ValueError("`chunks` must be a tuple of two integers")
-        if chunks[int(not is_csc)] != shape[int(not is_csc)]:
+        if chunks[minor_dim] != shape[minor_dim]:
             raise ValueError(
                 "Only the major axis can be chunked. 
" f"Try setting chunks to {((-1, _DEFAULT_STRIDE) if is_csc else (_DEFAULT_STRIDE, -1))}" ) - stride = chunks[int(is_csc)] + stride = chunks[major_dim] @require_block_info def make_dask_chunk(block_info: BlockInfo): @@ -107,10 +108,12 @@ def make_dask_chunk(block_info: BlockInfo): # https://github.com/scverse/anndata/issues/1105 with maybe_open_h5(path_or_group, elem_name) as f: mtx = ad.experimental.sparse_dataset(f) - array_location = block_info[None]["array-location"] + (xxx_start, xxx_end), (yyy_start, yyy_end) = block_info[None][ + "array-location" + ] index = ( - slice(array_location[0][0], array_location[0][1]), - slice(array_location[1][0], array_location[1][1]), + slice(xxx_start, xxx_end), + slice(yyy_start, yyy_end), ) chunk = mtx[index] return chunk @@ -150,8 +153,8 @@ def make_dask_chunk(block_info: BlockInfo): with maybe_open_h5(path, elem_name) as f: idx = () for i in range(len(shape)): - array_location = block_info[None]["array-location"][i] - idx += (slice(array_location[0], array_location[1]),) + (start, stop) = block_info[None]["array-location"][i] + idx += (slice(start, stop),) return f[idx] chunk_layout = tuple( From 75a64fc3bd573d5a4840b83ec0041c171326dfc3 Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 22 Jul 2024 17:03:27 +0200 Subject: [PATCH 175/348] block info types --- src/anndata/_io/specs/lazy_methods.py | 38 +++++++++++++-------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index f51c0f684..f7630bd63 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -16,18 +16,24 @@ from .registry import _LAZY_REGISTRY, IOSpec if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Mapping, Sequence from typing import Concatenate, Literal, ParamSpec, TypeVar from anndata.compat import DaskArray from .registry import DaskReader - BlockInfo = dict[ + BlockInfo2D = Mapping[ Literal[None], - dict[str, tuple[int, ...] | list[tuple[int, ...]]], + dict[str, tuple[tuple[int, int], tuple[int, int]]], ] + BlockInfoND = Mapping[ + Literal[None], + dict[str, Sequence[tuple[int, int]]], + ] + + B = TypeVar("B", BlockInfo2D, BlockInfoND) P = ParamSpec("P") R = TypeVar("R") @@ -58,12 +64,10 @@ def compute_chunk_layout_for_axis_shape( def require_block_info( - f: Callable[Concatenate[BlockInfo, P], R], -) -> Callable[Concatenate[BlockInfo | None, P], R]: + f: Callable[Concatenate[B, P], R], +) -> Callable[Concatenate[B | None, P], R]: @wraps(f) - def wrapper( - block_info: BlockInfo | None = None, *args: P.args, **kwargs: P.kwargs - ) -> R: + def wrapper(block_info: B | None = None, *args: P.args, **kwargs: P.kwargs) -> R: if block_info is None: msg = "Block info is required" raise ValueError(msg) @@ -80,7 +84,7 @@ def read_sparse_as_dask( elem: H5Group | ZarrGroup, *, _reader: DaskReader, - chunks: tuple[int, int] | None = None, + chunks: tuple[int, ...] 
| None = None, # only tuple[int, int] is supported here ) -> DaskArray: import dask.array as da @@ -103,19 +107,13 @@ def read_sparse_as_dask( stride = chunks[major_dim] @require_block_info - def make_dask_chunk(block_info: BlockInfo): + def make_dask_chunk(block_info: BlockInfo2D): # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` # https://github.com/scverse/anndata/issues/1105 with maybe_open_h5(path_or_group, elem_name) as f: mtx = ad.experimental.sparse_dataset(f) - (xxx_start, xxx_end), (yyy_start, yyy_end) = block_info[None][ - "array-location" - ] - index = ( - slice(xxx_start, xxx_end), - slice(yyy_start, yyy_end), - ) - chunk = mtx[index] + range_i, range_j = block_info[None]["array-location"] + chunk = mtx[slice(*range_i), slice(*range_j)] return chunk shape_minor, shape_major = shape if is_csc else shape[::-1] @@ -141,7 +139,7 @@ def read_h5_array( import dask.array as da path = Path(elem.file.filename) - elem_name = elem.name + elem_name: str = elem.name shape = tuple(elem.shape) dtype = elem.dtype chunks: tuple[int, ...] = ( @@ -149,7 +147,7 @@ def read_h5_array( ) @require_block_info - def make_dask_chunk(block_info: BlockInfo): + def make_dask_chunk(block_info: BlockInfoND): with maybe_open_h5(path, elem_name) as f: idx = () for i in range(len(shape)): From 3f734fe24ad30cef67c7401283c224529f94de2f Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 22 Jul 2024 17:13:27 +0200 Subject: [PATCH 176/348] simplify --- src/anndata/_io/specs/lazy_methods.py | 36 +++++++++++++-------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index f7630bd63..4084fcf41 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -23,17 +23,11 @@ from .registry import DaskReader - BlockInfo2D = Mapping[ - Literal[None], - dict[str, tuple[tuple[int, int], tuple[int, int]]], - ] - - BlockInfoND = Mapping[ + BlockInfo = Mapping[ Literal[None], dict[str, Sequence[tuple[int, int]]], ] - B = TypeVar("B", BlockInfo2D, BlockInfoND) P = ParamSpec("P") R = TypeVar("R") @@ -64,10 +58,12 @@ def compute_chunk_layout_for_axis_shape( def require_block_info( - f: Callable[Concatenate[B, P], R], -) -> Callable[Concatenate[B | None, P], R]: + f: Callable[Concatenate[BlockInfo, P], R], +) -> Callable[Concatenate[BlockInfo | None, P], R]: @wraps(f) - def wrapper(block_info: B | None = None, *args: P.args, **kwargs: P.kwargs) -> R: + def wrapper( + block_info: BlockInfo | None = None, *args: P.args, **kwargs: P.kwargs + ) -> R: if block_info is None: msg = "Block info is required" raise ValueError(msg) @@ -76,6 +72,12 @@ def wrapper(block_info: B | None = None, *args: P.args, **kwargs: P.kwargs) -> R return wrapper +def get_chunks_indexer(block_info: BlockInfo) -> tuple[slice, ...]: + return tuple( + slice(start, stop) for start, stop in block_info[None]["array-location"] + ) + + @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @@ -107,13 +109,13 @@ def read_sparse_as_dask( stride = chunks[major_dim] @require_block_info - def make_dask_chunk(block_info: BlockInfo2D): + def make_dask_chunk(block_info: BlockInfo): # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` # 
https://github.com/scverse/anndata/issues/1105 with maybe_open_h5(path_or_group, elem_name) as f: mtx = ad.experimental.sparse_dataset(f) - range_i, range_j = block_info[None]["array-location"] - chunk = mtx[slice(*range_i), slice(*range_j)] + xs, ys = get_chunks_indexer(block_info) + chunk = mtx[xs, ys] return chunk shape_minor, shape_major = shape if is_csc else shape[::-1] @@ -147,13 +149,9 @@ def read_h5_array( ) @require_block_info - def make_dask_chunk(block_info: BlockInfoND): + def make_dask_chunk(block_info: BlockInfo): with maybe_open_h5(path, elem_name) as f: - idx = () - for i in range(len(shape)): - (start, stop) = block_info[None]["array-location"][i] - idx += (slice(start, stop),) - return f[idx] + return f[get_chunks_indexer(block_info)] chunk_layout = tuple( compute_chunk_layout_for_axis_shape(chunks[i], shape[i]) From c4c2356171e27ee60b4b6ac75ee25965303a4f79 Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 22 Jul 2024 17:15:11 +0200 Subject: [PATCH 177/348] rename --- src/anndata/_io/specs/lazy_methods.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 4084fcf41..e0153e17b 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -72,7 +72,7 @@ def wrapper( return wrapper -def get_chunks_indexer(block_info: BlockInfo) -> tuple[slice, ...]: +def get_array_ranges(block_info: BlockInfo) -> tuple[slice, ...]: return tuple( slice(start, stop) for start, stop in block_info[None]["array-location"] ) @@ -114,7 +114,7 @@ def make_dask_chunk(block_info: BlockInfo): # https://github.com/scverse/anndata/issues/1105 with maybe_open_h5(path_or_group, elem_name) as f: mtx = ad.experimental.sparse_dataset(f) - xs, ys = get_chunks_indexer(block_info) + xs, ys = get_array_ranges(block_info) chunk = mtx[xs, ys] return chunk @@ -151,7 +151,7 @@ def read_h5_array( @require_block_info def make_dask_chunk(block_info: BlockInfo): with maybe_open_h5(path, elem_name) as f: - return f[get_chunks_indexer(block_info)] + return f[get_array_ranges(block_info)] chunk_layout = tuple( compute_chunk_layout_for_axis_shape(chunks[i], shape[i]) From cc67a9b54a6690847bae3701fd6daf64a7678ab2 Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 22 Jul 2024 17:35:34 +0200 Subject: [PATCH 178/348] simplify more --- src/anndata/_io/specs/lazy_methods.py | 81 ++++++++++++--------------- 1 file changed, 37 insertions(+), 44 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index e0153e17b..8a1b31e6b 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -1,7 +1,7 @@ from __future__ import annotations from contextlib import contextmanager -from functools import wraps +from functools import partial from pathlib import Path from typing import TYPE_CHECKING @@ -10,17 +10,18 @@ from scipy import sparse import anndata as ad -from anndata._core.file_backing import filename, get_elem_name -from anndata.compat import H5Array, H5Group, ZarrArray, ZarrGroup +from ..._core.file_backing import filename, get_elem_name +from ...compat import H5Array, H5Group, ZarrArray, ZarrGroup from .registry import _LAZY_REGISTRY, IOSpec if TYPE_CHECKING: - from collections.abc import Callable, Mapping, Sequence - from typing import Concatenate, Literal, ParamSpec, TypeVar - - from anndata.compat import DaskArray + from collections.abc import Callable, Generator, Mapping, Sequence + from typing import 
Literal, ParamSpec, TypeVar + from ..._core.sparse_dataset import CSCDataset, CSRDataset + from ..._types import ArrayStorageType, StorageType + from ...compat import DaskArray from .registry import DaskReader BlockInfo = Mapping[ @@ -33,7 +34,9 @@ @contextmanager -def maybe_open_h5(path_or_group: Path | ZarrGroup, elem_name: str): +def maybe_open_h5( + path_or_group: Path | ZarrGroup, elem_name: str +) -> Generator[StorageType, None, None]: if not isinstance(path_or_group, Path): yield path_or_group return @@ -57,25 +60,26 @@ def compute_chunk_layout_for_axis_shape( return chunk -def require_block_info( - f: Callable[Concatenate[BlockInfo, P], R], -) -> Callable[Concatenate[BlockInfo | None, P], R]: - @wraps(f) - def wrapper( - block_info: BlockInfo | None = None, *args: P.args, **kwargs: P.kwargs - ) -> R: - if block_info is None: - msg = "Block info is required" - raise ValueError(msg) - return f(block_info, *args, **kwargs) - - return wrapper - - -def get_array_ranges(block_info: BlockInfo) -> tuple[slice, ...]: - return tuple( - slice(start, stop) for start, stop in block_info[None]["array-location"] - ) +def make_dask_chunk( + path_or_group: Path | ZarrGroup, + elem_name: str, + block_info: BlockInfo | None = None, + *, + wrap: Callable[[ArrayStorageType], ArrayStorageType] + | Callable[[H5Group | ZarrGroup], CSRDataset | CSCDataset] = lambda g: g, +): + if block_info is None: + msg = "Block info is required" + raise ValueError(msg) + # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` + # https://github.com/scverse/anndata/issues/1105 + with maybe_open_h5(path_or_group, elem_name) as f: + mtx = wrap(f) + idx = tuple( + slice(start, stop) for start, stop in block_info[None]["array-location"] + ) + chunk = mtx[idx] + return chunk @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) @@ -108,16 +112,6 @@ def read_sparse_as_dask( ) stride = chunks[major_dim] - @require_block_info - def make_dask_chunk(block_info: BlockInfo): - # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` - # https://github.com/scverse/anndata/issues/1105 - with maybe_open_h5(path_or_group, elem_name) as f: - mtx = ad.experimental.sparse_dataset(f) - xs, ys = get_array_ranges(block_info) - chunk = mtx[xs, ys] - return chunk - shape_minor, shape_major = shape if is_csc else shape[::-1] chunks_major = compute_chunk_layout_for_axis_shape(stride, shape_major) chunks_minor = (shape_minor,) @@ -125,8 +119,11 @@ def make_dask_chunk(block_info: BlockInfo): (chunks_minor, chunks_major) if is_csc else (chunks_major, chunks_minor) ) memory_format = sparse.csc_matrix if is_csc else sparse.csr_matrix + make_chunk = partial( + make_dask_chunk, path_or_group, elem_name, wrap=ad.experimental.sparse_dataset + ) da_mtx = da.map_blocks( - make_dask_chunk, + make_chunk, dtype=dtype, chunks=chunk_layout, meta=memory_format((0, 0), dtype=dtype), @@ -148,17 +145,13 @@ def read_h5_array( chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(shape) ) - @require_block_info - def make_dask_chunk(block_info: BlockInfo): - with maybe_open_h5(path, elem_name) as f: - return f[get_array_ranges(block_info)] - chunk_layout = tuple( compute_chunk_layout_for_axis_shape(chunks[i], shape[i]) for i in range(len(shape)) ) - return da.map_blocks(make_dask_chunk, dtype=dtype, chunks=chunk_layout) + make_chunk = partial(make_dask_chunk, path, elem_name) + return da.map_blocks(make_chunk, dtype=dtype, chunks=chunk_layout) 
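The shape of the refactor above: the two closure-based chunk makers collapse into one module-level `make_dask_chunk`, specialized per call site with `functools.partial`. A reduced sketch of the pattern (names here are hypothetical stand-ins):

```python
from functools import partial

def open_sparse(group):
    # stand-in for ad.experimental.sparse_dataset
    return group

def make_chunk(path, elem_name, block_info=None, *, wrap=lambda g: g):
    # stand-in for the shared loader: open `path`, wrap `elem_name`, slice it
    return (path, elem_name, block_info)

# everything except ``block_info`` is bound up front, so dask maps a
# one-argument callable; a partial over a module-level function also stays
# picklable, which helps when tasks are shipped to dask.distributed workers
dense_chunk = partial(make_chunk, "data.h5", "X")
sparse_chunk = partial(make_chunk, "data.zarr", "X", wrap=open_sparse)
```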
@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) From fcb1763c2ef5da705e0c3d628abd3eae78eadef7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 23 Jul 2024 10:27:10 +0200 Subject: [PATCH 179/348] (fix): migrate to use `read_elem` infrastructure --- docs/api.md | 2 +- docs/release-notes/0.11.0.md | 2 +- src/anndata/_core/views.py | 6 ++ src/anndata/_io/specs/__init__.py | 4 +- src/anndata/_io/specs/lazy_methods.py | 115 ++++++++++++++++++++++--- src/anndata/_io/specs/registry.py | 21 ++--- src/anndata/_types.py | 17 ++-- src/anndata/experimental/__init__.py | 4 +- src/anndata/experimental/backed/_io.py | 79 ++++------------- tests/test_concatenate.py | 3 + tests/test_io_elementwise.py | 16 ++-- 11 files changed, 163 insertions(+), 106 deletions(-) diff --git a/docs/api.md b/docs/api.md index 92139fe06..40dbbd92a 100644 --- a/docs/api.md +++ b/docs/api.md @@ -121,7 +121,7 @@ Low level methods for reading and writing elements of an `AnnData` object to a s experimental.read_elem experimental.write_elem - experimental.read_elem_as_dask + experimental.read_elem_lazy ``` Utilities for customizing the IO process: diff --git a/docs/release-notes/0.11.0.md b/docs/release-notes/0.11.0.md index 618d4f549..2ee386dd7 100644 --- a/docs/release-notes/0.11.0.md +++ b/docs/release-notes/0.11.0.md @@ -8,7 +8,7 @@ * Add `should_remove_unused_categories` option to `anndata.settings` to override current behavior. Default is `True` (i.e., previous behavior). Please refer to the [documentation](https://anndata.readthedocs.io/en/latest/generated/anndata.settings.html) for usage. {pr}`1340` {user}`ilan-gold` * `scipy.sparse.csr_array` and `scipy.sparse.csc_array` are now supported when constructing `AnnData` objects {pr}`1028` {user}`ilan-gold` {user}`isaac-virshup` * Add `should_check_uniqueness` option to `anndata.settings` to override current behavior. Default is `True` (i.e., previous behavior). Please refer to the [documentation](https://anndata.readthedocs.io/en/latest/generated/anndata.settings.html) for usage. 
{pr}`1507` {user}`ilan-gold` -* Add :func:`~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {pr}`1469` {user}`ilan-gold` +* Add :func:`~anndata.experimental.read_elem_lazy` function to handle i/o with sparse and dense arrays {pr}`1469` {user}`ilan-gold` * Add functionality to write from GPU {class}`dask.array.Array` to disk {pr}`1550` {user}`ilan-gold` #### Bugfix diff --git a/src/anndata/_core/views.py b/src/anndata/_core/views.py index 271353689..ada322300 100644 --- a/src/anndata/_core/views.py +++ b/src/anndata/_core/views.py @@ -13,6 +13,7 @@ from anndata._warnings import ImplicitModificationWarning +from .._settings import settings from ..compat import ( AwkArray, CupyArray, @@ -292,6 +293,11 @@ def as_view_dask_array(array, view_args): @as_view.register(pd.DataFrame) def as_view_df(df, view_args): + if settings.should_remove_unused_categories: + for col in df.columns: + if isinstance(df[col].dtype, pd.CategoricalDtype): + with pd.option_context("mode.chained_assignment", None): + df[col] = df[col].cat.remove_unused_categories() return DataFrameView(df, view_args=view_args) diff --git a/src/anndata/_io/specs/__init__.py b/src/anndata/_io/specs/__init__.py index 5eadfdb50..8fd9898a3 100644 --- a/src/anndata/_io/specs/__init__.py +++ b/src/anndata/_io/specs/__init__.py @@ -9,7 +9,7 @@ Writer, get_spec, read_elem, - read_elem_as_dask, + read_elem_lazy, write_elem, ) @@ -19,7 +19,7 @@ "write_elem", "get_spec", "read_elem", - "read_elem_as_dask", + "read_elem_lazy", "Reader", "Writer", "IOSpec", diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 19efd61e9..cd1af34a0 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -1,7 +1,7 @@ from __future__ import annotations from contextlib import contextmanager -from functools import wraps +from functools import partial, wraps from pathlib import Path from typing import TYPE_CHECKING @@ -11,17 +11,18 @@ import anndata as ad from anndata._core.file_backing import filename, get_elem_name -from anndata.compat import H5Array, H5Group, ZarrArray, ZarrGroup +from anndata.compat import DaskArray, H5Array, H5Group, ZarrArray, ZarrGroup +from ..._settings import settings from .registry import _LAZY_REGISTRY, IOSpec if TYPE_CHECKING: from collections.abc import Callable from typing import Concatenate, Literal, ParamSpec, TypeVar - from anndata.compat import DaskArray + from anndata.experimental.backed._xarray import Dataset2D - from .registry import DaskReader + from .registry import LazyReader BlockInfo = dict[ Literal[None], @@ -79,7 +80,7 @@ def wrapper( def read_sparse_as_dask( elem: H5Group | ZarrGroup, *, - _reader: DaskReader, + _reader: LazyReader, chunks: tuple[int, int] | None = None, ) -> DaskArray: import dask.array as da @@ -137,8 +138,9 @@ def make_dask_chunk(block_info: BlockInfo): @_LAZY_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) def read_h5_string_array( elem: H5Array, - _reader: Reader, - dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), + *, + _reader: LazyReader, + chunks: tuple[int, int] | None = None, ): import dask.array as da @@ -146,13 +148,13 @@ def read_h5_string_array( return da.from_array( read_dataset(elem), - chunks=dataset_kwargs.get("chunks", (_DEFAULT_STRIDE,) * len(elem.shape)), + chunks=chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(elem.shape), ) @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( - elem: H5Array, 
*, _reader: DaskReader, chunks: tuple[int, ...] | None = None + elem: H5Array, *, _reader: LazyReader, chunks: tuple[int, ...] | None = None ) -> DaskArray: import dask.array as da @@ -184,9 +186,102 @@ def make_dask_chunk(block_info: BlockInfo): @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( - elem: ZarrArray, *, _reader: DaskReader, chunks: tuple[int, ...] | None = None + elem: ZarrArray, *, _reader: LazyReader, chunks: tuple[int, ...] | None = None ) -> DaskArray: chunks: tuple[int, ...] = chunks if chunks is not None else elem.chunks import dask.array as da return da.from_zarr(elem, chunks=chunks) + + +@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) +@_LAZY_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) +def read_dataframe( + elem: H5Group | ZarrGroup, + *, + _reader: LazyReader, + chunks: tuple[int, ...] | None = None, +) -> Dataset2D: + from anndata.experimental.backed._compat import xr + from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray + from anndata.experimental.backed._xarray import Dataset2D + + iter_object = [(k, elem[k]) for k in elem.attrs["column-order"]] + [ + (elem.attrs["_index"], elem[elem.attrs["_index"]]) + ] + d = {k: _reader.read_elem(v) for k, v in iter_object} + d_with_xr = {} + elem_name = get_elem_name(elem) + index_label = f'{elem_name.replace("/", "")}_names' + index = d[elem.attrs["_index"]] # no sense in reading this in multiple times + for k in d: + v = d[k] + if type(v) == DaskArray and k != elem.attrs["_index"]: + d_with_xr[k] = xr.DataArray(v, coords=[index], dims=[index_label], name=k) + elif ( + type(v) == CategoricalArray or type(v) == MaskedArray + ) and k != elem.attrs["_index"]: + variable = xr.Variable( + data=xr.core.indexing.LazilyIndexedArray(v), dims=[index_label] + ) + d_with_xr[k] = xr.DataArray( + variable, + coords=[index], + dims=[index_label], + name=k, + ) + elif k == elem.attrs["_index"]: + d_with_xr[index_label] = xr.DataArray( + v, coords=[v], dims=[index_label], name=index_label + ) + else: + d_with_xr[k] = v + return Dataset2D(d_with_xr) + + +@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) +@_LAZY_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) +def read_categorical( + elem: H5Group | ZarrGroup, + *, + _reader: LazyReader, + chunks: tuple[int, ...] | None = None, +): + from anndata.experimental.backed._lazy_arrays import CategoricalArray + + return CategoricalArray( + codes=elem["codes"], + categories=elem["categories"], + ordered=elem.attrs["ordered"], + drop_unused_cats=settings.should_remove_unused_categories, + ) + + +def read_nullable( + elem: H5Group | ZarrGroup, + *, + encoding_type: str, + _reader: LazyReader, + chunks: tuple[int, ...] 
| None = None, +): + from anndata.experimental.backed._lazy_arrays import MaskedArray + + return MaskedArray( + values=elem["values"], + mask=elem["mask"] if "mask" in elem else None, + dtype_str=encoding_type, + ) + + +_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0"))( + partial(read_nullable, encoding_type="nullable-integer") +) +_LAZY_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0"))( + partial(read_nullable, encoding_type="nullable-integer") +) +_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0"))( + partial(read_nullable, encoding_type="nullable-boolean") +) +_LAZY_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0"))( + partial(read_nullable, encoding_type="nullable-boolean") +) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index e3003cc52..82bcf49c5 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Generic, TypeVar from anndata._io.utils import report_read_key_on_error, report_write_key_on_error -from anndata._types import Read, ReadDask, _ReadDaskInternal, _ReadInternal +from anndata._types import Read, ReadLazy, _ReadInternal, _ReadLazyInternal from anndata.compat import DaskArray, _read_attr if TYPE_CHECKING: @@ -24,6 +24,7 @@ WriteCallback, _WriteInternal, ) + from anndata.experimental.backed._xarray import Dataset2D T = TypeVar("T") W = TypeVar("W", bound=_WriteInternal) @@ -78,8 +79,8 @@ def wrapper(g: GroupStorageType, k: str, *args, **kwargs): return decorator -_R = TypeVar("_R", _ReadInternal, _ReadDaskInternal) -R = TypeVar("R", Read, ReadDask) +_R = TypeVar("_R", _ReadInternal, _ReadLazyInternal) +R = TypeVar("R", Read, ReadLazy) class IORegistry(Generic[_R, R]): @@ -213,7 +214,7 @@ def get_spec(self, elem: Any) -> IOSpec: _REGISTRY: IORegistry[_ReadInternal, Read] = IORegistry() -_LAZY_REGISTRY: IORegistry[_ReadDaskInternal, ReadDask] = IORegistry() +_LAZY_REGISTRY: IORegistry[_ReadLazyInternal, ReadLazy] = IORegistry() @singledispatch @@ -282,18 +283,18 @@ def read_elem( return self.callback(read_func, elem.name, elem, iospec=iospec) -class DaskReader(Reader): +class LazyReader(Reader): @report_read_key_on_error def read_elem( self, elem: StorageType, modifiers: frozenset[str] = frozenset(), chunks: tuple[int, ...] | None = None, - ) -> DaskArray: + ) -> DaskArray | Dataset2D: """Read a dask element from a store. See exported function for more details.""" iospec = get_spec(elem) - read_func: ReadDask = self.registry.get_read( + read_func: ReadLazy = self.registry.get_read( type(elem), iospec, modifiers, reader=self ) if self.callback is not None: @@ -378,9 +379,9 @@ def read_elem(elem: StorageType) -> InMemoryElem: return Reader(_REGISTRY).read_elem(elem) -def read_elem_as_dask( +def read_elem_lazy( elem: StorageType, chunks: tuple[int, ...] | None = None -) -> DaskArray: +) -> DaskArray | Dataset2D: """ Read an element from a store lazily. 
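(For orientation — an illustrative sketch, not part of this patch, of how the renamed entry point is intended to be called. It assumes a zarr store previously written with `AnnData.write_zarr`, containing an `X` element and an `obs` dataframe:

    import zarr

    from anndata.experimental import read_elem_lazy

    g = zarr.open("adata.zarr", mode="r")
    X = read_elem_lazy(g["X"])  # dense/sparse arrays come back as dask arrays
    obs = read_elem_lazy(g["obs"])  # dataframes now come back as a Dataset2D
    head = X[:100].compute()  # chunks are only fetched from the store here

The optional `chunks` argument threaded through the readers controls the dask chunking of array elements.)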
@@ -400,7 +401,7 @@ def read_elem_as_dask( ------- DaskArray """ - return DaskReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks) + return LazyReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks) def write_elem( diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 3549152f5..2fdb8c03e 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -31,7 +31,8 @@ from collections.abc import Mapping from typing import Any, TypeAlias - from anndata._io.specs.registry import DaskReader + from anndata._io.specs.registry import LazyReader + from anndata.experimental.backed._xarray import Dataset2D from ._io.specs.registry import IOSpec, Reader, Writer @@ -89,10 +90,10 @@ class _ReadInternal(Protocol[SCon, CovariantInMemoryType]): def __call__(self, elem: SCon, *, _reader: Reader) -> CovariantInMemoryType: ... -class _ReadDaskInternal(Protocol[SCon]): +class _ReadLazyInternal(Protocol[SCon]): def __call__( - self, elem: SCon, *, _reader: DaskReader, chunks: tuple[int, ...] | None = None - ) -> DaskArray: ... + self, elem: SCon, *, _reader: LazyReader, chunks: tuple[int, ...] | None = None + ) -> DaskArray | Dataset2D: ... class Read(Protocol[SCon, CovariantInMemoryType]): @@ -110,11 +111,11 @@ def __call__(self, elem: SCon) -> CovariantInMemoryType: ... -class ReadDask(Protocol[SCon]): +class ReadLazy(Protocol[SCon]): def __call__( self, elem: SCon, *, chunks: tuple[int, ...] | None = None - ) -> DaskArray: - """Low-level reading function for a dask element. + ) -> DaskArray | Dataset2D: + """Low-level reading function for a lazy element. Parameters ---------- @@ -124,7 +125,7 @@ def __call__( The chunk size to be used. Returns ------- - The dask element read from the store. + The lazy element read from the store. """ ... diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 535d2ca2d..b75440a95 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -1,7 +1,7 @@ from __future__ import annotations from anndata._core.sparse_dataset import CSCDataset, CSRDataset, sparse_dataset -from anndata._io.specs import IOSpec, read_elem, read_elem_as_dask, write_elem +from anndata._io.specs import IOSpec, read_elem, read_elem_lazy, write_elem from .._core.storage import StorageType from .._types import InMemoryElem as _InMemoryElem @@ -24,7 +24,7 @@ "AnnLoader", "read_elem", "write_elem", - "read_elem_as_dask", + "read_elem_lazy", "read_dispatched", "write_dispatched", "IOSpec", diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index 582f64eb7..bf654d89a 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -8,14 +8,10 @@ import h5py -from anndata._io.specs.registry import read_elem_as_dask +from anndata._io.specs.registry import read_elem_lazy from ..._core.anndata import AnnData -from ...compat import DaskArray from .. 
import read_dispatched -from ._compat import xr -from ._lazy_arrays import CategoricalArray, MaskedArray -from ._xarray import Dataset2D if TYPE_CHECKING: from collections.abc import MutableMapping @@ -59,7 +55,7 @@ def read_backed( else: f = h5py.File(store, mode="r") - def callback(func, elem_name: str, elem, iospec, dataset_kwargs): + def callback(func, elem_name: str, elem, iospec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): cols = [ "obs", @@ -79,64 +75,19 @@ def callback(func, elem_name: str, elem, iospec, dataset_kwargs): return AnnData(**{k: read_dispatched(v, callback) for k, v in iter_object}) elif elem_name.startswith("/raw"): return None - elif iospec.encoding_type in {"dataframe"}: - iter_object = [(k, elem[k]) for k in elem.attrs["column-order"]] + [ - (elem.attrs["_index"], elem[elem.attrs["_index"]]) - ] - d = {k: read_dispatched(v, callback) for k, v in iter_object} - d_with_xr = {} - index_label = f'{elem_name.replace("/", "")}_names' - index = d[ - elem.attrs["_index"] - ] # no sense in reading this in multiple times - for k in d: - v = d[k] - if type(v) == DaskArray and k != elem.attrs["_index"]: - d_with_xr[k] = xr.DataArray( - v, coords=[index], dims=[index_label], name=k - ) - elif ( - type(v) == CategoricalArray or type(v) == MaskedArray - ) and k != elem.attrs["_index"]: - variable = xr.Variable( - data=xr.core.indexing.LazilyIndexedArray(v), dims=[index_label] - ) - d_with_xr[k] = xr.DataArray( - variable, - coords=[index], - dims=[index_label], - name=k, - ) - elif k == elem.attrs["_index"]: - d_with_xr[index_label] = xr.DataArray( - v, coords=[v], dims=[index_label], name=index_label - ) - else: - d_with_xr[k] = v - return Dataset2D(d_with_xr) - elif iospec.encoding_type == "categorical": - drop_unused_cats = not ( - elem_name.startswith("/obsm") or elem_name.startswith("/varm") - ) - return CategoricalArray( - codes=elem["codes"], - categories=elem["categories"], - ordered=elem.attrs["ordered"], - drop_unused_cats=drop_unused_cats, - ) - elif "nullable" in iospec.encoding_type: - return MaskedArray( - values=elem["values"], - mask=elem["mask"] if "mask" in elem else None, - dtype_str=iospec.encoding_type, - ) - elif iospec.encoding_type in { - "csr_matrix", - "csc_matrix", - "array", - "string-array", - }: - return read_elem_as_dask(elem) + elif ( + iospec.encoding_type + in { + "csr_matrix", + "csc_matrix", + "array", + "string-array", + "dataframe", + "categorical", + } + or "nullable" in iospec.encoding_type + ): + return read_elem_lazy(elem) elif iospec.encoding_type in {"awkward-array"}: return read_dispatched(elem, None) return func(elem) diff --git a/tests/test_concatenate.py b/tests/test_concatenate.py index 0e1cd7382..ad9b2bb71 100644 --- a/tests/test_concatenate.py +++ b/tests/test_concatenate.py @@ -152,6 +152,9 @@ def fix_known_differences(orig, result, backwards_compat=True): for k, dtype in orig.obs.dtypes.items(): if isinstance(dtype, pd.CategoricalDtype) and dtype.ordered: result.obs[k] = result.obs[k].astype(dtype) + for k, dtype in orig.obsm["df"].dtypes.items(): + if isinstance(dtype, pd.CategoricalDtype) and dtype.ordered: + result.obsm["df"][k] = result.obsm["df"][k].astype(dtype) return orig, result diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index 62284a0c9..094ab505a 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -21,7 +21,7 @@ IOSpec, get_spec, read_elem, - read_elem_as_dask, + read_elem_lazy, write_elem, ) from anndata._io.specs.registry import 
IORegistryError @@ -230,7 +230,7 @@ def test_dask_write_sparse(sparse_format, store): def test_read_lazy_2d_dask(sparse_format, store): arr_store = create_sparse_store(sparse_format, store) - X_dask_from_disk = read_elem_as_dask(arr_store["X"]) + X_dask_from_disk = read_elem_lazy(arr_store["X"]) X_from_disk = read_elem(arr_store["X"]) assert_equal(X_from_disk, X_dask_from_disk) @@ -267,7 +267,7 @@ def test_read_lazy_2d_dask(sparse_format, store): ) def test_read_lazy_subsets_nd_dask(store, n_dims, chunks): arr_store = create_dense_store(store, n_dims) - X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) + X_dask_from_disk = read_elem_lazy(arr_store["X"], chunks=chunks) X_from_disk = read_elem(arr_store["X"]) assert_equal(X_from_disk, X_dask_from_disk) @@ -285,7 +285,7 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path): with h5py.File(tmp_path / "test.h5", "w") as file: store = file["/"] arr_store = create_sparse_store(sparse_format, store) - X_dask_from_disk = read_elem_as_dask(arr_store["X"]) + X_dask_from_disk = read_elem_lazy(arr_store["X"]) X_from_disk = read_elem(arr_store["X"]) with ( dd.LocalCluster(n_workers=1, threads_per_worker=1) as cluster, @@ -307,10 +307,10 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path): def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks): if arr_type == "dense": arr_store = create_dense_store(store) - X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) + X_dask_from_disk = read_elem_lazy(arr_store["X"], chunks=chunks) else: arr_store = create_sparse_store(arr_type, store) - X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) + X_dask_from_disk = read_elem_lazy(arr_store["X"], chunks=chunks) if chunks is not None: assert X_dask_from_disk.chunksize == chunks else: @@ -329,9 +329,9 @@ def test_read_lazy_bad_chunk_kwargs(tmp_path): with pytest.raises( ValueError, match=r"`chunks` must be a tuple of two integers" ): - read_elem_as_dask(arr_store["X"], chunks=(SIZE,)) + read_elem_lazy(arr_store["X"], chunks=(SIZE,)) with pytest.raises(ValueError, match=r"Only the major axis can be chunked"): - read_elem_as_dask(arr_store["X"], chunks=(SIZE, 10)) + read_elem_lazy(arr_store["X"], chunks=(SIZE, 10)) @pytest.mark.parametrize("sparse_format", ["csr", "csc"]) From 4c659a17e843f5baefddbe844e7438ded92e29d4 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 23 Jul 2024 11:17:50 +0200 Subject: [PATCH 180/348] (fix): no first access of categories --- .../experimental/backed/_lazy_arrays.py | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index b9a0857d5..cc0170bb3 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -42,6 +42,34 @@ def __getitem__(self, key): ) +# Prevents first access from having to load the categories array +class CategoriesAccessor: + def __init__(self, categories: ZarrArray | H5Array, ordered: bool): + self._categories = categories + self._ordered = ordered + self._dtype = None + + def __get__(self, obj, objtype=None): + return self.dtype + + @property + def dtype(self): + if self._dtype is None: + self._dtype = pd.CategoricalDtype( + categories=self._categories, ordered=self._ordered + ) + return self._dtype + + def __getattr__(self, name): + return getattr(self.dtype, name) + + def __repr__(self): + return repr(self.dtype) + + def __str__(self) -> str: + return str(self.dtype) + + 
class CategoricalArray(BackendArray): def __init__( self, @@ -58,7 +86,7 @@ def __init__( self._categories_cache = None self._codes = ZarrOrHDF5Wrapper[type(codes)](codes) self.shape = self._codes.shape - self.dtype = pd.CategoricalDtype( + self.dtype = CategoriesAccessor( categories=self._categories, ordered=self._ordered ) From d3a811a1c7aae95dec5cf5bc8573068d632d47e0 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 23 Jul 2024 11:48:03 +0200 Subject: [PATCH 181/348] (fix): last small cleanups --- docs/api.md | 4 ++++ docs/conf.py | 1 + docs/release-notes/0.11.0.md | 4 ++-- src/anndata/_core/storage.py | 2 +- src/anndata/_io/specs/lazy_methods.py | 9 ++++----- src/anndata/_io/specs/registry.py | 5 +++-- src/anndata/experimental/backed/_io.py | 5 ++--- src/anndata/experimental/backed/_lazy_arrays.py | 12 +++++------- 8 files changed, 22 insertions(+), 20 deletions(-) diff --git a/docs/api.md b/docs/api.md index 40dbbd92a..d619ee2d9 100644 --- a/docs/api.md +++ b/docs/api.md @@ -122,6 +122,7 @@ Low level methods for reading and writing elements of an `AnnData` object to a s experimental.read_elem experimental.write_elem experimental.read_elem_lazy + experimental.read_backed ``` Utilities for customizing the IO process: @@ -148,6 +149,9 @@ Types used by the former: experimental.ReadCallback experimental.WriteCallback experimental.StorageType + experimental.backed._lazy_arrays.MaskedArray + experimental.backed._lazy_arrays.CategoricalArray + experimental.backed._xarray.Dataset2D ``` ## Errors and warnings diff --git a/docs/conf.py b/docs/conf.py index 5b1b95f30..d74cb81fe 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -104,6 +104,7 @@ ("py:class", "numpy.ma.core.MaskedArray"), ("py:class", "dask.array.core.Array"), ("py:class", "awkward.highlevel.Array"), + ("py:class", "awkward.Array"), ("py:class", "anndata._core.sparse_dataset.BaseCompressedSparseDataset"), ("py:obj", "numpy._typing._array_like._ScalarType_co"), ] diff --git a/docs/release-notes/0.11.0.md b/docs/release-notes/0.11.0.md index 2574df80f..93c55692a 100644 --- a/docs/release-notes/0.11.0.md +++ b/docs/release-notes/0.11.0.md @@ -9,8 +9,8 @@ * `scipy.sparse.csr_array` and `scipy.sparse.csc_array` are now supported when constructing `AnnData` objects {pr}`1028` {user}`ilan-gold` {user}`isaac-virshup` * Add `should_check_uniqueness` option to `anndata.settings` to override current behavior. Default is `True` (i.e., previous behavior). Please refer to the [documentation](https://anndata.readthedocs.io/en/latest/generated/anndata.settings.html) for usage. 
{pr}`1507` {user}`ilan-gold` * Add :func:`~anndata.experimental.read_elem_lazy` function to handle i/o with sparse and dense arrays {pr}`1469` {user}`ilan-gold` -* Add functionality to write from GPU :class:`dask.array.Array` to disk {pr}`1550` {user}`ilan-gold` -* Add functionality to :func:`~anndata.experimental.read_elem_as_dask` to handle backed dataframes as well as a :func:`~anndata.experimental.read_backed` to handle reading in as much of the :class:`~anndata.AnnData` as possible backed {pr}`1247` {user}`ilan-gold` +* Add functionality to write from GPU {class}`dask.array.Array` to disk {pr}`1550` {user}`ilan-gold` +* Add functionality to :func:`~anndata.experimental.read_elem_as_dask` to handle backed dataframes as well as a :func:`~anndata.experimental.read_backed` to handle reading in as much of the {class}`~anndata.AnnData` as possible backed {pr}`1247` {user}`ilan-gold` #### Bugfix diff --git a/src/anndata/_core/storage.py b/src/anndata/_core/storage.py index f7ef4f700..f280ecd33 100644 --- a/src/anndata/_core/storage.py +++ b/src/anndata/_core/storage.py @@ -88,7 +88,7 @@ def __repr__(): return value # If value is one of the allowed types, return it - if isinstance(value, StorageType.classes()) or isinstance(value, Dataset2D): # ???? + if isinstance(value, StorageType.classes()) or isinstance(value, Dataset2D): if isinstance(value, np.matrix): msg = f"{name} should not be a np.matrix, use np.ndarray instead." warnings.warn(msg, ImplicitModificationWarning) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 20160e39e..25310d592 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -13,13 +13,13 @@ from anndata._core.file_backing import filename, get_elem_name from anndata.compat import DaskArray, H5Array, H5Group, ZarrArray, ZarrGroup -from ..._settings import settings from .registry import _LAZY_REGISTRY, IOSpec if TYPE_CHECKING: from collections.abc import Callable, Generator, Mapping, Sequence from typing import Literal, ParamSpec, TypeVar + from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray from anndata.experimental.backed._xarray import Dataset2D from ..._core.sparse_dataset import CSCDataset, CSRDataset @@ -139,7 +139,7 @@ def read_h5_string_array( *, _reader: LazyReader, chunks: tuple[int, int] | None = None, -): +) -> DaskArray: import dask.array as da from anndata._io.h5ad import read_dataset @@ -236,14 +236,13 @@ def read_categorical( *, _reader: LazyReader, chunks: tuple[int, ...] | None = None, -): +) -> CategoricalArray: from anndata.experimental.backed._lazy_arrays import CategoricalArray return CategoricalArray( codes=elem["codes"], categories=elem["categories"], ordered=elem.attrs["ordered"], - drop_unused_cats=settings.should_remove_unused_categories, ) @@ -253,7 +252,7 @@ def read_nullable( encoding_type: str, _reader: LazyReader, chunks: tuple[int, ...] 
| None = None, -): +) -> MaskedArray: from anndata.experimental.backed._lazy_arrays import MaskedArray return MaskedArray( diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 82bcf49c5..72347dd7f 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -24,6 +24,7 @@ WriteCallback, _WriteInternal, ) + from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray from anndata.experimental.backed._xarray import Dataset2D T = TypeVar("T") @@ -290,7 +291,7 @@ def read_elem( elem: StorageType, modifiers: frozenset[str] = frozenset(), chunks: tuple[int, ...] | None = None, - ) -> DaskArray | Dataset2D: + ) -> DaskArray | Dataset2D | CategoricalArray | MaskedArray: """Read a dask element from a store. See exported function for more details.""" iospec = get_spec(elem) @@ -381,7 +382,7 @@ def read_elem(elem: StorageType) -> InMemoryElem: def read_elem_lazy( elem: StorageType, chunks: tuple[int, ...] | None = None -) -> DaskArray | Dataset2D: +) -> DaskArray | Dataset2D | CategoricalArray | MaskedArray: """ Read an element from a store lazily. diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index bf654d89a..988e615da 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -23,12 +23,11 @@ def read_backed( store: str | Path | MutableMapping | ZarrGroup | h5py.Dataset, ) -> AnnData: """Lazily read in on-disk/in-cloud AnnData stores, including `obs` and `var`. - No array data should need to be read into memory with the exceptio of Awkward Arrays and some older-encoding string arrays. + No array data should need to be read into memory with the exception of :class:`awkward.Array` and some older-encoding string arrays. Params ------ - store: A store-like object to be read in. If :doc:`zarr:index`, it is best - for it to be consolidated. + store: A store-like object to be read in. If :doc:`zarr:index`, it is best for it to be consolidated. 
Returns ------- diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index cc0170bb3..0b1504e8a 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -9,6 +9,7 @@ from anndata._core.views import as_view from anndata.compat import H5Array, ZarrArray +from ..._settings import settings from ._compat import BackendArray, DataArray, ZarrArrayWrapper, xr if TYPE_CHECKING: @@ -43,7 +44,7 @@ def __getitem__(self, key): # Prevents first access from having to load the categories array -class CategoriesAccessor: +class CategoricalDtypeAccessor: def __init__(self, categories: ZarrArray | H5Array, ordered: bool): self._categories = categories self._ordered = ordered @@ -60,7 +61,7 @@ def dtype(self): ) return self._dtype - def __getattr__(self, name): + def __getattr__(self, name: str): return getattr(self.dtype, name) def __repr__(self): @@ -76,17 +77,15 @@ def __init__( codes: ZarrArray | H5Array, categories: ZarrArray | H5Array, ordered: bool, - drop_unused_cats: bool = False, *args, **kwargs, ): self._categories = categories self._ordered = ordered - self._drop_unused_cats = drop_unused_cats self._categories_cache = None self._codes = ZarrOrHDF5Wrapper[type(codes)](codes) self.shape = self._codes.shape - self.dtype = CategoriesAccessor( + self.dtype = CategoricalDtypeAccessor( categories=self._categories, ordered=self._ordered ) @@ -110,7 +109,7 @@ def __getitem__( categorical_array = pd.Categorical.from_codes( codes=codes, categories=self.categories, ordered=self._ordered ) - if self._drop_unused_cats: + if settings.should_remove_unused_categories: return xr.core.extension_array.PandasExtensionArray( categorical_array.remove_unused_categories() ) @@ -131,7 +130,6 @@ def __init__( self.dtype = pd.api.types.pandas_dtype(self._values.dtype) def __getitem__(self, key) -> xr.core.extension_array.PandasExtensionArray: - # HACK! TODO(ilan-gold): open issue about hdf5 compat that doesn't allow initialization! 
values = self._values[key]
         if self._mask is not None:
             mask = self._mask[key]

From e852a748836e8654943f2bf81505ea924b0d82ed Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 23 Jul 2024 13:50:12 +0200
Subject: [PATCH 182/348] (fix): try not running `xarray` tests

---
 ci/scripts/min-deps.py                 | 4 +---
 tests/test_read_backed_experimental.py | 4 ++++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/ci/scripts/min-deps.py b/ci/scripts/min-deps.py
index 878a86566..b5ed77264 100755
--- a/ci/scripts/min-deps.py
+++ b/ci/scripts/min-deps.py
@@ -62,9 +62,7 @@ def extract_min_deps(
             for extra in req.extras:
                 extra_deps = pyproject["project"]["optional-dependencies"][extra]
                 dependencies += map(Requirement, extra_deps)
-        elif req.name == "xarray":
-            continue  # xarray requires too high a version of pandas and is experimental anyway
-        else:
+        elif req.name != "xarray":
             yield min_dep(req)

diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py
index 9b1a77069..a8adcdc0a 100644
--- a/tests/test_read_backed_experimental.py
+++ b/tests/test_read_backed_experimental.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from importlib.util import find_spec
 from pathlib import Path
 
 import numpy as np
@@ -36,6 +37,7 @@ def dskfmt(request):
     return request.param
 
 
+@pytest.mark.skipif(not find_spec("xarray"), reason="Xarray is not installed")
 def test_access_count_obs_var(tmp_path, mtx_format):
     base_pth = Path(tmp_path)
     orig_pth = base_pth / "orig.zarr"
@@ -90,6 +92,7 @@ def test_access_count_obs_var(tmp_path, mtx_format):
     )  # never accessed
 
 
+@pytest.mark.skipif(not find_spec("xarray"), reason="Xarray is not installed")
 def test_to_memory(tmp_path, mtx_format, dskfmt):
     adata = gen_adata((1000, 1000), mtx_format)
     base_pth = Path(tmp_path)
@@ -101,6 +104,7 @@ def test_to_memory(tmp_path, mtx_format, dskfmt):
     assert_equal(remote_to_memory, adata)
 
 
+@pytest.mark.skipif(not find_spec("xarray"), reason="Xarray is not installed")
 def test_view_to_memory(tmp_path, mtx_format, dskfmt):
     adata = gen_adata((1000, 1000), mtx_format)
     base_pth = Path(tmp_path)

From 8c92a41fe6c49dc143cab58bdfc22cad7d8cf42b Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 23 Jul 2024 14:12:38 +0200
Subject: [PATCH 183/348] (fix): oops! forgot one test to mark!

---
 tests/test_read_backed_experimental.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py
index a8adcdc0a..4cb900ad2 100644
--- a/tests/test_read_backed_experimental.py
+++ b/tests/test_read_backed_experimental.py
@@ -119,6 +119,7 @@ def test_view_to_memory(tmp_path, mtx_format, dskfmt):
     assert_equal(adata[:, subset_var], remote[:, subset_var].to_memory())
 
 
+@pytest.mark.skipif(not find_spec("xarray"), reason="Xarray is not installed")
 def test_view_of_view_to_memory(tmp_path, mtx_format, dskfmt):
     adata = gen_adata((1000, 1000), mtx_format)
     base_pth = Path(tmp_path)

From 55f706f727827062e86361b0a40ae8a7ffe6ec60 Mon Sep 17 00:00:00 2001
From: Ilan Gold
Date: Tue, 6 Aug 2024 08:09:30 -0400
Subject: [PATCH 184/348] Update pyproject.toml

Co-authored-by: Philipp A. 
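(An aside on the three identical skip decorators added in the two patches above: they can be hoisted into a single reusable marker, which is exactly what patch 187 below ends up doing. A sketch of that form:

    from importlib.util import find_spec

    import pytest

    needs_xarray = pytest.mark.skipif(
        not find_spec("xarray"), reason="Xarray is not installed"
    )

    @needs_xarray
    def test_to_memory(tmp_path, mtx_format, dskfmt): ...

`find_spec` only checks that the module can be found, so the guard stays cheap whether or not xarray is installed.)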
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 503b59045..05d864a1d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -99,7 +99,7 @@ test = [
     "pyarrow",
     "pytest_memray",
     "pytest-mock",
-    "xarray>=2024.06.0"
+    "xarray>=2024.06.0",
 ]
 gpu = [
     "cupy",

From 6fa97f043a5a763bfa2585b373d6342ebd4c050b Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 6 Aug 2024 08:10:55 -0400
Subject: [PATCH 185/348] (fix): change unused category function from method to function

---
 src/anndata/_core/anndata.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py
index 32172b53c..4c46b9b31 100644
--- a/src/anndata/_core/anndata.py
+++ b/src/anndata/_core/anndata.py
@@ -314,8 +314,8 @@ def _init_as_view(self, adata_ref: AnnData, oidx: Index, vidx: Index):
         # fix categories
         uns = copy(adata_ref._uns)
         if settings.should_remove_unused_categories:
-            self._remove_unused_categories(adata_ref.obs, obs_sub, uns)
-            self._remove_unused_categories(adata_ref.var, var_sub, uns)
+            _remove_unused_categories(adata_ref.obs, obs_sub, uns)
+            _remove_unused_categories(adata_ref.var, var_sub, uns)
         # set attributes
         self._obs = as_view(obs_sub, view_args=(self, "obs"))
         self._var = as_view(var_sub, view_args=(self, "var"))
@@ -1038,14 +1038,6 @@ def __getitem__(self, index: Index) -> AnnData:
         oidx, vidx = self._normalize_indices(index)
         return AnnData(self, oidx=oidx, vidx=vidx, asview=True)
 
-    def _remove_unused_categories(
-        self,
-        df_full: pd.DataFrame,
-        df_sub: pd.DataFrame,
-        uns: dict[str, Any],  # types are wrong now...
-    ):
-        _remove_unused_categories(df_full, df_sub, uns)
-
     def rename_categories(self, key: str, categories: Sequence[Any]):
         """\
         Rename categories of annotation `key` in :attr:`obs`, :attr:`var`,

From eb1237c458120acd891ffe5c1386bd69a3d383be Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 6 Aug 2024 10:56:17 -0400
Subject: [PATCH 186/348] (fix): actually track keys instead of relying on `defaultdict` behavior

---
 src/anndata/tests/helpers.py           | 8 ++++++++
 tests/test_read_backed_experimental.py | 1 +
 2 files changed, 9 insertions(+)

diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py
index 7100d0f4d..18cf24ad4 100644
--- a/src/anndata/tests/helpers.py
+++ b/src/anndata/tests/helpers.py
@@ -983,12 +983,20 @@ def __getitem__(self, key):
         return super().__getitem__(key)
 
     def get_access_count(self, key):
+        # accessing a defaultdict with a missing key inserts that key,
+        # which would make it look like the key is being tracked
+        if key not in self._access_count:
+            raise ValueError(f"{key} not found among access count")
         return self._access_count[key]
 
     def get_subkeys_accessed(self, key):
+        if key not in self._accessed:
+            raise ValueError(f"{key} not found among accessed")
         return self._accessed[key]
 
     def get_accessed_keys(self, key):
+        if key not in self._accessed_keys:
+            raise ValueError(f"{key} not found among accessed keys")
         return self._accessed_keys[key]
 
     def initialize_key_trackers(self, keys_to_track):

diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py
index 4cb900ad2..2daafe3ed 100644
--- a/tests/test_read_backed_experimental.py
+++ b/tests/test_read_backed_experimental.py
@@ -55,6 +55,7 @@ def test_access_count_obs_var(tmp_path, mtx_format):
     orig.write_zarr(orig_pth)
     store = AccessTrackingStore(orig_pth)
    remote = read_backed(store)
+    store.initialize_key_trackers(["obs/cat/codes", "obs/int64", "var/int64", "X"])
    # 
a series of methods that should __not__ read in any data remote.X # the initial (non-subset) access to `X` should not read in data remote.shape From 6724c6271e02e03432112b3ea6262147ae7e0ed6 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 6 Aug 2024 10:56:35 -0400 Subject: [PATCH 187/348] (chore): test unconsolidated warning --- tests/test_read_backed_experimental.py | 32 ++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py index 2daafe3ed..cf772b557 100644 --- a/tests/test_read_backed_experimental.py +++ b/tests/test_read_backed_experimental.py @@ -37,7 +37,12 @@ def dskfmt(request): return request.param -@pytest.mark.skipif(not find_spec("xarray"), reason="Xarray is not installed") +needs_xarray = pytest.mark.skipif( + not find_spec("xarray"), reason="Xarray is not installed" +) + + +@needs_xarray def test_access_count_obs_var(tmp_path, mtx_format): base_pth = Path(tmp_path) orig_pth = base_pth / "orig.zarr" @@ -93,7 +98,7 @@ def test_access_count_obs_var(tmp_path, mtx_format): ) # never accessed -@pytest.mark.skipif(not find_spec("xarray"), reason="Xarray is not installed") +@needs_xarray def test_to_memory(tmp_path, mtx_format, dskfmt): adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) @@ -105,7 +110,7 @@ def test_to_memory(tmp_path, mtx_format, dskfmt): assert_equal(remote_to_memory, adata) -@pytest.mark.skipif(not find_spec("xarray"), reason="Xarray is not installed") +@needs_xarray def test_view_to_memory(tmp_path, mtx_format, dskfmt): adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) @@ -120,7 +125,7 @@ def test_view_to_memory(tmp_path, mtx_format, dskfmt): assert_equal(adata[:, subset_var], remote[:, subset_var].to_memory()) -@pytest.mark.skipif(not find_spec("xarray"), reason="Xarray is not installed") +@needs_xarray def test_view_of_view_to_memory(tmp_path, mtx_format, dskfmt): adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) @@ -145,3 +150,22 @@ def test_view_of_view_to_memory(tmp_path, mtx_format, dskfmt): subsetted_subsetted_adata, remote[:, subset_var][:, subset_subset_var].to_memory(), ) + + +@needs_xarray +def test_unconsolidated(tmp_path, mtx_format): + adata = gen_adata((1000, 1000), mtx_format) + base_pth = Path(tmp_path) + orig_pth = base_pth / "orig.zarr" + write = lambda x: getattr(x, "write_zarr")(orig_pth) + write(adata) + (Path(orig_pth) / ".zmetadata").unlink() + store = AccessTrackingStore(orig_pth) + store.initialize_key_trackers(["obs/.zgroup", ".zgroup"]) + with pytest.warns(UserWarning, match=r"Did not read zarr as consolidated"): + remote = read_backed(store) + remote_to_memory = remote.to_memory() + assert_equal(remote_to_memory, adata) + assert store.get_access_count("obs/.zgroup") == 1, store.get_subkeys_accessed( + "obs/.zgroup" + ) From 53796a00ec4871bad9b40809c9b69b567ebfcd8e Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Tue, 6 Aug 2024 10:57:15 -0400 Subject: [PATCH 188/348] Update pyproject.toml Co-authored-by: Philipp A. 
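(Background for the `test_unconsolidated` test added above: without consolidated metadata, opening a zarr hierarchy costs one store request per `.zgroup`/`.zattrs` key, which is what the access counts assert. A sketch using the stock zarr API, with an illustrative path:

    import zarr

    zarr.consolidate_metadata("orig.zarr")  # writes a single .zmetadata key
    g = zarr.open_consolidated("orig.zarr")  # whole hierarchy's metadata in one read

Deleting `.zmetadata`, as the test does, forces `read_backed` onto the unconsolidated fallback path and triggers the warning.)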
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 05d864a1d..f929fa703 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -99,7 +99,7 @@ test = [ "pyarrow", "pytest_memray", "pytest-mock", - "xarray>=2024.06.0", + "anndata[xarray]", ] gpu = [ "cupy", From 076b92fd7e2fe818c825e14d33e069e5e0dd8db9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 6 Aug 2024 11:00:58 -0400 Subject: [PATCH 189/348] (fix): use `test-full`/`test` --- .azure-pipelines.yml | 4 ++-- ci/scripts/min-deps.py | 2 +- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index 05b420d93..c77619c67 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -49,7 +49,7 @@ jobs: path: $(uv_cache_dir) displayName: Cache pip packages - - script: uv pip install --system --compile "anndata[dev,test] @ ." + - script: uv pip install --system --compile "anndata[dev,test-full] @ ." displayName: "Install dependencies" condition: eq(variables['DEPENDENCIES_VERSION'], 'latest') @@ -61,7 +61,7 @@ jobs: displayName: "Install minimum dependencies" condition: eq(variables['DEPENDENCIES_VERSION'], 'minimum') - - script: uv pip install -v --system --compile --pre "anndata[dev,test] @ ." "scanpy>=1.10.0rc1" + - script: uv pip install -v --system --compile --pre "anndata[dev,test-full] @ ." "scanpy>=1.10.0rc1" displayName: "Install dependencies release candidates" condition: eq(variables['DEPENDENCIES_VERSION'], 'pre-release') diff --git a/ci/scripts/min-deps.py b/ci/scripts/min-deps.py index b5ed77264..a7482e70e 100755 --- a/ci/scripts/min-deps.py +++ b/ci/scripts/min-deps.py @@ -62,7 +62,7 @@ def extract_min_deps( for extra in req.extras: extra_deps = pyproject["project"]["optional-dependencies"][extra] dependencies += map(Requirement, extra_deps) - elif req.name != "xarray": + else: yield min_dep(req) diff --git a/pyproject.toml b/pyproject.toml index f929fa703..11b8c0ef1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -99,8 +99,8 @@ test = [ "pyarrow", "pytest_memray", "pytest-mock", - "anndata[xarray]", ] +test-full = ["anndata[test]", "anndata[xarray]"] gpu = [ "cupy", "numpy<2.0.0", From 036ff3f544cde8d6c8266be96eea1b327d3264d8 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 6 Aug 2024 11:14:05 -0400 Subject: [PATCH 190/348] (fix): typing for `_gen_dataframe` --- src/anndata/_core/aligned_df.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/anndata/_core/aligned_df.py b/src/anndata/_core/aligned_df.py index 321264886..9fc6efb5b 100644 --- a/src/anndata/_core/aligned_df.py +++ b/src/anndata/_core/aligned_df.py @@ -1,6 +1,7 @@ from __future__ import annotations import warnings +from collections.abc import Mapping from functools import singledispatch from typing import TYPE_CHECKING @@ -10,13 +11,26 @@ from .._warnings import ImplicitModificationWarning if TYPE_CHECKING: - from collections.abc import Iterable, Mapping + from collections.abc import Iterable from typing import Any, Literal @singledispatch def _gen_dataframe( - anno: Mapping[str, Any], + anno: Any, + index_names: Iterable[str], + *, + source: Literal["X", "shape"], + attr: Literal["obs", "var"], + length: int | None = None, +) -> pd.DataFrame: + raise ValueError(f"Cannot convert {type(anno)} to {attr} DataFrame") + + +@_gen_dataframe.register(Mapping) +@_gen_dataframe.register(type(None)) +def _gen_dataframe_mapping( + anno: Mapping[str, Any] | None, index_names: 
Iterable[str],
    *,
    source: Literal["X", "shape"],

From 9415a14f9531e1bd315cdafe2637544be2e053be Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 6 Aug 2024 11:22:57 -0400
Subject: [PATCH 191/348] (chore): improved comments for `Dataset2D`

---
 src/anndata/experimental/backed/_xarray.py | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py
index dac3ac261..27571aaab 100644
--- a/src/anndata/experimental/backed/_xarray.py
+++ b/src/anndata/experimental/backed/_xarray.py
@@ -28,17 +28,36 @@ def get_index_dim(ds):
 class Dataset2D(Dataset):
     @property
     def index(self) -> pd.Index:
+        """:attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.index` so this ensures usability
+
+        Returns
+        -------
+        The index of the dataframe as resolved from :attr:`~xarray.Dataset.coords`.
+        """
         coord = list(self.coords.keys())[0]
         return pd.Index(self.coords[coord].data)
 
     @property
     def shape(
         self,
-    ):  # aligned mapping classes look for this for DataFrames so this ensures usability with e.g., obsm
+    ):
+        """:attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.shape` so this ensures usability
+
+        Returns
+        -------
+        The (2D) shape of the dataframe resolved from :attr:`~xarray.Dataset.sizes`.
+        """
         return [self.sizes[get_index_dim(self)], len(self)]
 
     @property
     def iloc(self):
+        """:attr:`~anndata.AnnData` internally looks for :meth:`~pandas.DataFrame.iloc` so this ensures usability
+
+        Returns
+        -------
+        Handler class for doing the iloc-style indexing using :meth:`~xarray.Dataset.isel`.
+        """
+
         class IlocGetter:
             def __init__(self, ds):
                 self._ds = ds

From b5dfaace45dbb568c07a3fe62030ce7f2f6d4402 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 6 Aug 2024 11:39:54 -0400
Subject: [PATCH 192/348] (fix): `iloc` is an `attr` not a `meth`

---
 src/anndata/experimental/backed/_xarray.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py
index 27571aaab..0e464549e 100644
--- a/src/anndata/experimental/backed/_xarray.py
+++ b/src/anndata/experimental/backed/_xarray.py
@@ -51,7 +51,7 @@ def shape(
 
     @property
     def iloc(self):
-        """:attr:`~anndata.AnnData` internally looks for :meth:`~pandas.DataFrame.iloc` so this ensures usability
+        """:attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.iloc` so this ensures usability
 
         Returns
         -------

From cff41c42ee84a48378f630aa60694235537a5c0a Mon Sep 17 00:00:00 2001
From: Ilan Gold
Date: Tue, 13 Aug 2024 10:16:17 -0400
Subject: [PATCH 193/348] (fix): release notes

---
 docs/release-notes/0.11.0.md | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/docs/release-notes/0.11.0.md b/docs/release-notes/0.11.0.md
index f95071d92..eefb7d6a8 100644
--- a/docs/release-notes/0.11.0.md
+++ b/docs/release-notes/0.11.0.md
@@ -6,13 +6,9 @@
 * Allow `axis` parameter of e.g. :func:`anndata.concat` to accept `'obs'` and `'var` {pr}`1244` {user}`flying-sheep`
 * Add `settings` object with methods for altering internally-used options, like checking for uniqueness on `obs`' index {pr}`1270` {user}`ilan-gold`
 * `scipy.sparse.csr_array` and `scipy.sparse.csc_array` are now supported when constructing `AnnData` objects {pr}`1028` {user}`ilan-gold` {user}`isaac-virshup`
-* Add `should_check_uniqueness` option to `anndata.settings` to override current behavior. Default is `True` (i.e., previous behavior). 
Please refer to the [documentation](https://anndata.readthedocs.io/en/latest/generated/anndata.settings.html) for usage. {pr}`1507` {user}`ilan-gold` -* Add {func}`~anndata.experimental.read_elem_lazy` function to handle i/o with sparse and dense arrays {pr}`1469` {user}`ilan-gold` -* Add functionality to write from GPU {class}`dask.array.Array` to disk {pr}`1550` {user}`ilan-gold` -* Add functionality to :func:`~anndata.experimental.read_elem_as_dask` to handle backed dataframes as well as a :func:`~anndata.experimental.read_backed` to handle reading in as much of the {class}`~anndata.AnnData` as possible backed {pr}`1247` {user}`ilan-gold` +* Add functionality to {func}`~anndata.experimental.read_elem_lazy` to handle backed dataframes, sparse arrays, and dense arrays, as well as a {func}`~anndata.experimental.read_backed` to handle reading in as much of the {class}`~anndata.AnnData` as possible backed {pr}`1469` {pr}`1247` {user}`ilan-gold` * Add {attr}`~anndata.settings.should_remove_unused_categories` option to {attr}`anndata.settings` to override current behavior {pr}`1340` {user}`ilan-gold` * Add {attr}`~anndata.settings.should_check_uniqueness` option to {attr}`anndata.settings` to override current behavior {pr}`1507` {user}`ilan-gold` -* Add {func}`~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {pr}`1469` {user}`ilan-gold` * Add functionality to write from GPU {class}`dask.array.Array` to disk {pr}`1550` {user}`ilan-gold` * Add ability to convert strings to categoricals on write in {meth}`~anndata.AnnData.write_h5ad` and {meth}`~anndata.AnnData.write_zarr` via `convert_strings_to_categoricals` parameter {pr}`1474` {user}` falexwolf` From 3ccbfafd7b226ebd981e86c2afecb3f27fe757b6 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 13 Aug 2024 10:27:10 -0400 Subject: [PATCH 194/348] (fix): `zarr` doc in `read_backed` --- src/anndata/experimental/backed/_io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index 988e615da..11d94cd00 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -25,9 +25,9 @@ def read_backed( """Lazily read in on-disk/in-cloud AnnData stores, including `obs` and `var`. No array data should need to be read into memory with the exception of :class:`awkward.Array` and some older-encoding string arrays. - Params - ------ - store: A store-like object to be read in. If :doc:`zarr:index`, it is best for it to be consolidated. + Parameters + ---------- + store: A store-like object to be read in. If :class:`zarr.hierarchy.Group`, it is best for it to be consolidated. Returns ------- From ff4d487bbcfc8f878b56844560c1a1ed65661080 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 13 Aug 2024 10:29:44 -0400 Subject: [PATCH 195/348] (fix): docs string --- src/anndata/experimental/backed/_io.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index 11d94cd00..a4dfa0275 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -22,16 +22,18 @@ def read_backed( store: str | Path | MutableMapping | ZarrGroup | h5py.Dataset, ) -> AnnData: - """Lazily read in on-disk/in-cloud AnnData stores, including `obs` and `var`. + """ + Lazily read in on-disk/in-cloud AnnData stores, including `obs` and `var`. 
No array data should need to be read into memory with the exception of :class:`awkward.Array` and some older-encoding string arrays. Parameters ---------- - store: A store-like object to be read in. If :class:`zarr.hierarchy.Group`, it is best for it to be consolidated. + store + A store-like object to be read in. If :class:`zarr.hierarchy.Group`, it is best for it to be consolidated. Returns ------- - A lazily read-in AnnData object. + A lazily read-in :class:`~anndata.AnnData` object. """ is_h5 = False if isinstance(store, Path) or isinstance(store, str): From ed8fedfe1aee36539591a03580ddfa973d1d7d11 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Tue, 13 Aug 2024 10:32:18 -0400 Subject: [PATCH 196/348] (fix): wording in release note --- docs/release-notes/0.11.0.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release-notes/0.11.0.md b/docs/release-notes/0.11.0.md index eefb7d6a8..db735a8a5 100644 --- a/docs/release-notes/0.11.0.md +++ b/docs/release-notes/0.11.0.md @@ -6,7 +6,7 @@ * Allow `axis` parameter of e.g. :func:`anndata.concat` to accept `'obs'` and `'var` {pr}`1244` {user}`flying-sheep` * Add `settings` object with methods for altering internally-used options, like checking for uniqueness on `obs`' index {pr}`1270` {user}`ilan-gold` * `scipy.sparse.csr_array` and `scipy.sparse.csc_array` are now supported when constructing `AnnData` objects {pr}`1028` {user}`ilan-gold` {user}`isaac-virshup` -* Add functionality to {func}`~anndata.experimental.read_elem_lazy` to handle backed dataframes, sparse arrays, and dense arrays, as well as a {func}`~anndata.experimental.read_backed` to handle reading in as much of the {class}`~anndata.AnnData` as possible backed {pr}`1469` {pr}`1247` {user}`ilan-gold` +* Add {func}`~anndata.experimental.read_elem_lazy` to handle backed dataframes, sparse arrays, and dense arrays, as well as a {func}`~anndata.experimental.read_backed` to handle reading in as much of the {class}`~anndata.AnnData` as possible backed {pr}`1469` {pr}`1247` {user}`ilan-gold` * Add {attr}`~anndata.settings.should_remove_unused_categories` option to {attr}`anndata.settings` to override current behavior {pr}`1340` {user}`ilan-gold` * Add {attr}`~anndata.settings.should_check_uniqueness` option to {attr}`anndata.settings` to override current behavior {pr}`1507` {user}`ilan-gold` * Add functionality to write from GPU {class}`dask.array.Array` to disk {pr}`1550` {user}`ilan-gold` From 3325f38323f96be0f43a5ebd222869d3a4fd01f7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 27 Aug 2024 14:16:51 +0200 Subject: [PATCH 197/348] (chore): move `_remove_unused_categories` to static method --- src/anndata/_core/anndata.py | 56 +++++++++++----------- src/anndata/experimental/backed/_xarray.py | 4 +- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 4c46b9b31..fae60bc38 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -79,32 +79,6 @@ def _check_2d_shape(X): ) -@singledispatch -def _remove_unused_categories( - df_full: pd.DataFrame, df_sub: pd.DataFrame, uns: dict[str, Any] -): - for k in df_full: - if not isinstance(df_full[k].dtype, pd.CategoricalDtype): - continue - all_categories = df_full[k].cat.categories - with pd.option_context("mode.chained_assignment", None): - df_sub[k] = df_sub[k].cat.remove_unused_categories() - # also correct the colors... 
- color_key = f"{k}_colors" - if color_key not in uns: - continue - color_vec = uns[color_key] - if np.array(color_vec).ndim == 0: - # Make 0D arrays into 1D ones - uns[color_key] = np.array(color_vec)[(None,)] - elif len(color_vec) != len(all_categories): - # Reset colors - del uns[color_key] - else: - idx = np.where(np.isin(all_categories, df_sub[k].cat.categories))[0] - uns[color_key] = np.array(color_vec)[(idx,)] - - class AnnData(metaclass=utils.DeprecationMixinMeta): """\ An annotated data matrix. @@ -314,8 +288,8 @@ def _init_as_view(self, adata_ref: AnnData, oidx: Index, vidx: Index): # fix categories uns = copy(adata_ref._uns) if settings.should_remove_unused_categories: - _remove_unused_categories(adata_ref.obs, obs_sub, uns) - _remove_unused_categories(adata_ref.var, var_sub, uns) + self._remove_unused_categories(adata_ref.obs, obs_sub, uns) + self._remove_unused_categories(adata_ref.var, var_sub, uns) # set attributes self._obs = as_view(obs_sub, view_args=(self, "obs")) self._var = as_view(var_sub, view_args=(self, "var")) @@ -1038,6 +1012,32 @@ def __getitem__(self, index: Index) -> AnnData: oidx, vidx = self._normalize_indices(index) return AnnData(self, oidx=oidx, vidx=vidx, asview=True) + @staticmethod + @singledispatch + def _remove_unused_categories( + df_full: pd.DataFrame, df_sub: pd.DataFrame, uns: dict[str, Any] + ): + for k in df_full: + if not isinstance(df_full[k].dtype, pd.CategoricalDtype): + continue + all_categories = df_full[k].cat.categories + with pd.option_context("mode.chained_assignment", None): + df_sub[k] = df_sub[k].cat.remove_unused_categories() + # also correct the colors... + color_key = f"{k}_colors" + if color_key not in uns: + continue + color_vec = uns[color_key] + if np.array(color_vec).ndim == 0: + # Make 0D arrays into 1D ones + uns[color_key] = np.array(color_vec)[(None,)] + elif len(color_vec) != len(all_categories): + # Reset colors + del uns[color_key] + else: + idx = np.where(np.isin(all_categories, df_sub[k].cat.categories))[0] + uns[color_key] = np.array(color_vec)[(idx,)] + def rename_categories(self, key: str, categories: Sequence[Any]): """\ Rename categories of annotation `key` in :attr:`obs`, :attr:`var`, diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py index 0e464549e..e5191b847 100644 --- a/src/anndata/experimental/backed/_xarray.py +++ b/src/anndata/experimental/backed/_xarray.py @@ -4,7 +4,7 @@ import pandas as pd -from ..._core.anndata import _gen_dataframe, _remove_unused_categories +from ..._core.anndata import AnnData, _gen_dataframe from ..._core.file_backing import to_memory from ..._core.index import _subset from ..._core.views import as_view @@ -96,7 +96,7 @@ def _gen_dataframe_xr( return anno -@_remove_unused_categories.register(Dataset2D) +@AnnData._remove_unused_categories.register(Dataset2D) def _remove_unused_categories_xr( df_full: Dataset2D, df_sub: Dataset2D, uns: dict[str, Any] ): From 528026fd8272eea3894dc4bc679a35a07c99e2f0 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 27 Aug 2024 14:17:10 +0200 Subject: [PATCH 198/348] (chore): use one `isinstance` call in `coerce_arrays` --- src/anndata/_core/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_core/storage.py b/src/anndata/_core/storage.py index f280ecd33..ca03bf842 100644 --- a/src/anndata/_core/storage.py +++ b/src/anndata/_core/storage.py @@ -88,7 +88,7 @@ def __repr__(): return value # If value is one of the allowed types, return it - if isinstance(value, 
StorageType.classes()) or isinstance(value, Dataset2D): + if isinstance(value, (*StorageType.classes(), Dataset2D)): if isinstance(value, np.matrix): msg = f"{name} should not be a np.matrix, use np.ndarray instead." warnings.warn(msg, ImplicitModificationWarning) From aa0d1610eec11cc61508068f52e3441f14ba31ef Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 27 Aug 2024 14:26:49 +0200 Subject: [PATCH 199/348] (chore): clean up `read_dataframe` --- src/anndata/_io/specs/lazy_methods.py | 73 ++++++++++++++++----------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 25310d592..b9c8c62a3 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -16,9 +16,10 @@ from .registry import _LAZY_REGISTRY, IOSpec if TYPE_CHECKING: - from collections.abc import Callable, Generator, Mapping, Sequence + from collections.abc import Callable, Generator, Iterator, Mapping, Sequence from typing import Literal, ParamSpec, TypeVar + from anndata.experimental.backed._compat import xr from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray from anndata.experimental.backed._xarray import Dataset2D @@ -184,49 +185,61 @@ def read_zarr_array( return da.from_zarr(elem, chunks=chunks) -@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) -@_LAZY_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) -def read_dataframe( - elem: H5Group | ZarrGroup, - *, - _reader: LazyReader, - chunks: tuple[int, ...] | None = None, -) -> Dataset2D: +def _gen_xarray_dict_itetator_from_elems( + elem_dict: dict[str, (DaskArray | Dataset2D | CategoricalArray | MaskedArray)], + index_label: str, + index_key: str, + index: ArrayStorageType, +) -> Iterator[tuple[str, xr.DataArray]]: from anndata.experimental.backed._compat import xr from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray - from anndata.experimental.backed._xarray import Dataset2D - iter_object = [(k, elem[k]) for k in elem.attrs["column-order"]] + [ - (elem.attrs["_index"], elem[elem.attrs["_index"]]) - ] - d = {k: _reader.read_elem(v) for k, v in iter_object} - d_with_xr = {} - elem_name = get_elem_name(elem) - index_label = f'{elem_name.replace("/", "")}_names' - index = d[elem.attrs["_index"]] # no sense in reading this in multiple times - for k in d: - v = d[k] - if isinstance(v, DaskArray) and k != elem.attrs["_index"]: - d_with_xr[k] = xr.DataArray(v, coords=[index], dims=[index_label], name=k) - elif ( - isinstance(v, CategoricalArray) or isinstance(v, MaskedArray) - ) and k != elem.attrs["_index"]: + for k, v in elem_dict.items(): + data_array_name = k + if isinstance(v, DaskArray) and k != index_key: + data_array = xr.DataArray(v, coords=[index], dims=[index_label], name=k) + elif isinstance(v, (CategoricalArray, MaskedArray)) and k != index_key: variable = xr.Variable( data=xr.core.indexing.LazilyIndexedArray(v), dims=[index_label] ) - d_with_xr[k] = xr.DataArray( + data_array = xr.DataArray( variable, coords=[index], dims=[index_label], name=k, ) - elif k == elem.attrs["_index"]: - d_with_xr[index_label] = xr.DataArray( + elif k == index_key: + data_array = xr.DataArray( v, coords=[v], dims=[index_label], name=index_label ) + data_array_name = index_label else: - d_with_xr[k] = v - return Dataset2D(d_with_xr) + raise ValueError(f"Could not read {k}: {v} from into xarray Dataset2D") + yield data_array_name, data_array + + 
+@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) +@_LAZY_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) +def read_dataframe( + elem: H5Group | ZarrGroup, + *, + _reader: LazyReader, + chunks: tuple[int, ...] | None = None, +) -> Dataset2D: + from anndata.experimental.backed._xarray import Dataset2D + + elem_dict = { + k: _reader.read_elem(elem[k]) + for k in [*elem.attrs["column-order"], elem.attrs["_index"]] + } + elem_name = get_elem_name(elem) + index_label = f'{elem_name.replace("/", "")}_names' + index_key = elem.attrs["_index"] + index = elem_dict[index_key] # no sense in reading this in multiple times + elem_xarray_dict = dict( + _gen_xarray_dict_itetator_from_elems(elem_dict, index_label, index_key, index) + ) + return Dataset2D(elem_xarray_dict) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) From 41e3038ec38cfce2a9542bb77fb44f9846b9eba0 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 27 Aug 2024 14:30:23 +0200 Subject: [PATCH 200/348] (chore): handle case where `chunks` is not needed --- src/anndata/_io/specs/lazy_methods.py | 3 --- src/anndata/_io/specs/registry.py | 5 ++++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index b9c8c62a3..faf6589e5 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -224,7 +224,6 @@ def read_dataframe( elem: H5Group | ZarrGroup, *, _reader: LazyReader, - chunks: tuple[int, ...] | None = None, ) -> Dataset2D: from anndata.experimental.backed._xarray import Dataset2D @@ -248,7 +247,6 @@ def read_categorical( elem: H5Group | ZarrGroup, *, _reader: LazyReader, - chunks: tuple[int, ...] | None = None, ) -> CategoricalArray: from anndata.experimental.backed._lazy_arrays import CategoricalArray @@ -264,7 +262,6 @@ def read_nullable( *, encoding_type: str, _reader: LazyReader, - chunks: tuple[int, ...] | None = None, ) -> MaskedArray: from anndata.experimental.backed._lazy_arrays import MaskedArray diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 72347dd7f..f959bf505 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -1,5 +1,6 @@ from __future__ import annotations +import inspect import warnings from collections.abc import Mapping from dataclasses import dataclass @@ -301,7 +302,9 @@ def read_elem( if self.callback is not None: msg = "Dask reading does not use a callback. Ignoring callback." 
warnings.warn(msg, stacklevel=2) - return read_func(elem, chunks=chunks) + if "chunks" in inspect.signature(read_func).parameters: + return read_func(elem, chunks=chunks) + return read_func(elem) class Writer: From dc5c6e637d4cb1d6fa4740f7b30c3b27d1a1fa0b Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 27 Aug 2024 14:40:11 +0200 Subject: [PATCH 201/348] (chore): make reusable `LazyDataStructures` --- src/anndata/_io/specs/lazy_methods.py | 4 ++-- src/anndata/_io/specs/registry.py | 3 ++- src/anndata/_types.py | 7 +++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index faf6589e5..e98034199 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -25,7 +25,7 @@ from ..._core.sparse_dataset import CSCDataset, CSRDataset from ..._types import ArrayStorageType, StorageType - from .registry import DaskReader, LazyReader + from .registry import DaskReader, LazyDataStructures, LazyReader BlockInfo = Mapping[ Literal[None], @@ -186,7 +186,7 @@ def read_zarr_array( def _gen_xarray_dict_itetator_from_elems( - elem_dict: dict[str, (DaskArray | Dataset2D | CategoricalArray | MaskedArray)], + elem_dict: dict[str, LazyDataStructures], index_label: str, index_key: str, index: ArrayStorageType, diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index f959bf505..3db44f0c3 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -30,6 +30,7 @@ T = TypeVar("T") W = TypeVar("W", bound=_WriteInternal) + LazyDataStructures = DaskArray | Dataset2D | CategoricalArray | MaskedArray # TODO: This probably should be replaced by a hashable Mapping due to conversion b/w "_" and "-" @@ -292,7 +293,7 @@ def read_elem( elem: StorageType, modifiers: frozenset[str] = frozenset(), chunks: tuple[int, ...] | None = None, - ) -> DaskArray | Dataset2D | CategoricalArray | MaskedArray: + ) -> LazyDataStructures: """Read a dask element from a store. See exported function for more details.""" iospec = get_spec(elem) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 2fdb8c03e..07470a496 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -31,8 +31,7 @@ from collections.abc import Mapping from typing import Any, TypeAlias - from anndata._io.specs.registry import LazyReader - from anndata.experimental.backed._xarray import Dataset2D + from anndata._io.specs.registry import LazyDataStructures, LazyReader from ._io.specs.registry import IOSpec, Reader, Writer @@ -93,7 +92,7 @@ def __call__(self, elem: SCon, *, _reader: Reader) -> CovariantInMemoryType: ... class _ReadLazyInternal(Protocol[SCon]): def __call__( self, elem: SCon, *, _reader: LazyReader, chunks: tuple[int, ...] | None = None - ) -> DaskArray | Dataset2D: ... + ) -> LazyDataStructures: ... class Read(Protocol[SCon, CovariantInMemoryType]): @@ -114,7 +113,7 @@ def __call__(self, elem: SCon) -> CovariantInMemoryType: class ReadLazy(Protocol[SCon]): def __call__( self, elem: SCon, *, chunks: tuple[int, ...] | None = None - ) -> DaskArray | Dataset2D: + ) -> LazyDataStructures: """Low-level reading function for a lazy element. 
Parameters From 4edd279638ad9bd94dadfbfa770358c894e06a93 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 27 Aug 2024 14:46:31 +0200 Subject: [PATCH 202/348] (chore): use `Path.suffix` --- src/anndata/experimental/backed/_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index a4dfa0275..bca5f89a9 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -38,7 +38,7 @@ def read_backed( is_h5 = False if isinstance(store, Path) or isinstance(store, str): store = str(store) - if store.endswith("h5ad"): + if Path(store).suffix == ".h5ad": is_h5 = True has_keys = True # true if consolidated or h5ad From 969c6afebc5a7927e34decbbf18def4d9743b036 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 27 Aug 2024 14:46:42 +0200 Subject: [PATCH 203/348] (chore): `msg` for `warnings` --- src/anndata/experimental/backed/_io.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index bca5f89a9..1a69c67a5 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -48,9 +48,8 @@ def read_backed( try: f = zarr.open_consolidated(store, mode="r") except KeyError: - warnings.warn( - "Did not read zarr as consolidated. Consider consolidating your metadata." - ) + msg = "Did not read zarr as consolidated. Consider consolidating your metadata." + warnings.warn(msg) has_keys = False f = zarr.open(store, mode="r") else: From 2a31ab8fc0a17e4544c716feb7494ea09ce7c1d3 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 27 Aug 2024 14:46:48 +0200 Subject: [PATCH 204/348] (chore): remove erroneous `Union` in `TypeVar` --- src/anndata/experimental/backed/_lazy_arrays.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 0b1504e8a..b71040921 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -1,7 +1,7 @@ from __future__ import annotations from functools import singledispatchmethod -from typing import TYPE_CHECKING, Generic, TypeVar, Union +from typing import TYPE_CHECKING, Generic, TypeVar import pandas as pd @@ -16,7 +16,7 @@ from anndata._core.index import Index -K = TypeVar("K", bound=Union[H5Array, ZarrArray]) +K = TypeVar("K", H5Array, ZarrArray) class ZarrOrHDF5Wrapper(ZarrArrayWrapper, Generic[K]): From 2521ff8cc274b4fad7ae8fb60603ec9072f626b7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 27 Aug 2024 15:28:02 +0200 Subject: [PATCH 205/348] (fix): use `cached_property` for accessing `dtype` + test --- .../experimental/backed/_lazy_arrays.py | 37 +++---------------- src/anndata/tests/helpers.py | 1 + tests/test_read_backed_experimental.py | 21 +++++++++-- 3 files changed, 23 insertions(+), 36 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index b71040921..23fd9c4e0 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -1,6 +1,6 @@ from __future__ import annotations -from functools import singledispatchmethod +from functools import cached_property, singledispatchmethod from typing import TYPE_CHECKING, Generic, TypeVar import pandas as pd @@ -43,34 +43,6 @@ def __getitem__(self, key): ) -# Prevents first 
access from having to load the categories array -class CategoricalDtypeAccessor: - def __init__(self, categories: ZarrArray | H5Array, ordered: bool): - self._categories = categories - self._ordered = ordered - self._dtype = None - - def __get__(self, obj, objtype=None): - return self.dtype - - @property - def dtype(self): - if self._dtype is None: - self._dtype = pd.CategoricalDtype( - categories=self._categories, ordered=self._ordered - ) - return self._dtype - - def __getattr__(self, name: str): - return getattr(self.dtype, name) - - def __repr__(self): - return repr(self.dtype) - - def __str__(self) -> str: - return str(self.dtype) - - class CategoricalArray(BackendArray): def __init__( self, @@ -85,9 +57,6 @@ def __init__( self._categories_cache = None self._codes = ZarrOrHDF5Wrapper[type(codes)](codes) self.shape = self._codes.shape - self.dtype = CategoricalDtypeAccessor( - categories=self._categories, ordered=self._ordered - ) @property def categories(self): # __slots__ and cached_property are incompatible @@ -115,6 +84,10 @@ def __getitem__( ) return xr.core.extension_array.PandasExtensionArray(categorical_array) + @cached_property + def dtype(self): + return pd.CategoricalDtype(categories=self._categories, ordered=self._ordered) + class MaskedArray(BackendArray): def __init__( diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index 18cf24ad4..fd3501273 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -1003,6 +1003,7 @@ def initialize_key_trackers(self, keys_to_track): for k in keys_to_track: self._access_count[k] = 0 self._accessed_keys[k] = [] + self._accessed[k] = set() def reset_key_trackers(self): self.initialize_key_trackers(self._access_count.keys()) diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py index cf772b557..891bc12de 100644 --- a/tests/test_read_backed_experimental.py +++ b/tests/test_read_backed_experimental.py @@ -60,17 +60,30 @@ def test_access_count_obs_var(tmp_path, mtx_format): orig.write_zarr(orig_pth) store = AccessTrackingStore(orig_pth) remote = read_backed(store) - store.initialize_key_trackers(["obs/cat/codes", "obs/int64", "var/int64", "X"]) + store.initialize_key_trackers( + ["obs/cat/codes", "obs/cat/categories", "obs/int64", "var/int64", "X"] + ) # a series of methods that should __not__ read in any data remote.X # the initial (non-subset) access to `X` should not read in data remote.shape remote.var remote.obs remote.obs["int64"] - remote.var["int64"] - assert store.get_access_count("obs/cat/codes") == 0, store.get_subkeys_accessed( - "obs/cat/codes" + remote.obs["int64"] + remote.obs["cat"] + assert store.get_access_count("obs/int64") == 0, store.get_subkeys_accessed( + "obs/int64" ) + assert ( + store.get_access_count("obs/cat/categories") == 0 + ), store.get_subkeys_accessed("obs/cat/categories") + # This should only cause categories to be read in once + remote.obs["cat"].dtype + remote.obs["cat"].dtype + remote.obs["cat"].dtype + assert ( + store.get_access_count("obs/cat/categories") == 1 + ), store.get_subkeys_accessed("obs/cat/categories") subset = remote[ (remote.obs["cat"] == "a").data, : ] # `.data` for xarray, but should we handle internally? 
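
A note on the `dtype` change in the patch above: `functools.cached_property` runs the getter on the first attribute access, stores the result in the instance `__dict__`, and serves every later access from that cache, which is exactly what the new test assertions rely on (three `.dtype` accesses, a single read of `obs/cat/categories`). A minimal sketch of the mechanism; the `Loader` class and its counter are illustrative, not part of the patch:

    from functools import cached_property

    class Loader:
        loads = 0  # counts simulated store reads

        @cached_property
        def dtype(self):
            # stands in for reading the categories array off disk
            type(self).loads += 1
            return ("a", "b", "c")

    x = Loader()
    x.dtype; x.dtype; x.dtype  # getter body runs only on the first access
    assert Loader.loads == 1   # mirrors the `== 1` access-count assertion above

The one constraint (alluded to by the "__slots__ and cached_property are incompatible" comment removed in the next patch) is that the instance needs a `__dict__` for the cache to live in.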
From 628f9fc0942724178ce7a058678a04920a18c67e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 27 Aug 2024 16:15:38 +0200 Subject: [PATCH 206/348] (refactor): use `cached_property` for `categories` --- .../experimental/backed/_lazy_arrays.py | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 23fd9c4e0..2374cd3e7 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -58,18 +58,15 @@ def __init__( self._codes = ZarrOrHDF5Wrapper[type(codes)](codes) self.shape = self._codes.shape - @property - def categories(self): # __slots__ and cached_property are incompatible - if self._categories_cache is None: - if isinstance(self._categories, ZarrArray): - self._categories_cache = self._categories[...] - else: - if ( - "read_dataset" not in dir() - ): # avoid circular dependency, not sure what caused this all of a sudden after merging https://github.com/scverse/anndata/pull/949/commits/dc9f12fcbca977841e967c8414b9f1032e069250 - from ..._io.h5ad import read_dataset - self._categories_cache = read_dataset(self._categories) - return self._categories_cache + @cached_property + def categories(self): + if isinstance(self._categories, ZarrArray): + return self._categories[...] + if ( + "read_dataset" not in dir() + ): # avoid circular dependency, not sure what caused this all of a sudden after merging https://github.com/scverse/anndata/pull/949/commits/dc9f12fcbca977841e967c8414b9f1032e069250 + from ..._io.h5ad import read_dataset + return read_dataset(self._categories) def __getitem__( self, key: xr.core.indexing.ExplicitIndexer From ff9412aeefea7653153843723178518bc306b347 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 27 Aug 2024 16:17:58 +0200 Subject: [PATCH 207/348] (refactor): use guard clause in `__getitem__` better --- src/anndata/experimental/backed/_lazy_arrays.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 2374cd3e7..5eb926f09 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -76,9 +76,7 @@ def __getitem__( codes=codes, categories=self.categories, ordered=self._ordered ) if settings.should_remove_unused_categories: - return xr.core.extension_array.PandasExtensionArray( - categorical_array.remove_unused_categories() - ) + categorical_array = categorical_array.remove_unused_categories() return xr.core.extension_array.PandasExtensionArray(categorical_array) @cached_property From 36d57befad15eaa4dfeaa782c932cd21d13b2f66 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 27 Aug 2024 16:31:38 +0200 Subject: [PATCH 208/348] (chore): type `get_index_dim` --- src/anndata/experimental/backed/_xarray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py index e5191b847..197113772 100644 --- a/src/anndata/experimental/backed/_xarray.py +++ b/src/anndata/experimental/backed/_xarray.py @@ -11,14 +11,14 @@ from ._compat import Dataset if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Hashable, Iterable from typing import Any, Literal from ..._core.index import Index from ._compat import DataArray -def get_index_dim(ds): +def get_index_dim(ds: DataArray) -> Hashable: 
assert ( len(ds.sizes) == 1 ), f"xarray Dataset should not have more than 1 dims, found {len(ds)}" From 51610b1a1bb7c9635b66d05d3a6338179b002113 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 27 Aug 2024 16:31:46 +0200 Subject: [PATCH 209/348] (fix): `shape` return type --- src/anndata/experimental/backed/_xarray.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py index 197113772..90fba4668 100644 --- a/src/anndata/experimental/backed/_xarray.py +++ b/src/anndata/experimental/backed/_xarray.py @@ -38,16 +38,14 @@ def index(self) -> pd.Index: return pd.Index(self.coords[coord].data) @property - def shape( - self, - ): + def shape(self) -> tuple[int, int]: """:attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.shape` so this ensures usability Returns ------- The (2D) shape of the dataframe resolved from :attr:`~xarray.Dataset.sizes`. """ - return [self.sizes[get_index_dim(self)], len(self)] + return (self.sizes[get_index_dim(self)], len(self)) @property def iloc(self): From ba8d1470c8ad7c522a2ada24afe8fa0e232cd1c9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 27 Aug 2024 16:33:48 +0200 Subject: [PATCH 210/348] (refactor): `_subset` guard clause --- src/anndata/experimental/backed/_xarray.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py index 90fba4668..c792b4deb 100644 --- a/src/anndata/experimental/backed/_xarray.py +++ b/src/anndata/experimental/backed/_xarray.py @@ -70,10 +70,9 @@ def __getitem__(self, idx): @_subset.register(Dataset2D) def _(a: DataArray, subset_idx: Index): key = get_index_dim(a) - if ( - isinstance(subset_idx, tuple) and len(subset_idx) == 1 - ): # xarray seems to have some code looking for a second entry in tuples - return a.isel(**{key: subset_idx[0]}) + # xarray seems to have some code looking for a second entry in tuples + if isinstance(subset_idx, tuple) and len(subset_idx) == 1: + subset_idx = subset_idx[0] return a.isel(**{key: subset_idx}) From 2cf126224cfa2509f3afb72f5b950b2f57de8740 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 27 Aug 2024 16:34:46 +0200 Subject: [PATCH 211/348] (fix): use `Counter` --- src/anndata/tests/helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index fd3501273..68db0be97 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -3,7 +3,7 @@ import random import re import warnings -from collections import defaultdict +from collections import Counter, defaultdict from collections.abc import Mapping from contextlib import contextmanager from functools import partial, singledispatch, wraps @@ -970,7 +970,7 @@ def shares_memory_sparse(x, y): class AccessTrackingStore(zarr.DirectoryStore): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self._access_count = defaultdict(int) + self._access_count = Counter() self._accessed = defaultdict(set) self._accessed_keys = defaultdict(list) From b1feb6f3b060a7691eb1d3d507caa71579ad4ef6 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 27 Aug 2024 16:48:44 +0200 Subject: [PATCH 212/348] (refactor): `fix_known_differences` usage of `as_type` --- tests/test_concatenate.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/test_concatenate.py b/tests/test_concatenate.py index 
d7b3f4767..607b5b613 100644 --- a/tests/test_concatenate.py +++ b/tests/test_concatenate.py @@ -149,12 +149,11 @@ def fix_known_differences(orig, result, backwards_compat=True): result.obs.drop(columns=["batch"], inplace=True) # Possibly need to fix this, ordered categoricals lose orderedness - for k, dtype in orig.obs.dtypes.items(): - if isinstance(dtype, pd.CategoricalDtype) and dtype.ordered: - result.obs[k] = result.obs[k].astype(dtype) - for k, dtype in orig.obsm["df"].dtypes.items(): - if isinstance(dtype, pd.CategoricalDtype) and dtype.ordered: - result.obsm["df"][k] = result.obsm["df"][k].astype(dtype) + for get_df in [lambda k: getattr(k, "obs"), lambda k: getattr(k, "obsm")["df"]]: + str_to_df_converted = get_df(result) + for k, dtype in get_df(orig).dtypes.items(): + if isinstance(dtype, pd.CategoricalDtype) and dtype.ordered: + str_to_df_converted[k] = str_to_df_converted[k].astype(dtype) return orig, result From ab3e7185e4d873a650eef0bbae42d7f5cd38721e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 28 Aug 2024 14:26:12 +0200 Subject: [PATCH 213/348] (chore): fragment --- docs/release-notes/1247.feature.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/release-notes/1247.feature.md diff --git a/docs/release-notes/1247.feature.md b/docs/release-notes/1247.feature.md new file mode 100644 index 000000000..439c5a9d6 --- /dev/null +++ b/docs/release-notes/1247.feature.md @@ -0,0 +1 @@ +Add {func}`~anndata.experimental.read_elem_lazy` (in place of `read_elem_as_dask`) to handle backed dataframes, sparse arrays, and dense arrays, as well as a {func}`~anndata.experimental.read_backed` to handle reading in as much of the on-disk data as possible to produce a {class}`~anndata.AnnData` object {user}`ilan-gold` From 4412710007dd077e03bb1176f99fe3b36b742d46 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 28 Aug 2024 14:36:01 +0200 Subject: [PATCH 214/348] (chore): fix the generic problem --- src/anndata/experimental/backed/_lazy_arrays.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 5eb926f09..374ec5d7d 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -1,6 +1,6 @@ from __future__ import annotations -from functools import cached_property, singledispatchmethod +from functools import cached_property from typing import TYPE_CHECKING, Generic, TypeVar import pandas as pd @@ -20,12 +20,9 @@ class ZarrOrHDF5Wrapper(ZarrArrayWrapper, Generic[K]): - @singledispatchmethod # type: ignore - def __init__(self, array: ZarrArray): - return super().__init__(array) - - @__init__.register - def _(self, array: H5Array): + def __init__(self, array: K): + if isinstance(array, ZarrArray): + return super().__init__(array) self._array = array self.shape = self._array.shape self.dtype = self._array.dtype @@ -91,8 +88,8 @@ def __init__( dtype_str: str, mask: ZarrArray | H5Array | None = None, ): - self._mask = ZarrOrHDF5Wrapper[type(mask)](mask) - self._values = ZarrOrHDF5Wrapper[type(values)](values) + self._mask = ZarrOrHDF5Wrapper(mask) + self._values = ZarrOrHDF5Wrapper(values) self._dtype_str = dtype_str self.shape = self._values.shape self.dtype = pd.api.types.pandas_dtype(self._values.dtype) From d3401b2e5c649f23f9fbb1857c6a847b4df71bcd Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 28 Aug 2024 16:16:13 +0200 Subject: [PATCH 215/348] (chore): clean up tests --- 
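Notes (kept below the three-dash line, so `git am` leaves them out of the commit message): the switch to `scope="session"` means the generated AnnData objects are built and written once and then shared by every test that requests the fixture, instead of being re-created per test. A small sketch of the pattern, using illustrative names (`big_store`, `test_a`, `test_b`) that are not taken from this diff:

    import pytest

    @pytest.fixture(scope="session")
    def big_store(tmp_path_factory):
        # tmp_path_factory is session-scoped, unlike the per-test tmp_path
        path = tmp_path_factory.mktemp("data") / "orig.zarr"
        path.mkdir()  # stand-in for the expensive write_zarr call
        return path

    def test_a(big_store):
        assert big_store.exists()  # same directory as in test_b

    def test_b(big_store):
        assert big_store.exists()  # no second write happened

The usual caveat is that session-scoped return values are shared, so tests must not mutate them; that is presumably why `adata_remote_with_store_tall_skinny`, whose store tracks per-test access counts, stays function-scoped below.
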
tests/test_read_backed_experimental.py | 110 +++++++++++++------------ 1 file changed, 58 insertions(+), 52 deletions(-) diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py index 891bc12de..50405d051 100644 --- a/tests/test_read_backed_experimental.py +++ b/tests/test_read_backed_experimental.py @@ -22,30 +22,31 @@ @pytest.fixture( params=[sparse.csr_matrix, sparse.csc_matrix, np.array, as_dense_dask_array], ids=["scipy-csr", "scipy-csc", "np-array", "dask_array"], + scope="session", ) def mtx_format(request): return request.param -@pytest.fixture(params=[sparse.csr_matrix, sparse.csc_matrix]) -def sparse_format(request): - return request.param - - -@pytest.fixture(params=["zarr", "h5ad"]) +@pytest.fixture(params=["zarr", "h5ad"], scope="session") def dskfmt(request): return request.param -needs_xarray = pytest.mark.skipif( - not find_spec("xarray"), reason="Xarray is not installed" -) +@pytest.fixture(scope="session") +def adata_remote_orig( + tmp_path_factory, dskfmt: str, mtx_format +) -> tuple[AnnData, AnnData]: + orig_path = Path(tmp_path_factory.mktemp(f"orig.{dskfmt}")) + orig = gen_adata((1000, 1000), mtx_format) + orig.write_zarr(orig_path) + remote = read_backed(orig_path) + return remote, orig -@needs_xarray -def test_access_count_obs_var(tmp_path, mtx_format): - base_pth = Path(tmp_path) - orig_pth = base_pth / "orig.zarr" +@pytest.fixture +def adata_remote_with_store_tall_skinny(tmp_path_factory, mtx_format): + orig_path = Path(tmp_path_factory.mktemp("orig.zarr")) M = 1000000 # forces zarr to chunk `obs` columns multiple ways - that way 1 access to `int64` below is actually only one access N = 5 obs_names = pd.Index(f"cell{i}" for i in range(M)) @@ -57,9 +58,20 @@ def test_access_count_obs_var(tmp_path, mtx_format): var=var, X=mtx_format(np.random.binomial(100, 0.005, (M, N)).astype(np.float32)), ) - orig.write_zarr(orig_pth) - store = AccessTrackingStore(orig_pth) + orig.write_zarr(orig_path) + store = AccessTrackingStore(orig_path) remote = read_backed(store) + return remote, store + + +needs_xarray = pytest.mark.skipif( + not find_spec("xarray"), reason="Xarray is not installed" +) + + +@needs_xarray +def test_access_count_obs_var(adata_remote_with_store_tall_skinny): + remote, store = adata_remote_with_store_tall_skinny store.initialize_key_trackers( ["obs/cat/codes", "obs/cat/categories", "obs/int64", "var/int64", "X"] ) @@ -77,13 +89,6 @@ def test_access_count_obs_var(tmp_path, mtx_format): assert ( store.get_access_count("obs/cat/categories") == 0 ), store.get_subkeys_accessed("obs/cat/categories") - # This should only cause categories to be read in once - remote.obs["cat"].dtype - remote.obs["cat"].dtype - remote.obs["cat"].dtype - assert ( - store.get_access_count("obs/cat/categories") == 1 - ), store.get_subkeys_accessed("obs/cat/categories") subset = remote[ (remote.obs["cat"] == "a").data, : ] # `.data` for xarray, but should we handle internally? 
@@ -112,42 +117,43 @@ def test_access_count_obs_var(tmp_path, mtx_format): @needs_xarray -def test_to_memory(tmp_path, mtx_format, dskfmt): - adata = gen_adata((1000, 1000), mtx_format) - base_pth = Path(tmp_path) - orig_pth = base_pth / f"orig.{dskfmt}" - write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth) - write(adata) - remote = read_backed(orig_pth) +def test_access_count_dtype(adata_remote_with_store_tall_skinny): + remote, store = adata_remote_with_store_tall_skinny + store.initialize_key_trackers(["obs/cat/categories"]) + assert ( + store.get_access_count("obs/cat/categories") == 0 + ), store.get_subkeys_accessed("obs/cat/categories") + # This should only cause categories to be read in once + remote.obs["cat"].dtype + remote.obs["cat"].dtype + remote.obs["cat"].dtype + assert ( + store.get_access_count("obs/cat/categories") == 1 + ), store.get_subkeys_accessed("obs/cat/categories") + + +@needs_xarray +def test_to_memory(adata_remote_orig): + remote, orig = adata_remote_orig remote_to_memory = remote.to_memory() - assert_equal(remote_to_memory, adata) + assert_equal(remote_to_memory, orig) @needs_xarray -def test_view_to_memory(tmp_path, mtx_format, dskfmt): - adata = gen_adata((1000, 1000), mtx_format) - base_pth = Path(tmp_path) - orig_pth = base_pth / f"orig.{dskfmt}" - write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth) - write(adata) - remote = read_backed(orig_pth) - subset_obs = adata.obs["obs_cat"] == "a" - assert_equal(adata[subset_obs, :], remote[subset_obs, :].to_memory()) +def test_view_to_memory(adata_remote_orig): + remote, orig = adata_remote_orig + subset_obs = orig.obs["obs_cat"] == "a" + assert_equal(orig[subset_obs, :], remote[subset_obs, :].to_memory()) - subset_var = adata.var["var_cat"] == "a" - assert_equal(adata[:, subset_var], remote[:, subset_var].to_memory()) + subset_var = orig.var["var_cat"] == "a" + assert_equal(orig[:, subset_var], remote[:, subset_var].to_memory()) @needs_xarray -def test_view_of_view_to_memory(tmp_path, mtx_format, dskfmt): - adata = gen_adata((1000, 1000), mtx_format) - base_pth = Path(tmp_path) - orig_pth = base_pth / f"orig.{dskfmt}" - write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth) - write(adata) - remote = read_backed(orig_pth) - subset_obs = (adata.obs["obs_cat"] == "a") | (adata.obs["obs_cat"] == "b") - subsetted_adata = adata[subset_obs, :] +def test_view_of_view_to_memory(adata_remote_orig): + remote, orig = adata_remote_orig + subset_obs = (orig.obs["obs_cat"] == "a") | (orig.obs["obs_cat"] == "b") + subsetted_adata = orig[subset_obs, :] subset_subset_obs = subsetted_adata.obs["obs_cat"] == "b" subsetted_subsetted_adata = subsetted_adata[subset_subset_obs, :] assert_equal( @@ -155,8 +161,8 @@ def test_view_of_view_to_memory(tmp_path, mtx_format, dskfmt): remote[subset_obs, :][subset_subset_obs, :].to_memory(), ) - subset_var = (adata.var["var_cat"] == "a") | (adata.var["var_cat"] == "b") - subsetted_adata = adata[:, subset_var] + subset_var = (orig.var["var_cat"] == "a") | (orig.var["var_cat"] == "b") + subsetted_adata = orig[:, subset_var] subset_subset_var = subsetted_adata.var["var_cat"] == "b" subsetted_subsetted_adata = subsetted_adata[:, subset_subset_var] assert_equal( From 3b6d19458ae108b66687d7d52eb88bb248c93bc1 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Fri, 30 Aug 2024 11:12:25 +0200 Subject: [PATCH 216/348] Update tests/test_read_backed_experimental.py Co-authored-by: Philipp A. 
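
For context on the "(from 4 chunks)" wording: the tall-skinny fixture writes 1,000,000 `obs` rows, and zarr stores the `obs/cat/codes` column in four chunks, so a whole-column boolean mask has to read every one of them; that is the `== 4` in the access-count assertion. The chunk layout can be inspected by opening the store directly. A sketch, where "orig.zarr" stands in for the fixture's temporary path and the expected count reflects this particular store, not a universal zarr default:

    import zarr

    g = zarr.open("orig.zarr", mode="r")
    codes = g["obs/cat/codes"]
    print(codes.shape, codes.chunks)  # total shape vs. per-chunk shape
    print(codes.nchunks)              # expect 4 here, matching the assertion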
--- tests/test_read_backed_experimental.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py index 50405d051..de5a57c9e 100644 --- a/tests/test_read_backed_experimental.py +++ b/tests/test_read_backed_experimental.py @@ -102,7 +102,7 @@ def test_access_count_obs_var(adata_remote_with_store_tall_skinny): assert store.get_access_count("var/int64") == 0, store.get_subkeys_accessed( "var/int64" ) - # all codes read in for subset + # all codes read in for subset (from 4 chunks) assert store.get_access_count("obs/cat/codes") == 4, store.get_subkeys_accessed( "obs/cat/codes" ) From 97eace50ea146c4ff76af903f953a23b532a686c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 2 Sep 2024 16:31:56 +0200 Subject: [PATCH 217/348] (fix): should -> shall --- src/anndata/_core/views.py | 2 +- src/anndata/experimental/backed/_lazy_arrays.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/anndata/_core/views.py b/src/anndata/_core/views.py index f40ccbf1c..7aafe3c30 100644 --- a/src/anndata/_core/views.py +++ b/src/anndata/_core/views.py @@ -306,7 +306,7 @@ def as_view_dask_array(array, view_args): @as_view.register(pd.DataFrame) def as_view_df(df, view_args): - if settings.should_remove_unused_categories: + if settings.shall_remove_unused_categories: for col in df.columns: if isinstance(df[col].dtype, pd.CategoricalDtype): with pd.option_context("mode.chained_assignment", None): diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 374ec5d7d..b5544f403 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -72,7 +72,7 @@ def __getitem__( categorical_array = pd.Categorical.from_codes( codes=codes, categories=self.categories, ordered=self._ordered ) - if settings.should_remove_unused_categories: + if settings.shall_remove_unused_categories: categorical_array = categorical_array.remove_unused_categories() return xr.core.extension_array.PandasExtensionArray(categorical_array) From 58654a8107e8c7038ab118cc04d5a179c16e4265 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Mon, 2 Sep 2024 16:33:12 +0200 Subject: [PATCH 218/348] Apply suggestions from code review Co-authored-by: Philipp A. --- src/anndata/_io/specs/lazy_methods.py | 2 +- src/anndata/_io/specs/registry.py | 2 +- src/anndata/experimental/backed/_io.py | 6 +----- src/anndata/experimental/backed/_lazy_arrays.py | 6 ++---- tests/test_read_backed_experimental.py | 12 +++++------- 5 files changed, 10 insertions(+), 18 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index e98034199..6aab4ce00 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -185,7 +185,7 @@ def read_zarr_array( return da.from_zarr(elem, chunks=chunks) -def _gen_xarray_dict_itetator_from_elems( +def _gen_xarray_dict_iterator_from_elems( elem_dict: dict[str, LazyDataStructures], index_label: str, index_key: str, diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 67dfa83f6..0218c6758 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -386,7 +386,7 @@ def read_elem(elem: StorageType) -> InMemoryElem: def read_elem_lazy( elem: StorageType, chunks: tuple[int, ...] 
| None = None
-) -> DaskArray | Dataset2D | CategoricalArray | MaskedArray:
+) -> LazyDataStructures:
     """
     Read an element from a store lazily.
diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py
index 1a69c67a5..ed28868ff 100644
--- a/src/anndata/experimental/backed/_io.py
+++ b/src/anndata/experimental/backed/_io.py
@@ -35,11 +35,7 @@ def read_backed(
     -------
         A lazily read-in :class:`~anndata.AnnData` object.
     """
-    is_h5 = False
-    if isinstance(store, Path) or isinstance(store, str):
-        store = str(store)
-        if Path(store).suffix == ".h5ad":
-            is_h5 = True
+    is_h5 = isinstance(store, (Path, str)) and Path(store).suffix == ".h5ad"
 
     has_keys = True  # true if consolidated or h5ad
     if not is_h5:
diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py
index b5544f403..72dddc180 100644
--- a/src/anndata/experimental/backed/_lazy_arrays.py
+++ b/src/anndata/experimental/backed/_lazy_arrays.py
@@ -59,10 +59,8 @@ def __init__(
     def categories(self):
         if isinstance(self._categories, ZarrArray):
             return self._categories[...]
-        if (
-            "read_dataset" not in dir()
-        ):  # avoid circular dependency, not sure what caused this all of a sudden after merging https://github.com/scverse/anndata/pull/949/commits/dc9f12fcbca977841e967c8414b9f1032e069250
-            from ..._io.h5ad import read_dataset
+        from ..._io.h5ad import read_dataset
+
         return read_dataset(self._categories)
 
     def __getitem__(
diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py
index de5a57c9e..80a6d7912 100644
--- a/tests/test_read_backed_experimental.py
+++ b/tests/test_read_backed_experimental.py
@@ -37,7 +37,7 @@ def dskfmt(request):
 def adata_remote_orig(
     tmp_path_factory, dskfmt: str, mtx_format
 ) -> tuple[AnnData, AnnData]:
-    orig_path = Path(tmp_path_factory.mktemp(f"orig.{dskfmt}"))
+    orig_path = tmp_path_factory.mktemp(f"orig.{dskfmt}")
     orig = gen_adata((1000, 1000), mtx_format)
     orig.write_zarr(orig_path)
     remote = read_backed(orig_path)
@@ -46,7 +46,7 @@ def adata_remote_orig(
 
 @pytest.fixture
 def adata_remote_with_store_tall_skinny(tmp_path_factory, mtx_format):
-    orig_path = Path(tmp_path_factory.mktemp("orig.zarr"))
+    orig_path = tmp_path_factory.mktemp("orig.zarr")
     M = 1000000  # forces zarr to chunk `obs` columns multiple ways - that way 1 access to `int64` below is actually only one access
     N = 5
     obs_names = pd.Index(f"cell{i}" for i in range(M))
@@ -174,11 +174,9 @@ def test_view_of_view_to_memory(adata_remote_orig):
 @needs_xarray
 def test_unconsolidated(tmp_path, mtx_format):
     adata = gen_adata((1000, 1000), mtx_format)
-    base_pth = Path(tmp_path)
-    orig_pth = base_pth / "orig.zarr"
-    write = lambda x: getattr(x, "write_zarr")(orig_pth)
-    write(adata)
-    (Path(orig_pth) / ".zmetadata").unlink()
+    orig_pth = tmp_pth / "orig.zarr"
+    adata.write_zarr(orig_pth)
+    (orig_pth / ".zmetadata").unlink()
     store = AccessTrackingStore(orig_pth)
     store.initialize_key_trackers(["obs/.zgroup", ".zgroup"])
     with pytest.warns(UserWarning, match=r"Did not read zarr as consolidated"):

From 67af64f2b7d20ca590150864b020e22ddcf99a4a Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 2 Sep 2024 16:50:48 +0200
Subject: [PATCH 219/348] (fix): `_gen_xarray_dict_itetator_from_elems` ->
 `_gen_xarray_dict_iterator_from_elems`

---
 src/anndata/_io/specs/lazy_methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 6aab4ce00..48feea9b4 100644
--- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -236,7 +236,7 @@ def read_dataframe( index_key = elem.attrs["_index"] index = elem_dict[index_key] # no sense in reading this in multiple times elem_xarray_dict = dict( - _gen_xarray_dict_itetator_from_elems(elem_dict, index_label, index_key, index) + _gen_xarray_dict_iterator_from_elems(elem_dict, index_label, index_key, index) ) return Dataset2D(elem_xarray_dict) From bfc2e739ac79305a0cdf7c7d8a45db4177a9dbf2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 2 Sep 2024 16:53:27 +0200 Subject: [PATCH 220/348] (feat): indexing with `DataArray` --- src/anndata/_core/index.py | 4 ++++ tests/test_read_backed_experimental.py | 7 ++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/anndata/_core/index.py b/src/anndata/_core/index.py index 6a5e2fc39..552f08ad6 100644 --- a/src/anndata/_core/index.py +++ b/src/anndata/_core/index.py @@ -50,6 +50,8 @@ def _normalize_index( | pd.Index, index: pd.Index, ) -> slice | int | np.ndarray: # ndarray of int or bool + from ..experimental.backed._compat import xr + if not isinstance(index, pd.RangeIndex): msg = "Don’t call _normalize_index with non-categorical/string names" assert index.dtype != float, msg @@ -107,6 +109,8 @@ def name_idx(i): "are not valid obs/ var names or indices." ) return positions # np.ndarray[int] + elif isinstance(indexer, xr.DataArray): + return indexer.data else: raise IndexError(f"Unknown indexer {indexer!r} of type {type(indexer)}") diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py index 80a6d7912..16a9f3333 100644 --- a/tests/test_read_backed_experimental.py +++ b/tests/test_read_backed_experimental.py @@ -1,7 +1,6 @@ from __future__ import annotations from importlib.util import find_spec -from pathlib import Path import numpy as np import pandas as pd @@ -89,9 +88,7 @@ def test_access_count_obs_var(adata_remote_with_store_tall_skinny): assert ( store.get_access_count("obs/cat/categories") == 0 ), store.get_subkeys_accessed("obs/cat/categories") - subset = remote[ - (remote.obs["cat"] == "a").data, : - ] # `.data` for xarray, but should we handle internally? + subset = remote[remote.obs["cat"] == "a", :] subset.obs["int64"] sub_subset = subset[0:10, :] sub_subset.obs["int64"] @@ -174,7 +171,7 @@ def test_view_of_view_to_memory(adata_remote_orig): @needs_xarray def test_unconsolidated(tmp_path, mtx_format): adata = gen_adata((1000, 1000), mtx_format) - orig_pth = tmp_pth / "orig.zarr" + orig_pth = tmp_path / "orig.zarr" adata.write_zarr(orig_pth) (orig_pth / ".zmetadata").unlink() store = AccessTrackingStore(orig_pth) From a1d0b892cda15ce1db0e14483de8eff7679ee8c2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 4 Sep 2024 14:29:59 +0200 Subject: [PATCH 221/348] (fix): check h5 store --- src/anndata/experimental/backed/_io.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index ed28868ff..b0e4ff0a0 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -35,7 +35,10 @@ def read_backed( ------- A lazily read-in :class:`~anndata.AnnData` object. 
""" - is_h5 = isinstance(store, (Path, str)) and Path(store).suffix == ".h5ad" + is_h5_store = isinstance(store, (h5py.Dataset, h5py.File)) + is_h5 = ( + isinstance(store, (Path, str)) and Path(store).suffix == ".h5ad" + ) or is_h5_store has_keys = True # true if consolidated or h5ad if not is_h5: @@ -49,7 +52,10 @@ def read_backed( has_keys = False f = zarr.open(store, mode="r") else: - f = h5py.File(store, mode="r") + if is_h5_store: + f = store + else: + f = h5py.File(store, mode="r") def callback(func, elem_name: str, elem, iospec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): From 5f80b61c6982bb4892f214f4709467567b495f5b Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 4 Sep 2024 16:11:09 +0200 Subject: [PATCH 222/348] (fix):check `DataArray` closer --- docs/tutorials/notebooks | 2 +- src/anndata/_core/index.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/tutorials/notebooks b/docs/tutorials/notebooks index 9e186c5c6..23e4b18db 160000 --- a/docs/tutorials/notebooks +++ b/docs/tutorials/notebooks @@ -1 +1 @@ -Subproject commit 9e186c5c694793bb04ea1397721d154d6e0b7069 +Subproject commit 23e4b18db81a5cb593a28301909128bfe7ef592d diff --git a/src/anndata/_core/index.py b/src/anndata/_core/index.py index 552f08ad6..b17d8f1d7 100644 --- a/src/anndata/_core/index.py +++ b/src/anndata/_core/index.py @@ -110,6 +110,8 @@ def name_idx(i): ) return positions # np.ndarray[int] elif isinstance(indexer, xr.DataArray): + if isinstance(indexer.data, DaskArray): + return indexer.data.compute() return indexer.data else: raise IndexError(f"Unknown indexer {indexer!r} of type {type(indexer)}") From 8ce640935d1f05b05f49648fa8e30bf7a2c5933c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 4 Sep 2024 16:23:50 +0200 Subject: [PATCH 223/348] (fix): clean up `api.md` from merge --- docs/api.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/api.md b/docs/api.md index 4c2cbaa48..0cee16d44 100644 --- a/docs/api.md +++ b/docs/api.md @@ -131,8 +131,6 @@ Low level methods for reading and writing elements of an {class}`AnnData` object .. autosummary:: :toctree: generated/ - experimental.read_elem - experimental.write_elem experimental.read_elem_lazy experimental.read_backed ``` From f9ef9f025dd5213565cda24ef746feae0125c66c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 4 Sep 2024 16:30:16 +0200 Subject: [PATCH 224/348] (fix): remove `read_elem_as_dask` docs reference --- docs/release-notes/0.11.0rc1.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release-notes/0.11.0rc1.md b/docs/release-notes/0.11.0rc1.md index 36eb354de..a394e940a 100644 --- a/docs/release-notes/0.11.0rc1.md +++ b/docs/release-notes/0.11.0rc1.md @@ -21,7 +21,7 @@ - Allow `axis` parameter of e.g. 
:func:`anndata.concat` to accept `'obs'` and `'var'` {user}`flying-sheep` ({pr}`1244`) - Add `settings` object with methods for altering internally-used options, like checking for uniqueness on `obs`' index {user}`ilan-gold` ({pr}`1270`) - Add {attr}`~anndata.settings.shall_remove_unused_categories` option to {attr}`anndata.settings` to override current behavior {user}`ilan-gold` ({pr}`1340`) -- Add :func:`~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {user}`ilan-gold` ({pr}`1469`) +- Add :func:`~anndata.experimental.read_elem_lazy` function to handle i/o with sparse and dense arrays {user}`ilan-gold` ({pr}`1469`) - Add ability to convert strings to categoricals on write in {meth}`~anndata.AnnData.write_h5ad` and {meth}`~anndata.AnnData.write_zarr` via `convert_strings_to_categoricals` parameter {user}` falexwolf` ({pr}`1474`) - Add {attr}`~anndata.settings.shall_check_uniqueness` option to {attr}`anndata.settings` to override current behavior {user}`ilan-gold` ({pr}`1507`) - Add functionality to write from GPU {class}`dask.array.Array` to disk {user}`ilan-gold` ({pr}`1550`) From 5e69a50afb407f58911b4e51ec73e9d7fe8c9b14 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 4 Sep 2024 16:31:17 +0200 Subject: [PATCH 225/348] (chore): add notebooks/read_backed_experimental --- docs/tutorials/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index f62e7967c..9fb5f3283 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -14,4 +14,5 @@ notebooks/anncollection-annloader notebooks/anndata_dask_array notebooks/awkward-arrays notebooks/{read,write}_dispatched +notebooks/read_backed_experimental ``` From 1540d2784933400837d18020ea0909548ff821bc Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 4 Sep 2024 16:34:33 +0200 Subject: [PATCH 226/348] (chore): update notebooks --- docs/tutorials/notebooks | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/notebooks b/docs/tutorials/notebooks index 23e4b18db..2fd5b13e2 160000 --- a/docs/tutorials/notebooks +++ b/docs/tutorials/notebooks @@ -1 +1 @@ -Subproject commit 23e4b18db81a5cb593a28301909128bfe7ef592d +Subproject commit 2fd5b13e259c83e95dd9330cc3a93cb54d66e42c From 371fc2beba275584a46b8bf12c7d1a7874c2b890 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 18 Sep 2024 16:12:37 +0200 Subject: [PATCH 227/348] (refactor): set `pytestmark` at the top --- tests/test_read_backed_experimental.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py index 16a9f3333..8b8112b93 100644 --- a/tests/test_read_backed_experimental.py +++ b/tests/test_read_backed_experimental.py @@ -63,12 +63,11 @@ def adata_remote_with_store_tall_skinny(tmp_path_factory, mtx_format): return remote, store -needs_xarray = pytest.mark.skipif( +pytestmark = pytest.mark.skipif( not find_spec("xarray"), reason="Xarray is not installed" ) -@needs_xarray def test_access_count_obs_var(adata_remote_with_store_tall_skinny): remote, store = adata_remote_with_store_tall_skinny store.initialize_key_trackers( @@ -113,7 +112,6 @@ def test_access_count_obs_var(adata_remote_with_store_tall_skinny): ) # never accessed -@needs_xarray def test_access_count_dtype(adata_remote_with_store_tall_skinny): remote, store = adata_remote_with_store_tall_skinny store.initialize_key_trackers(["obs/cat/categories"]) @@ -129,14 +127,12 @@ def 
test_access_count_dtype(adata_remote_with_store_tall_skinny): ), store.get_subkeys_accessed("obs/cat/categories") -@needs_xarray def test_to_memory(adata_remote_orig): remote, orig = adata_remote_orig remote_to_memory = remote.to_memory() assert_equal(remote_to_memory, orig) -@needs_xarray def test_view_to_memory(adata_remote_orig): remote, orig = adata_remote_orig subset_obs = orig.obs["obs_cat"] == "a" @@ -146,7 +142,6 @@ def test_view_to_memory(adata_remote_orig): assert_equal(orig[:, subset_var], remote[:, subset_var].to_memory()) -@needs_xarray def test_view_of_view_to_memory(adata_remote_orig): remote, orig = adata_remote_orig subset_obs = (orig.obs["obs_cat"] == "a") | (orig.obs["obs_cat"] == "b") @@ -168,7 +163,6 @@ def test_view_of_view_to_memory(adata_remote_orig): ) -@needs_xarray def test_unconsolidated(tmp_path, mtx_format): adata = gen_adata((1000, 1000), mtx_format) orig_pth = tmp_path / "orig.zarr" From 5cb2d8d8f40889df8cfa7912843a037a8aee848b Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 18 Sep 2024 16:18:34 +0200 Subject: [PATCH 228/348] (chore): clarify comment --- tests/test_read_backed_experimental.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py index 8b8112b93..dffc8f561 100644 --- a/tests/test_read_backed_experimental.py +++ b/tests/test_read_backed_experimental.py @@ -91,6 +91,7 @@ def test_access_count_obs_var(adata_remote_with_store_tall_skinny): subset.obs["int64"] sub_subset = subset[0:10, :] sub_subset.obs["int64"] + sub_subset.var["int64"] assert store.get_access_count("X") == 0, store.get_subkeys_accessed("X") assert store.get_access_count("obs/int64") == 0, store.get_subkeys_accessed( "obs/int64" @@ -106,10 +107,10 @@ def test_access_count_obs_var(adata_remote_with_store_tall_skinny): assert store.get_access_count("obs/int64") == 1, store.get_subkeys_accessed( "obs/int64" ) - # one for 0, .zmetadata handles .zarray + # .zmetadata handles .zarray so simple access does not cause any read assert store.get_access_count("var/int64") == 0, store.get_subkeys_accessed( "var/int64" - ) # never accessed + ) def test_access_count_dtype(adata_remote_with_store_tall_skinny): From b86ee6b10005d7546a0b2c981efcfd636746caad Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 18 Sep 2024 16:26:54 +0200 Subject: [PATCH 229/348] (refactor): add `assert_access_count` method for `AccessTrackingStore` --- src/anndata/tests/helpers.py | 13 +++++--- tests/test_read_backed_experimental.py | 42 +++++++------------------- 2 files changed, 19 insertions(+), 36 deletions(-) diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index ca1644ebb..0a91b10db 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -1048,7 +1048,7 @@ def __init__(self, *args, **kwargs): self._accessed = defaultdict(set) self._accessed_keys = defaultdict(list) - def __getitem__(self, key): + def __getitem__(self, key: str): for tracked in self._access_count: if tracked in key: self._access_count[tracked] += 1 @@ -1056,24 +1056,24 @@ def __getitem__(self, key): self._accessed_keys[tracked] += [key] return super().__getitem__(key) - def get_access_count(self, key): + def get_access_count(self, key: str) -> int: # access defaultdict when value is not there causes key to be there, # which causes it to be tracked if key not in self._access_count: raise ValueError(f"{key} not found among access count") return self._access_count[key] - def get_subkeys_accessed(self, 
key): + def get_subkeys_accessed(self, key: str) -> set[str]: if key not in self._accessed: raise ValueError(f"{key} not found among accessed") return self._accessed[key] - def get_accessed_keys(self, key): + def get_accessed_keys(self, key: str) -> list[str]: if key not in self._accessed_keys: raise ValueError(f"{key} not found among accessed keys") return self._accessed_keys[key] - def initialize_key_trackers(self, keys_to_track): + def initialize_key_trackers(self, keys_to_track: Collection[str]): for k in keys_to_track: self._access_count[k] = 0 self._accessed_keys[k] = [] @@ -1082,6 +1082,9 @@ def initialize_key_trackers(self, keys_to_track): def reset_key_trackers(self): self.initialize_key_trackers(self._access_count.keys()) + def assert_access_count(self, key: str, count: int): + assert self.get_access_count(key) == count, self.get_subkeys_accessed(key) + except ImportError: class AccessTrackingStore: diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py index dffc8f561..b22cacc3a 100644 --- a/tests/test_read_backed_experimental.py +++ b/tests/test_read_backed_experimental.py @@ -81,51 +81,33 @@ def test_access_count_obs_var(adata_remote_with_store_tall_skinny): remote.obs["int64"] remote.obs["int64"] remote.obs["cat"] - assert store.get_access_count("obs/int64") == 0, store.get_subkeys_accessed( - "obs/int64" - ) - assert ( - store.get_access_count("obs/cat/categories") == 0 - ), store.get_subkeys_accessed("obs/cat/categories") + store.assert_access_count("obs/int64", 0) + store.assert_access_count("obs/cat/categories", 0) subset = remote[remote.obs["cat"] == "a", :] subset.obs["int64"] sub_subset = subset[0:10, :] sub_subset.obs["int64"] sub_subset.var["int64"] - assert store.get_access_count("X") == 0, store.get_subkeys_accessed("X") - assert store.get_access_count("obs/int64") == 0, store.get_subkeys_accessed( - "obs/int64" - ) - assert store.get_access_count("var/int64") == 0, store.get_subkeys_accessed( - "var/int64" - ) + store.assert_access_count("X", 0) + store.assert_access_count("obs/int64", 0) + store.assert_access_count("var/int64", 0) # all codes read in for subset (from 4 chunks) - assert store.get_access_count("obs/cat/codes") == 4, store.get_subkeys_accessed( - "obs/cat/codes" - ) + store.assert_access_count("obs/cat/codes", 4) remote[0:10, :].obs["int64"][0:10].compute() - assert store.get_access_count("obs/int64") == 1, store.get_subkeys_accessed( - "obs/int64" - ) + store.assert_access_count("obs/int64", 1) # .zmetadata handles .zarray so simple access does not cause any read - assert store.get_access_count("var/int64") == 0, store.get_subkeys_accessed( - "var/int64" - ) + store.assert_access_count("var/int64", 0) def test_access_count_dtype(adata_remote_with_store_tall_skinny): remote, store = adata_remote_with_store_tall_skinny store.initialize_key_trackers(["obs/cat/categories"]) - assert ( - store.get_access_count("obs/cat/categories") == 0 - ), store.get_subkeys_accessed("obs/cat/categories") + store.assert_access_count("obs/cat/categories", 0) # This should only cause categories to be read in once remote.obs["cat"].dtype remote.obs["cat"].dtype remote.obs["cat"].dtype - assert ( - store.get_access_count("obs/cat/categories") == 1 - ), store.get_subkeys_accessed("obs/cat/categories") + store.assert_access_count("obs/cat/categories", 1) def test_to_memory(adata_remote_orig): @@ -175,6 +157,4 @@ def test_unconsolidated(tmp_path, mtx_format): remote = read_backed(store) remote_to_memory = remote.to_memory() 
assert_equal(remote_to_memory, adata) - assert store.get_access_count("obs/.zgroup") == 1, store.get_subkeys_accessed( - "obs/.zgroup" - ) + store.assert_access_count("obs/.zgroup", 1) From 227a3c668372a39371dad0a22a624a22ed3d42d4 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 18 Sep 2024 16:28:55 +0200 Subject: [PATCH 230/348] (refactor): `read_backed`->`read_lazy` --- src/anndata/experimental/backed/_io.py | 2 +- tests/test_read_backed_experimental.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index b0e4ff0a0..b339290e3 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -19,7 +19,7 @@ from ...compat import ZarrGroup -def read_backed( +def read_lazy( store: str | Path | MutableMapping | ZarrGroup | h5py.Dataset, ) -> AnnData: """ diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py index b22cacc3a..9b6a6867b 100644 --- a/tests/test_read_backed_experimental.py +++ b/tests/test_read_backed_experimental.py @@ -8,7 +8,7 @@ from scipy import sparse from anndata._core.anndata import AnnData -from anndata.experimental import read_backed +from anndata.experimental import read_lazy from anndata.tests.helpers import ( AccessTrackingStore, as_dense_dask_array, @@ -39,7 +39,7 @@ def adata_remote_orig( orig_path = tmp_path_factory.mktemp(f"orig.{dskfmt}") orig = gen_adata((1000, 1000), mtx_format) orig.write_zarr(orig_path) - remote = read_backed(orig_path) + remote = read_lazy(orig_path) return remote, orig @@ -59,7 +59,7 @@ def adata_remote_with_store_tall_skinny(tmp_path_factory, mtx_format): ) orig.write_zarr(orig_path) store = AccessTrackingStore(orig_path) - remote = read_backed(store) + remote = read_lazy(store) return remote, store @@ -154,7 +154,7 @@ def test_unconsolidated(tmp_path, mtx_format): store = AccessTrackingStore(orig_pth) store.initialize_key_trackers(["obs/.zgroup", ".zgroup"]) with pytest.warns(UserWarning, match=r"Did not read zarr as consolidated"): - remote = read_backed(store) + remote = read_lazy(store) remote_to_memory = remote.to_memory() assert_equal(remote_to_memory, adata) store.assert_access_count("obs/.zgroup", 1) From 3debf9b2c166de4ae470b940297c86a4c1b62b98 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 18 Sep 2024 16:31:38 +0200 Subject: [PATCH 231/348] (fix): actually `read_backed` -> `read_lazy` --- docs/api.md | 2 +- docs/release-notes/1247.feature.md | 2 +- src/anndata/experimental/__init__.py | 4 ++-- src/anndata/experimental/backed/__init__.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/api.md b/docs/api.md index 0cee16d44..8f565e9ed 100644 --- a/docs/api.md +++ b/docs/api.md @@ -132,7 +132,7 @@ Low level methods for reading and writing elements of an {class}`AnnData` object :toctree: generated/ experimental.read_elem_lazy - experimental.read_backed + experimental.read_lazy ``` Utilities for customizing the IO process: diff --git a/docs/release-notes/1247.feature.md b/docs/release-notes/1247.feature.md index 439c5a9d6..c19ccf9fa 100644 --- a/docs/release-notes/1247.feature.md +++ b/docs/release-notes/1247.feature.md @@ -1 +1 @@ -Add {func}`~anndata.experimental.read_elem_lazy` (in place of `read_elem_as_dask`) to handle backed dataframes, sparse arrays, and dense arrays, as well as a {func}`~anndata.experimental.read_backed` to handle reading in as much of the on-disk data as possible to produce a 
{class}`~anndata.AnnData` object {user}`ilan-gold` +Add {func}`~anndata.experimental.read_elem_lazy` (in place of `read_elem_as_dask`) to handle backed dataframes, sparse arrays, and dense arrays, as well as a {func}`~anndata.experimental.read_lazy` to handle reading in as much of the on-disk data as possible to produce a {class}`~anndata.AnnData` object {user}`ilan-gold` diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 40f715539..4015747ab 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -9,7 +9,7 @@ from .._io.specs import IOSpec, read_elem_lazy from .._types import Read, ReadCallback, StorageType, Write, WriteCallback from ._dispatch_io import read_dispatched, write_dispatched -from .backed import read_backed +from .backed import read_lazy from .merge import concat_on_disk from .multi_files import AnnCollection from .pytorch import AnnLoader @@ -61,7 +61,7 @@ def __getattr__(attr_name: str) -> Any: "IOSpec", "concat_on_disk", "Read", - "read_backed", + "read_lazy", "Write", "ReadCallback", "WriteCallback", diff --git a/src/anndata/experimental/backed/__init__.py b/src/anndata/experimental/backed/__init__.py index 4239f2293..9c8acba50 100644 --- a/src/anndata/experimental/backed/__init__.py +++ b/src/anndata/experimental/backed/__init__.py @@ -1,5 +1,5 @@ from __future__ import annotations -from ._io import read_backed +from ._io import read_lazy -__all__ = ["read_backed"] +__all__ = ["read_lazy"] From 4b489880d9d34a94035a8644dcd714e71c31d85d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 18 Sep 2024 16:35:28 +0200 Subject: [PATCH 232/348] (chore): time to require `aiohttp` `fsspec` and `zarr` and `requests` --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8f6748281..77f23fc15 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,7 +107,7 @@ dev-test = ["pytest-xdist"] # local test speedups gpu = ["cupy"] cu12 = ["cupy-cuda12x"] cu11 = ["cupy-cuda11x"] -xarray = ["xarray>=2024.06.0"] +xarray = ["xarray>=2024.06.0", "aiohttp", "requests", "zarr<3.0.0a0"] [tool.hatch.version] source = "vcs" From 28c95a7fc19f5f4ce2c6b9977ba32157692644a7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 18 Sep 2024 16:37:19 +0200 Subject: [PATCH 233/348] Merge branch 'main' into ig/xarray_compat --- .pre-commit-config.yaml | 2 +- ci/scripts/min-deps.py | 2 +- ci/scripts/towncrier_automation.py | 36 ++---- docs/api.md | 1 + docs/conf.py | 11 +- .../patch_sphinx_toolbox_autoprotocol.py | 43 ------- docs/extensions/release_notes.py | 111 ------------------ docs/release-notes/1677.dev.md | 1 + pyproject.toml | 14 ++- src/anndata/abc.py | 17 ++- src/anndata/typing.py | 7 +- 11 files changed, 48 insertions(+), 197 deletions(-) delete mode 100644 docs/extensions/patch_sphinx_toolbox_autoprotocol.py delete mode 100644 docs/extensions/release_notes.py create mode 100644 docs/release-notes/1677.dev.md diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a3b108bc9..9bb40c2e0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.3 + rev: v0.6.5 hooks: - id: ruff types_or: [python, pyi, jupyter] diff --git a/ci/scripts/min-deps.py b/ci/scripts/min-deps.py index b5b0b980e..c6bac0cf4 100755 --- a/ci/scripts/min-deps.py +++ b/ci/scripts/min-deps.py @@ -1,4 +1,4 @@ -#!python3 +#!/usr/bin/env python3 from __future__ import annotations 
From 4b489880d9d34a94035a8644dcd714e71c31d85d Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 18 Sep 2024 16:35:28 +0200
Subject: [PATCH 232/348] (chore): time to require `aiohttp`, `fsspec`, `zarr`, and `requests`

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8f6748281..77f23fc15 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -107,7 +107,7 @@ dev-test = ["pytest-xdist"] # local test speedups
 gpu = ["cupy"]
 cu12 = ["cupy-cuda12x"]
 cu11 = ["cupy-cuda11x"]
-xarray = ["xarray>=2024.06.0"]
+xarray = ["xarray>=2024.06.0", "aiohttp", "requests", "zarr<3.0.0a0"]

 [tool.hatch.version]
 source = "vcs"

From 28c95a7fc19f5f4ce2c6b9977ba32157692644a7 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 18 Sep 2024 16:37:19 +0200
Subject: [PATCH 233/348] Merge branch 'main' into ig/xarray_compat

---
 .pre-commit-config.yaml                      |   2 +-
 ci/scripts/min-deps.py                       |   2 +-
 ci/scripts/towncrier_automation.py           |  36 ++----
 docs/api.md                                  |   1 +
 docs/conf.py                                 |  11 +-
 .../patch_sphinx_toolbox_autoprotocol.py     |  43 -------
 docs/extensions/release_notes.py             | 111 ------------------
 docs/release-notes/1677.dev.md               |   1 +
 pyproject.toml                               |  14 ++-
 src/anndata/abc.py                           |  17 ++-
 src/anndata/typing.py                        |   7 +-
 11 files changed, 48 insertions(+), 197 deletions(-)
 delete mode 100644 docs/extensions/patch_sphinx_toolbox_autoprotocol.py
 delete mode 100644 docs/extensions/release_notes.py
 create mode 100644 docs/release-notes/1677.dev.md

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a3b108bc9..9bb40c2e0 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.6.3
+    rev: v0.6.5
    hooks:
      - id: ruff
        types_or: [python, pyi, jupyter]
diff --git a/ci/scripts/min-deps.py b/ci/scripts/min-deps.py
index b5b0b980e..c6bac0cf4 100755
--- a/ci/scripts/min-deps.py
+++ b/ci/scripts/min-deps.py
@@ -1,4 +1,4 @@
-#!python3
+#!/usr/bin/env python3
 from __future__ import annotations

 import argparse
diff --git a/ci/scripts/towncrier_automation.py b/ci/scripts/towncrier_automation.py
index a126a028f..8fd96e3ef 100755
--- a/ci/scripts/towncrier_automation.py
+++ b/ci/scripts/towncrier_automation.py
@@ -1,11 +1,11 @@
-#!python3
+#!/usr/bin/env python3
 from __future__ import annotations

 import argparse
 import subprocess
 from typing import TYPE_CHECKING

-from packaging import version
+from packaging.version import Version

 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -40,11 +40,10 @@ def parse_args(argv: Sequence[str] | None = None) -> Args:
         action="store_true",
     )
     args = parser.parse_args(argv, Args())
-    if len(version.Version(args.version).release) != 3:
-        raise ValueError(
-            f"Version argument {args.version} must contain major, minor, and patch version."
-        )
-    version.parse(args.version)  # validate
+    # validate the version
+    if len(Version(args.version).release) != 3:
+        msg = f"Version argument {args.version} must contain major, minor, and patch version."
+        raise ValueError(msg)
     return args


@@ -77,14 +76,7 @@ def main(argv: Sequence[str] | None = None) -> None:
     # push
     if not args.dry_run:
         subprocess.run(
-            [
-                "git",
-                "push",
-                "--set-upstream",
-                "origin",
-                branch_name,
-            ],
-            check=True,
+            ["git", "push", "--set-upstream", "origin", branch_name], check=True
         )
     else:
         print("Dry run, not pushing")
@@ -95,15 +87,11 @@ def main(argv: Sequence[str] | None = None) -> None:
             "gh",
             "pr",
             "create",
-            "--base",
-            base_branch,
-            "--title",
-            pr_title,
-            "--body",
-            pr_description,
-            "--label",
-            "skip-gpu-ci",
-            *(["--label", "no milestone"] if base_branch == "main" else []),
+            f"--base={base_branch}",
+            f"--title={pr_title}",
+            f"--body={pr_description}",
+            "--label=skip-gpu-ci",
+            *(["--label=no milestone"] if base_branch == "main" else []),
             *(["--dry-run"] if args.dry_run else []),
         ],
         check=True,
diff --git a/docs/api.md b/docs/api.md
index 8f565e9ed..e2ae7bb22 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -189,6 +189,7 @@ Types used by the former:

    abc.CSRDataset
    abc.CSCDataset
+   typing.Index
    typing.AxisStorable
    typing.RWAble
 ```
diff --git a/docs/conf.py b/docs/conf.py
index c2188ab1f..4919036ab 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -62,13 +62,8 @@
     "sphinx.ext.linkcode",
     "nbsphinx",
     "IPython.sphinxext.ipython_console_highlighting",
-    "patch_sphinx_toolbox_autoprotocol",  # internal extension
     "sphinx_toolbox.more_autodoc.autoprotocol",
-    *(  # other internal extensions
-        p.stem
-        for p in _extension_dir.glob("*.py")
-        if p.stem != "patch_sphinx_toolbox_autoprotocol"
-    ),
+    *(p.stem for p in _extension_dir.glob("*.py")),
 ]
 myst_enable_extensions = [
     "html_image",  # So README.md can be used on github and sphinx docs
@@ -116,6 +111,8 @@
     ("py:class", "awkward.Array"),
     ("py:class", "anndata._core.sparse_dataset.BaseCompressedSparseDataset"),
     ("py:obj", "numpy._typing._array_like._ScalarType_co"),
+    # https://github.com/sphinx-doc/sphinx/issues/10974
+    ("py:class", "numpy.int64"),
 ]


@@ -134,7 +131,7 @@ def setup(app: Sphinx):
     scipy=("https://docs.scipy.org/doc/scipy/", None),
     sklearn=("https://scikit-learn.org/stable/", None),
     zarr=("https://zarr.readthedocs.io/en/stable/", None),
-    xarray=("https://xarray.pydata.org/en/stable/", None),
+    xarray=("https://docs.xarray.dev/en/stable/", None),
     dask=("https://docs.dask.org/en/stable/", None),
 )
 qualname_overrides = {
diff --git a/docs/extensions/patch_sphinx_toolbox_autoprotocol.py b/docs/extensions/patch_sphinx_toolbox_autoprotocol.py
deleted file mode 100644
index bafe24cc4..000000000
--- a/docs/extensions/patch_sphinx_toolbox_autoprotocol.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-from sphinx.ext.autodoc import ObjectMember
-from sphinx_toolbox.more_autodoc.autoprotocol import ProtocolDocumenter
-
-if TYPE_CHECKING:
-    from typing import Self
-
-    from sphinx.application import Sphinx
-
-
-def patch_sphinx_toolbox_autoprotocol():
-    """Compat hack: https://github.com/sphinx-toolbox/sphinx-toolbox/issues/168"""
-
-    class ObjectMemberCompat(ObjectMember):
-        @classmethod
-        def from_other(cls, other: ObjectMember) -> Self:
-            return cls(
-                other.__name__,
-                other.object,
-                docstring=other.docstring,
-                class_=other.class_,
-                skipped=other.skipped,
-            )
-
-        def __iter__(self):
-            return iter([self.__name__, self.object])
-
-    filter_orig = ProtocolDocumenter.filter_members
-
-    def filter_members(
-        self, members: list[ObjectMember], want_all: bool
-    ) -> list[tuple[str, object, bool]]:
-        member_tuples = [ObjectMemberCompat.from_other(m) for m in members]
-        return filter_orig(self, member_tuples, want_all)
-
-    ProtocolDocumenter.filter_members = filter_members
-
-
-def setup(_app: Sphinx) -> None:
-    patch_sphinx_toolbox_autoprotocol()
diff --git a/docs/extensions/release_notes.py b/docs/extensions/release_notes.py
deleted file mode 100644
index f6166c748..000000000
--- a/docs/extensions/release_notes.py
+++ /dev/null
@@ -1,111 +0,0 @@
-from __future__ import annotations
-
-import itertools
-import re
-from pathlib import Path
-from typing import TYPE_CHECKING
-
-from docutils import nodes
-from packaging.version import Version
-from sphinx.util.docutils import SphinxDirective
-
-if TYPE_CHECKING:
-    from collections.abc import Iterable, Sequence
-    from typing import ClassVar
-
-    from myst_parser.mdit_to_docutils.base import DocutilsRenderer
-    from sphinx.application import Sphinx
-
-
-FULL_VERSION_RE = re.compile(r"^(\d+)\.(\d+)\.(\d+)(?:\.dev.*)?$")
-
-
-class ReleaseNotes(SphinxDirective):
-    required_arguments: ClassVar = 1
-
-    def run(self) -> Sequence[nodes.Node]:
-        dir_ = Path(self.arguments[0])
-        # resolve relative dir
-        if not dir_.is_absolute():
-            src_file = Path(self.get_source_info()[0])
-            if not src_file.is_file():
-                msg = f"Cannot find relative path to: {src_file}"
-                raise self.error(msg)
-            dir_ = src_file.parent / self.arguments[0]
-        if not dir_.is_dir():
-            msg = f"Not a directory: {dir_}"
-            raise self.error(msg)
-
-        versions = sorted(
-            (
-                (Version(f.stem), f)
-                for f in dir_.iterdir()
-                if FULL_VERSION_RE.match(f.stem)
-            ),
-            reverse=True,  # descending
-        )
-        version_groups = itertools.groupby(
-            versions, key=lambda vf: (vf[0].major, vf[0].minor)
-        )
-        for (major, minor), versions in version_groups:
-            self.render_version_group(major, minor, versions)
-        return []
-
-    def render_version_group(
-        self, major: int, minor: int, versions: Iterable[tuple[Version, Path]]
-    ) -> None:
-        target = nodes.target(
-            ids=[f"v{major}-{minor}"],
-            names=[f"v{major}.{minor}"],
-        )
-        section = nodes.section(
-            "",
-            nodes.title("", f"Version {major}.{minor}"),
-            ids=[],
-            names=[f"version {major}.{minor}"],
-        )
-        self.state.document.note_implicit_target(section)
-        self.state.document.note_explicit_target(target)
-        # append target and section to parent
-        self.renderer.current_node.append(target)
-        self.renderer.update_section_level_state(section, 2)
-        # append children to section
-        with self.renderer.current_node_context(section):
-            for _, p in versions:
-                self.render_include(p)
-
-    def render_include(self, path: Path) -> None:
-        # hacky solution because of https://github.com/executablebooks/MyST-Parser/issues/967
-        from docutils.parsers.rst.directives.misc import Include
-        from myst_parser.mocking import MockIncludeDirective
-
-        srcfile, lineno = self.get_source_info()
-        parent_dir = Path(srcfile).parent
-
-        d = MockIncludeDirective(
-            renderer=self.renderer,
-            name=type(self).__name__,
-            klass=Include,  # type: ignore  # wrong type hint
-            arguments=[str(path.relative_to(parent_dir))],
-            options={},
-            body=[],
-            lineno=lineno,
-        )
-        d.run()
-
-        # TODO: replace the above with this once the above mentioned bug is fixed
-        # from sphinx.util.parsing import nested_parse_to_nodes
-        # return nested_parse_to_nodes(
-        #     self.state,
-        #     path.read_text(),
-        #     source=str(path),
-        #     offset=self.content_offset,
-        # )
-
-    @property
-    def renderer(self) -> DocutilsRenderer:
-        return self.state._renderer
-
-
-def setup(app: Sphinx) -> None:
-    app.add_directive("release-notes", ReleaseNotes)
diff --git a/docs/release-notes/1677.dev.md b/docs/release-notes/1677.dev.md
new file mode 100644
index 000000000..858a6ff79
--- /dev/null
+++ b/docs/release-notes/1677.dev.md
@@ -0,0 +1 @@
+Add extra `dask` dependency for installation i.e., `pip install anndata[dask]` {user}`ilan-gold`
diff --git a/pyproject.toml b/pyproject.toml
index 77f23fc15..3f01d5796 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,10 +70,10 @@ doc = [
     "sphinx-autodoc-typehints>=2.2.0",
     "sphinx-issues",
     "sphinx-copybutton",
-    "sphinx-toolbox",
+    "sphinx-toolbox>=3.8.0",
     "sphinxext.opengraph",
     "nbsphinx",
-    "scanpydoc[theme,typehints] >=0.13.6",
+    "scanpydoc[theme,typehints] >=0.14.1",
     "zarr",
     "awkward>=2.0.7",
     "IPython",  # For syntax highlighting in notebooks
@@ -84,7 +84,7 @@ doc = [
     "anndata[dev-doc]",
 ]
 dev-doc = ["towncrier>=24.8.0"]  # release notes tool
-test-full = ["anndata[test]", "anndata[xarray]"]
+test-full = ["anndata[test]", "anndata[lazy]"]
 test = [
     "loompy>=3.0.5",
     "pytest>=8.2",
@@ -97,22 +97,26 @@ test = [
     "boltons",
     "scanpy",
     "httpx",  # For data downloading
-    "dask[array,distributed]>=2022.09.2,<2024.8.0",
+    "dask[distributed]",
     "awkward>=2.3",
     "pyarrow",
     "pytest_memray",
     "pytest-mock",
+    "anndata[dask]",
 ]
 dev-test = ["pytest-xdist"]  # local test speedups
 gpu = ["cupy"]
 cu12 = ["cupy-cuda12x"]
 cu11 = ["cupy-cuda11x"]
-xarray = ["xarray>=2024.06.0", "aiohttp", "requests", "zarr<3.0.0a0"]
+# https://github.com/dask/dask/issues/11290
+dask = ["dask[array]>=2022.09.2,<2024.8.0"]
+lazy = ["xarray>=2024.06.0", "aiohttp", "requests", "zarr<3.0.0a0", "anndata[dask]"]

 [tool.hatch.version]
 source = "vcs"

 [tool.hatch.build.hooks.vcs]
 version-file = "src/anndata/_version.py"
+raw-options.version_scheme = "release-branch-semver"

 [tool.hatch.build.targets.wheel]
 packages = ["src/anndata", "src/testing"]
diff --git a/src/anndata/abc.py b/src/anndata/abc.py
index cb55dc65f..df8c8a6e8 100644
--- a/src/anndata/abc.py
+++ b/src/anndata/abc.py
@@ -9,7 +9,7 @@
     import numpy as np
     from scipy.sparse import csc_matrix, csr_matrix

-    from .compat import SpArray
+    from .compat import Index, SpArray


 __all__ = ["CSRDataset", "CSCDataset"]
@@ -30,10 +30,19 @@ class _AbstractCSDataset(ABC):
     backend: Literal["zarr", "hdf5"]
     """Which file type is used on-disk."""

-    # TODO: index type
     @abstractmethod
-    def __getitem__(self, index) -> float | csr_matrix | csc_matrix | SpArray:
-        """Load a slice or an element from the sparse dataset into memory."""
+    def __getitem__(self, index: Index) -> float | csr_matrix | csc_matrix | SpArray:
+        """Load a slice or an element from the sparse dataset into memory.
+
+        Parameters
+        ----------
+        index
+            Index to load.
+
+        Returns
+        -------
+        The desired data read off disk.
+        """

     @abstractmethod
     def to_memory(self) -> csr_matrix | csc_matrix | SpArray:
diff --git a/src/anndata/typing.py b/src/anndata/typing.py
index e447fba0f..46f5b51c7 100644
--- a/src/anndata/typing.py
+++ b/src/anndata/typing.py
@@ -19,12 +19,17 @@
     ZappyArray,
     ZarrArray,
 )
+from .compat import Index as _Index

 if TYPE_CHECKING:
     from typing import TypeAlias


-__all__ = ["RWAble", "AxisStorable"]
+__all__ = ["Index", "RWAble", "AxisStorable"]
+
+
+Index = _Index
+"""1D or 2D index an :class:`~anndata.AnnData` object can be sliced with."""


 ArrayDataStructureType: TypeAlias = Union[

From cec633aa522cf4e53c785e1d8f15c8b8308d8978 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 18 Sep 2024 16:39:19 +0200
Subject: [PATCH 234/348] (chore): update notebook

---
 docs/tutorials/notebooks | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/tutorials/notebooks b/docs/tutorials/notebooks
index 2fd5b13e2..9e186c5c6 160000
--- a/docs/tutorials/notebooks
+++ b/docs/tutorials/notebooks
@@ -1 +1 @@
-Subproject commit 2fd5b13e259c83e95dd9330cc3a93cb54d66e42c
+Subproject commit 9e186c5c694793bb04ea1397721d154d6e0b7069

From bf710d05b051d6ef8deead7ea47bd7ef978afba1 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 20 Sep 2024 15:13:54 +0200
Subject: [PATCH 235/348] (fix): actually only read `index` once

---
 src/anndata/_io/specs/lazy_methods.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 5e731e3e8..6e1affb76 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -189,7 +189,7 @@ def _gen_xarray_dict_iterator_from_elems(
     elem_dict: dict[str, LazyDataStructures],
     index_label: str,
     index_key: str,
-    index: ArrayStorageType,
+    index: np.ndarray,
 ) -> Iterator[tuple[str, xr.DataArray]]:
     from anndata.experimental.backed._compat import xr
     from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray
@@ -234,7 +234,7 @@ def read_dataframe(
     elem_name = get_elem_name(elem)
     index_label = f'{elem_name.replace("/", "")}_names'
     index_key = elem.attrs["_index"]
-    index = elem_dict[index_key]  # no sense in reading this in multiple times
+    index = elem_dict[index_key].compute()  # no sense in reading this in multiple times
     elem_xarray_dict = dict(
         _gen_xarray_dict_iterator_from_elems(elem_dict, index_label, index_key, index)
     )
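
A sketch of why the eager `.compute()` above matters; the store name is an assumption, and the behavior described follows this patch rather than any public contract:

    import zarr
    from anndata.experimental import read_elem_lazy

    obs = read_elem_lazy(zarr.open("adata.zarr")["obs"])  # illustrative path
    # the index is materialized once here and shared by every column's
    # DataArray, instead of each column lazily re-reading obs/_index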
From 1dfebde76721a7dca58a356b13038695f4ba2b63 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 23 Sep 2024 13:16:52 +0200
Subject: [PATCH 236/348] (chore): add `concat` test

---
 tests/test_read_backed_experimental.py | 27 +++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py
index 9b6a6867b..179e7d1be 100644
--- a/tests/test_read_backed_experimental.py
+++ b/tests/test_read_backed_experimental.py
@@ -7,7 +7,8 @@
 import pytest
 from scipy import sparse

-from anndata._core.anndata import AnnData
+import anndata as ad
+from anndata import AnnData
 from anndata.experimental import read_lazy
 from anndata.tests.helpers import (
     AccessTrackingStore,
@@ -158,3 +159,27 @@ def test_unconsolidated(tmp_path, mtx_format):
     remote_to_memory = remote.to_memory()
     assert_equal(remote_to_memory, adata)
     store.assert_access_count("obs/.zgroup", 1)
+
+
+def test_concat(tmp_path):
+    adatas = []
+    M = 1000
+    N = 50
+    for dataset_index in range(5):
+        orig_path = tmp_path / f"orig_{dataset_index}.zarr"
+        orig_path.mkdir()
+        obs_names = pd.Index(f"cell_{dataset_index}_{i}" for i in range(M))
+        var_names = pd.Index(f"gene_{dataset_index}_{i}" for i in range(N))
+        obs = gen_typed_df(M, obs_names)
+        var = gen_typed_df(N, var_names)
+        orig = AnnData(
+            obs=obs,
+            var=var,
+            X=sparse.csr_matrix(
+                np.random.binomial(100, 0.005, (M, N)).astype(np.float32)
+            ),
+        )
+        orig.write_zarr(orig_path)
+        store = AccessTrackingStore(orig_path)
+        adatas += read_lazy(store)
+    ad.concat(adatas)

From d355ed03f39dd92270c440592cf80a1e1beebca1 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 23 Sep 2024 13:17:00 +0200
Subject: [PATCH 237/348] (feat): add `columns` compat

---
 src/anndata/experimental/backed/_xarray.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py
index c792b4deb..e36e7c9d6 100644
--- a/src/anndata/experimental/backed/_xarray.py
+++ b/src/anndata/experimental/backed/_xarray.py
@@ -66,6 +66,18 @@
         return IlocGetter(self)

+    @property
+    def columns(self) -> pd.Index:
+        """
+        :class:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.columns` so this ensures usability
+
+        Returns
+        -------
+        :class:`pandas.Index` that represents the "columns."
+        """
+        columns_list = list(self.keys())
+        return pd.Index(columns_list)
+

 @_subset.register(Dataset2D)
 def _(a: DataArray, subset_idx: Index):

From cfae08a23443dde6b53e38be44aa16d2dd1129f8 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 23 Sep 2024 13:35:18 +0200
Subject: [PATCH 238/348] (fix): type of subset

---
 src/anndata/experimental/backed/_xarray.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py
index e36e7c9d6..d3996b247 100644
--- a/src/anndata/experimental/backed/_xarray.py
+++ b/src/anndata/experimental/backed/_xarray.py
@@ -80,7 +80,7 @@ def columns(self) -> pd.Index:

 @_subset.register(Dataset2D)
-def _(a: DataArray, subset_idx: Index):
+def _(a: Dataset2D, subset_idx: Index):
     key = get_index_dim(a)
     # xarray seems to have some code looking for a second entry in tuples
     if isinstance(subset_idx, tuple) and len(subset_idx) == 1:

From 1c46ec6ccc8c2dc7f2874cb66de3ee01346fecb9 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 23 Sep 2024 14:21:14 +0200
Subject: [PATCH 239/348] (fix): `MaskedArray` `dtype`

---
 src/anndata/experimental/backed/_lazy_arrays.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py
index 72dddc180..115d62f67 100644
--- a/src/anndata/experimental/backed/_lazy_arrays.py
+++ b/src/anndata/experimental/backed/_lazy_arrays.py
@@ -90,7 +90,6 @@ def __init__(
         self._values = ZarrOrHDF5Wrapper(values)
         self._dtype_str = dtype_str
         self.shape = self._values.shape
-        self.dtype = pd.api.types.pandas_dtype(self._values.dtype)

     def __getitem__(self, key) -> xr.core.extension_array.PandasExtensionArray:
         values = self._values[key]
@@ -107,6 +106,16 @@ def __getitem__(self, key) -> xr.core.extension_array.PandasExtensionArray:
         )
         return xr.core.extension_array.PandasExtensionArray(pd.array(values))

+    @cached_property
+    def dtype(self):
+        if self._dtype_str == "nullable-integer":
+            return pd.array(
+                [],
+                dtype=str(pd.api.types.pandas_dtype(self._values.dtype)).capitalize(),
+            ).dtype
+        elif self._dtype_str == "nullable-boolean":
+            return pd.BooleanDtype()
+

 @_subset.register(DataArray)
 def _subset_masked(a: DataArray, subset_idx: Index):
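
The `.capitalize()` trick above leans on pandas dtype naming conventions; a standalone sketch of the mapping it relies on, for illustration only:

    import numpy as np
    import pandas as pd

    # numpy dtype names are lowercase ("int64"); the nullable pandas
    # extension dtypes use the capitalized spelling ("Int64")
    name = str(pd.api.types.pandas_dtype(np.dtype("int64")))  # "int64"
    assert pd.array([], dtype=name.capitalize()).dtype == pd.Int64Dtype()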
From f49172444cddd487f3d4a2aec78d61db46d6ad18 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 23 Sep 2024 14:21:22 +0200
Subject: [PATCH 240/348] (fix): add `index.setter`

---
 src/anndata/experimental/backed/_xarray.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py
index d3996b247..b2f68f9d3 100644
--- a/src/anndata/experimental/backed/_xarray.py
+++ b/src/anndata/experimental/backed/_xarray.py
@@ -37,6 +37,11 @@ def index(self) -> pd.Index:
         coord = list(self.coords.keys())[0]
         return pd.Index(self.coords[coord].data)

+    @index.setter
+    def index(self, val) -> None:
+        coord = list(self.coords.keys())[0]
+        self.coords[coord] = val
+
     @property
     def shape(self) -> tuple[int, int]:
         """:attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.shape` so this ensures usability

From 411bd91577aec98eee13a2b06579d5e58227cca1 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 23 Sep 2024 14:21:39 +0200
Subject: [PATCH 241/348] (chore): add `concat` compat for xarray

---
 src/anndata/_core/merge.py | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py
index caf4236ef..485ace87b 100644
--- a/src/anndata/_core/merge.py
+++ b/src/anndata/_core/merge.py
@@ -41,6 +41,8 @@

     from pandas.api.extensions import ExtensionDtype

+    from anndata.experimental.backed._xarray import Dataset2D
+
 T = TypeVar("T")

 ###################
@@ -225,7 +227,9 @@ def as_cp_sparse(x) -> CupySparseMatrix:
     return cpsparse.csr_matrix(x)


-def unify_dtypes(dfs: Iterable[pd.DataFrame]) -> list[pd.DataFrame]:
+def unify_dtypes(
+    dfs: Iterable[pd.DataFrame | Dataset2D],
+) -> list[pd.DataFrame | Dataset2D]:
     """
     Attempts to unify datatypes from multiple dataframes.
@@ -1255,6 +1259,8 @@ def concat(
     >>> dict(ad.concat([a, b, c], uns_merge="first").uns)
     {'a': 1, 'b': 2, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}}
     """
+    from anndata.experimental.backed._compat import Dataset, xr
+
     # Argument normalization
     merge = resolve_merge_strategy(merge)
     uns_merge = resolve_merge_strategy(uns_merge)
@@ -1300,11 +1306,25 @@ def concat(
     # Annotation for concatenation axis
     check_combinable_cols([getattr(a, axis_name).columns for a in adatas], join=join)
-    concat_annot = pd.concat(
-        unify_dtypes(getattr(a, axis_name) for a in adatas),
-        join=join,
-        ignore_index=True,
+    annotations = [getattr(a, axis_name) for a in adatas]
+    are_any_annotations_dataframes = any(
+        isinstance(a, pd.DataFrame) for a in annotations
     )
+    are_annotations_mixed_type = are_any_annotations_dataframes and any(
+        isinstance(a, Dataset) for a in annotations
+    )
+    if are_annotations_mixed_type:
+        annotations_in_memory = annotations.copy()
+        for i, a in enumerate(annotations):
+            annotations_in_memory[i] = a.to_pandas() if isinstance(a, Dataset) else a
+    if are_any_annotations_dataframes:
+        concat_annot = pd.concat(
+            unify_dtypes(a for a in annotations_in_memory),
+            join=join,
+            ignore_index=True,
+        )
+    else:
+        concat_annot = xr.concat(annotations, join=join, dim=f"{axis_name}_names")
     concat_annot.index = concat_indices
     if label is not None:
         concat_annot[label] = label_col

From 7f89eb3e504f9586dc5e3d6540f55f359d3218ce Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 23 Sep 2024 14:27:23 +0200
Subject: [PATCH 242/348] (fix): refactor concat for in-memory

---
 src/anndata/_core/merge.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py
index 485ace87b..f3e373071 100644
--- a/src/anndata/_core/merge.py
+++ b/src/anndata/_core/merge.py
@@ -1313,11 +1313,13 @@ def concat(
     are_annotations_mixed_type = are_any_annotations_dataframes and any(
         isinstance(a, Dataset) for a in annotations
     )
-    if are_annotations_mixed_type:
-        annotations_in_memory = annotations.copy()
-        for i, a in enumerate(annotations):
-            annotations_in_memory[i] = a.to_pandas() if isinstance(a, Dataset) else a
     if are_any_annotations_dataframes:
+        annotations_in_memory = annotations.copy()
+        if are_annotations_mixed_type:
+            for i, a in enumerate(annotations):
+                annotations_in_memory[i] = (
+                    a.to_pandas() if isinstance(a, Dataset) else a
+                )
         concat_annot = pd.concat(
             unify_dtypes(a for a in annotations_in_memory),
             join=join,

From d77ba37b96ab Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 23 Sep 2024 14:27:45 +0200
Subject: [PATCH 243/348] (chore): add rest of test

---
 tests/test_read_backed_experimental.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py
index 179e7d1be..c9209abe9 100644
--- a/tests/test_read_backed_experimental.py
+++ b/tests/test_read_backed_experimental.py
@@ -161,15 +161,19 @@ def test_unconsolidated(tmp_path, mtx_format):
     store.assert_access_count("obs/.zgroup", 1)


-def test_concat(tmp_path):
+def test_concat_simple(tmp_path):
+    from anndata.experimental.backed._compat import Dataset
+
+    lazy_adatas = []
     adatas = []
     M = 1000
     N = 50
-    for dataset_index in range(5):
+    n_datasets = 2
+    for dataset_index in range(n_datasets):
         orig_path = tmp_path / f"orig_{dataset_index}.zarr"
         orig_path.mkdir()
         obs_names = pd.Index(f"cell_{dataset_index}_{i}" for i in range(M))
-        var_names = pd.Index(f"gene_{dataset_index}_{i}" for i in range(N))
+        var_names = pd.Index(f"gene_{i}" for i in range(N))
         obs = gen_typed_df(M, obs_names)
         var = gen_typed_df(N, var_names)
         orig = AnnData(
@@ -181,5 +185,11 @@ def test_concat_simple(tmp_path):
         )
         orig.write_zarr(orig_path)
         store = AccessTrackingStore(orig_path)
-        adatas += read_lazy(store)
-    ad.concat(adatas)
+        lazy_adatas += [read_lazy(store)]
+        adatas += [orig]
+    concated_remote = ad.concat(lazy_adatas)
+    assert isinstance(concated_remote.obs, Dataset)
+    df = ad.concat(adatas).obs
+    # account for differences
+    df.index.name = "obs_names"
+    assert_equal(concated_remote.obs.to_pandas(), df)

From 096f2c671592306d46c7fee15a76a70ac817257c Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 30 Sep 2024 13:13:32 +0200
Subject: [PATCH 244/348] (feat): allow for concat with masked type using dask

---
 src/anndata/_core/merge.py            | 52 ++++++++++++++++++++++++---
 src/anndata/_io/specs/lazy_methods.py | 12 +++----
 2 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py
index f3e373071..a1ead0e0c 100644
--- a/src/anndata/_core/merge.py
+++ b/src/anndata/_core/merge.py
@@ -1259,7 +1259,11 @@ def concat(
     >>> dict(ad.concat([a, b, c], uns_merge="first").uns)
     {'a': 1, 'b': 2, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}}
     """
-    from anndata.experimental.backed._compat import Dataset, xr
+    import dask.array as da
+
+    from anndata._io.specs.lazy_methods import compute_chunk_layout_for_axis_size
+    from anndata.experimental.backed._compat import xr
+    from anndata.experimental.backed._xarray import Dataset2D

     # Argument normalization
     merge = resolve_merge_strategy(merge)
@@ -1311,14 +1315,14 @@ def concat(
         isinstance(a, pd.DataFrame) for a in annotations
     )
     are_annotations_mixed_type = are_any_annotations_dataframes and any(
-        isinstance(a, Dataset) for a in annotations
+        isinstance(a, Dataset2D) for a in annotations
     )
     if are_any_annotations_dataframes:
         annotations_in_memory = annotations.copy()
         if are_annotations_mixed_type:
             for i, a in enumerate(annotations):
                 annotations_in_memory[i] = (
-                    a.to_pandas() if isinstance(a, Dataset) else a
+                    a.to_pandas() if isinstance(a, Dataset2D) else a
                 )
         concat_annot = pd.concat(
             unify_dtypes(a for a in annotations_in_memory),
@@ -1326,7 +1330,47 @@ def concat(
             ignore_index=True,
         )
     else:
-        concat_annot = xr.concat(annotations, join=join, dim=f"{axis_name}_names")
+        axis_label = f"{axis_name}_names"
+        new_annotations = []
+
+        def make_dask_col(a, col):
+            new_col = a[col].copy()
+
+            def get_chunk(block_info=None):
+                idx = tuple(
+                    slice(start, stop)
+                    for start, stop in block_info[None]["array-location"]
+                )
+                return np.array(new_col.data[idx].array)
+
+            # TODO: fix dtype
+            dtype = "object"
+            # TODO: get good chunk size?
+            return da.map_blocks(
+                get_chunk,
+                chunks=compute_chunk_layout_for_axis_size(1000, a.shape[0]),
+                meta=np.array([], dtype=dtype),
+            )
+
+        for a in annotations:
+            for col in a.columns:
+                if col != axis_label:
+                    extension_cols = []
+                    if pd.api.types.is_extension_array_dtype(a[col]):
+                        extension_cols += [col]
+            new_annotations += [
+                a.copy(
+                    data={
+                        **{col: make_dask_col(a, col) for col in extension_cols},
+                        **{
+                            col: a[col]
+                            for col in a.columns
+                            if col not in extension_cols
+                        },
+                    }
+                )
+            ]
+        concat_annot = xr.concat(new_annotations, join=join, dim=f"{axis_name}_names")
     concat_annot.index = concat_indices
     if label is not None:
         concat_annot[label] = label_col
diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 6e1affb76..82bf76c36 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -53,11 +53,11 @@ def maybe_open_h5(
 _DEFAULT_STRIDE = 1000


-def compute_chunk_layout_for_axis_shape(
-    chunk_axis_shape: int, full_axis_shape: int
+def compute_chunk_layout_for_axis_size(
+    chunk_axis_size: int, full_axis_size: int
 ) -> tuple[int, ...]:
-    n_strides, rest = np.divmod(full_axis_shape, chunk_axis_shape)
-    chunk = (chunk_axis_shape,) * n_strides
+    n_strides, rest = np.divmod(full_axis_size, chunk_axis_size)
+    chunk = (chunk_axis_size,) * n_strides
     if rest > 0:
         chunk += (rest,)
     return chunk
@@ -116,7 +116,7 @@ def read_sparse_as_dask(
     stride = chunks[major_dim]

     shape_minor, shape_major = shape if is_csc else shape[::-1]
-    chunks_major = compute_chunk_layout_for_axis_shape(stride, shape_major)
+    chunks_major = compute_chunk_layout_for_axis_size(stride, shape_major)
     chunks_minor = (shape_minor,)
     chunk_layout = (
         (chunks_minor, chunks_major) if is_csc else (chunks_major, chunks_minor)
@@ -166,7 +166,7 @@ def read_h5_array(
     )

     chunk_layout = tuple(
-        compute_chunk_layout_for_axis_shape(chunks[i], shape[i])
+        compute_chunk_layout_for_axis_size(chunks[i], shape[i])
         for i in range(len(shape))
     )
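
The renamed helper's arithmetic, worked through on example numbers; this is a re-statement for illustration, not new behavior:

    import numpy as np

    def chunk_layout(chunk_axis_size: int, full_axis_size: int) -> tuple[int, ...]:
        # same logic as compute_chunk_layout_for_axis_size above:
        # whole strides, plus one remainder chunk if needed
        n_strides, rest = np.divmod(full_axis_size, chunk_axis_size)
        chunks = (chunk_axis_size,) * int(n_strides)
        return chunks + ((int(rest),) if rest > 0 else ())

    assert chunk_layout(1000, 2500) == (1000, 1000, 500)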
From 3c7c6278904076eb952dd4516d7d44f9991fe757 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 30 Sep 2024 14:17:11 +0200
Subject: [PATCH 245/348] (refactor): own function for concat xarray

---
 src/anndata/_core/merge.py | 102 ++++++++++++++++++++-----------------
 1 file changed, 54 insertions(+), 48 deletions(-)

diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py
index a1ead0e0c..521b9016f 100644
--- a/src/anndata/_core/merge.py
+++ b/src/anndata/_core/merge.py
@@ -43,6 +43,8 @@

     from anndata.experimental.backed._xarray import Dataset2D

+    Join_T = Literal["inner", "outer"]
+
 T = TypeVar("T")

 ###################
@@ -306,7 +308,7 @@ def try_unifying_dtype(
     return None


-def check_combinable_cols(cols: list[pd.Index], join: Literal["inner", "outer"]):
+def check_combinable_cols(cols: list[pd.Index], join: Join_T):
     """Given columns for a set of dataframes, checks if the can be combined.

     Looks for if there are duplicated column names that would show up in the result.
@@ -710,9 +712,7 @@ def _apply_to_awkward(self, el: AwkArray, *, axis, fill_value=None):
         return el[self.old_idx.get_indexer(self.new_idx)]


-def merge_indices(
-    inds: Iterable[pd.Index], join: Literal["inner", "outer"]
-) -> pd.Index:
+def merge_indices(inds: Iterable[pd.Index], join: Join_T) -> pd.Index:
     if join == "inner":
         return reduce(lambda x, y: x.intersection(y), inds)
     elif join == "outer":
@@ -1058,11 +1058,59 @@ def concat_Xs(adatas, reindexers, axis, fill_value):
     return concat_arrays(Xs, reindexers, axis=axis, fill_value=fill_value)


+def concat_dataset2d_on_annot_axis(
+    annotations: Iterable[Dataset2D],
+    axis_label: Literal["obs_names", "var_names"],
+    join: Join_T,
+):
+    import dask.array as da
+
+    from anndata._io.specs.lazy_methods import compute_chunk_layout_for_axis_size
+    from anndata.experimental.backed._compat import xr
+    from anndata.experimental.backed._xarray import Dataset2D
+
+    new_annotations = []
+
+    def make_dask_col(col: xr.DataArray):
+        new_col = col.copy()
+
+        def get_chunk(block_info=None):
+            idx = tuple(
+                slice(start, stop) for start, stop in block_info[None]["array-location"]
+            )
+            return np.array(new_col.data[idx].array)
+
+        # TODO: fix dtype
+        dtype = "object"
+        # TODO: get good chunk size?
+        return da.map_blocks(
+            get_chunk,
+            chunks=compute_chunk_layout_for_axis_size(1000, a.shape[0]),
+            meta=np.array([], dtype=dtype),
+        )
+
+    for a in annotations:
+        for col in a.columns:
+            if col != axis_label:
+                extension_cols = []
+                if pd.api.types.is_extension_array_dtype(a[col]):
+                    extension_cols += [col]
+        new_annotations += [
+            a.copy(
+                data={
+                    **{col: make_dask_col(a[col]) for col in extension_cols},
+                    **{col: a[col] for col in a.columns if col not in extension_cols},
+                }
+            )
+        ]
+    return Dataset2D(xr.concat(new_annotations, join=join, dim=axis_label))
+
+
 def concat(
     adatas: Collection[AnnData] | typing.Mapping[str, AnnData],
     *,
     axis: Literal["obs", 0, "var", 1] = "obs",
-    join: Literal["inner", "outer"] = "inner",
+    join: Join_T = "inner",
     merge: StrategiesLiteral | Callable | None = None,
     uns_merge: StrategiesLiteral | Callable | None = None,
     label: str | None = None,
@@ -1259,10 +1307,7 @@ def concat(
     >>> dict(ad.concat([a, b, c], uns_merge="first").uns)
     {'a': 1, 'b': 2, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}}
     """
-    import dask.array as da
-
-    from anndata._io.specs.lazy_methods import compute_chunk_layout_for_axis_size
-    from anndata.experimental.backed._compat import xr
     from anndata.experimental.backed._xarray import Dataset2D

     # Argument normalization
@@ -1375,46 +1376,7 @@ def concat(
             ignore_index=True,
         )
     else:
         axis_label = f"{axis_name}_names"
-        new_annotations = []
-
-        def make_dask_col(a, col):
-            new_col = a[col].copy()
-
-            def get_chunk(block_info=None):
-                idx = tuple(
-                    slice(start, stop)
-                    for start, stop in block_info[None]["array-location"]
-                )
-                return np.array(new_col.data[idx].array)
-
-            # TODO: fix dtype
-            dtype = "object"
-            # TODO: get good chunk size?
-            return da.map_blocks(
-                get_chunk,
-                chunks=compute_chunk_layout_for_axis_size(1000, a.shape[0]),
-                meta=np.array([], dtype=dtype),
-            )
-
-        for a in annotations:
-            for col in a.columns:
-                if col != axis_label:
-                    extension_cols = []
-                    if pd.api.types.is_extension_array_dtype(a[col]):
-                        extension_cols += [col]
-            new_annotations += [
-                a.copy(
-                    data={
-                        **{col: make_dask_col(a, col) for col in extension_cols},
-                        **{
-                            col: a[col]
-                            for col in a.columns
-                            if col not in extension_cols
-                        },
-                    }
-                )
-            ]
-        concat_annot = xr.concat(new_annotations, join=join, dim=f"{axis_name}_names")
+        concat_annot = concat_dataset2d_on_annot_axis(annotations, axis_label, join)
     concat_annot.index = concat_indices
     if label is not None:
         concat_annot[label] = label_col
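
The `get_chunk`/`block_info` pattern factored out above, shown self-contained against an in-memory extension array; the on-disk case is analogous, and the values are arbitrary:

    import dask.array as da
    import numpy as np
    import pandas as pd

    col = pd.array([1, 2, None, 4, 5], dtype="Int64")

    def get_chunk(block_info=None):
        # dask tells each block where it lands in the full array
        ((start, stop),) = block_info[None]["array-location"]
        return np.array(col[start:stop])

    lazy = da.map_blocks(get_chunk, chunks=((2, 2, 1),), meta=np.array([], dtype=object))
    lazy.compute()  # object array; the masked value surfaces as pd.NA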
From 74b1940654b7e7e292dd1f5da633b8eeda91fc43 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 30 Sep 2024 14:48:14 +0200
Subject: [PATCH 246/348] (fix): add basic off-axis mapping without reading in, i.e., just an index

---
 src/anndata/_core/merge.py             | 23 +++++++++++++++++----
 tests/test_read_backed_experimental.py | 28 ++++++++++++++++++++++----
 2 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py
index 521b9016f..33677720e 100644
--- a/src/anndata/_core/merge.py
+++ b/src/anndata/_core/merge.py
@@ -10,7 +10,7 @@
 from functools import partial, reduce, singledispatch
 from itertools import repeat
 from operator import and_, or_, sub
-from typing import Literal, TypeVar
+from typing import Literal, TypeVar, cast
 from warnings import warn

 import numpy as np
@@ -1375,16 +1375,31 @@ def concat(
             ignore_index=True,
         )
     else:
-        axis_label = f"{axis_name}_names"
+        axis_label = cast(Literal["obs_names", "var_names"], f"{axis_name}_names")
         concat_annot = concat_dataset2d_on_annot_axis(annotations, axis_label, join)
     concat_annot.index = concat_indices
     if label is not None:
         concat_annot[label] = label_col

     # Annotation for other axis
-    alt_annot = merge_dataframes(
-        [getattr(a, alt_axis_name) for a in adatas], alt_indices, merge
+    alt_annotations = [getattr(a, alt_axis_name) for a in adatas]
+    are_any_alt_annotations_dataframes = any(
+        isinstance(a, pd.DataFrame) for a in alt_annotations
     )
+    are_alt_annotations_mixed_type = are_any_alt_annotations_dataframes and any(
+        isinstance(a, Dataset2D) for a in alt_annotations
+    )
+    if are_any_alt_annotations_dataframes:
+        alt_annotations_in_memory = alt_annotations.copy()
+        if are_alt_annotations_mixed_type:
+            for i, a in enumerate(alt_annotations):
+                alt_annotations_in_memory[i] = (
+                    a.to_pandas() if isinstance(a, Dataset2D) else a
+                )
+        alt_annot = merge_dataframes(alt_annotations, alt_indices, merge)
+    else:
+        # TODO: figure out off-axis mapping
+        alt_annot = pd.DataFrame(index=alt_indices)

     X = concat_Xs(adatas, reindexers, axis=axis, fill_value=fill_value)
diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py
index c9209abe9..9ee0c6542 100644
--- a/tests/test_read_backed_experimental.py
+++ b/tests/test_read_backed_experimental.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 from importlib.util import find_spec
+from typing import TYPE_CHECKING

 import numpy as np
 import pandas as pd
@@ -18,6 +19,9 @@
     gen_typed_df,
 )

+if TYPE_CHECKING:
+    from typing import Literal
+

 @pytest.fixture(
     params=[sparse.csr_matrix, sparse.csc_matrix, np.array, as_dense_dask_array],
@@ -158,15 +162,31 @@ def test_unconsolidated(tmp_path, mtx_format):


-def test_concat_simple(tmp_path):
+@pytest.mark.parametrize("join", ["outer", "inner"])
+def test_concat_simple(tmp_path, join: Literal["outer", "inner"]):
     from anndata.experimental.backed._compat import Dataset

     lazy_adatas = []
     adatas = []
+    stores: list[AccessTrackingStore] = []
     M = 1000
     N = 50
     n_datasets = 2
@@ -179,7 +183,7 @@ def test_concat_simple(
         orig.write_zarr(orig_path)
         store = AccessTrackingStore(orig_path)
+        store.initialize_key_trackers(["obs/int64", "var/int64"])
         lazy_adatas += [read_lazy(store)]
         adatas += [orig]
+        stores += [store]
-    concated_remote = ad.concat(lazy_adatas)
+    concated_remote = ad.concat(lazy_adatas, join=join)
     assert isinstance(concated_remote.obs, Dataset)
+    for i in range(n_datasets):
+        stores[i].assert_access_count("obs/int64", 0)
+        stores[i].assert_access_count("var/int64", 0)
-    df = ad.concat(adatas).obs
+    concatenated_memory = ad.concat(adatas, join=join)
     # account for differences
+
+    # name is lost normally, should fix
+    obs_memory = concatenated_memory.obs
+    obs_memory.index.name = "obs_names"
+
+    # remote has object dtype, need to convert back for integers booleans etc.
+    def correct_extension_dtype_differences(remote: pd.DataFrame, memory: pd.DataFrame):
+        for col in memory.columns:
+            dtype = memory[col].dtype
+            if pd.api.types.is_extension_array_dtype(dtype):
+                remote[col] = remote[col].astype(dtype)
+        return remote, memory
+
+    assert_equal(
+        *correct_extension_dtype_differences(
+            concated_remote.obs.to_pandas(), obs_memory
+        )
+    )

From a7cbbd88 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 30 Sep 2024 15:44:56 +0200
Subject: [PATCH 247/348] (feat): add merge for alt annot

---
 src/anndata/_core/merge.py             | 83 +++++++++++++++++---------
 tests/test_read_backed_experimental.py | 48 +++++++++++----
 2 files changed, 92 insertions(+), 39 deletions(-)

diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py
index 33677720e..315b8687e 100644
--- a/src/anndata/_core/merge.py
+++ b/src/anndata/_core/merge.py
@@ -1058,52 +1058,67 @@ def concat_Xs(adatas, reindexers, axis, fill_value):
     return concat_arrays(Xs, reindexers, axis=axis, fill_value=fill_value)


-def concat_dataset2d_on_annot_axis(
-    annotations: Iterable[Dataset2D],
-    axis_label: Literal["obs_names", "var_names"],
-    join: Join_T,
-):
+def make_dask_col_from_extension_dtype(col):
     import dask.array as da

     from anndata._io.specs.lazy_methods import compute_chunk_layout_for_axis_size
-    from anndata.experimental.backed._compat import xr
-    from anndata.experimental.backed._xarray import Dataset2D

-    new_annotations = []
-
-    def make_dask_col(col: xr.DataArray):
-        new_col = col.copy()
+    new_col = col.copy()

-        def get_chunk(block_info=None):
-            idx = tuple(
-                slice(start, stop) for start, stop in block_info[None]["array-location"]
-            )
-            return np.array(new_col.data[idx].array)
+    def get_chunk(block_info=None):
+        idx = tuple(
+            slice(start, stop) for start, stop in block_info[None]["array-location"]
+        )
+        return np.array(new_col.data[idx].array)

-        # TODO: fix dtype
-        dtype = "object"
-        # TODO: get good chunk size?
-        return da.map_blocks(
-            get_chunk,
-            chunks=compute_chunk_layout_for_axis_size(1000, a.shape[0]),
-            meta=np.array([], dtype=dtype),
-        )
+    # TODO: fix dtype
+    dtype = "object"
+    # TODO: get good chunk size?
+    return da.map_blocks(
+        get_chunk,
+        chunks=compute_chunk_layout_for_axis_size(1000, col.shape[0]),
+        meta=np.array([], dtype=dtype),
+    )
+
+
+def make_xarray_extension_dtypes_dask(
+    annotations: Iterable[Dataset2D],
+    axis_label: Literal["obs_names", "var_names"],
+):
+    new_annotations = []

     for a in annotations:
+        extension_cols = []
         for col in a.columns:
             if col != axis_label:
-                extension_cols = []
                 if pd.api.types.is_extension_array_dtype(a[col]):
                     extension_cols += [col]
         new_annotations += [
             a.copy(
                 data={
-                    **{col: make_dask_col(a[col]) for col in extension_cols},
+                    **{
+                        col: make_dask_col_from_extension_dtype(a[col])
+                        for col in extension_cols
+                    },
                     **{col: a[col] for col in a.columns if col not in extension_cols},
                 }
             )
         ]
-    return Dataset2D(xr.concat(new_annotations, join=join, dim=axis_label))
+    return new_annotations
+
+
+def concat_dataset2d_on_annot_axis(
+    annotations: Iterable[Dataset2D],
+    axis_label: Literal["obs_names", "var_names"],
+    join: Join_T,
+):
+    from anndata.experimental.backed._compat import xr
+    from anndata.experimental.backed._xarray import Dataset2D
+
+    annotations_with_only_dask = make_xarray_extension_dtypes_dask(
+        annotations, axis_label
+    )
+    return Dataset2D(xr.concat(annotations_with_only_dask, join=join, dim=axis_label))
@@ -1143,6 +1158,8 @@ def concat(
         * `"unique"`: Elements for which there is only one possible value.
         * `"first"`: The first element seen at each from each position.
         * `"only"`: Elements that show up in only one of the objects.
+
+        For :class:`xarray.Dataset` objects, we use their :func:`xarray.merge` with `override` to stay lazy.
     uns_merge
         How the elements of `.uns` are selected. Uses the same set of strategies as
         the `merge` argument, except applied recursively.
@@ -1308,6 +1325,7 @@ def concat(
     {'a': 1, 'b': 2, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}}
     """

+    from anndata.experimental.backed._compat import xr
     from anndata.experimental.backed._xarray import Dataset2D

     # Argument normalization
@@ -1398,8 +1416,15 @@ def concat(
         )
         alt_annot = merge_dataframes(alt_annotations, alt_indices, merge)
     else:
-        # TODO: figure out off-axis mapping
-        alt_annot = pd.DataFrame(index=alt_indices)
+        # TODO: figure out mapping of our merge to theirs instead of just taking first, although this appears to be
+        # the only "lazy" setting so I'm not sure we really want that.
+        axis_label = cast(Literal["obs_names", "var_names"], f"{alt_axis_name}_names")
+        annotations_with_only_dask = make_xarray_extension_dtypes_dask(
+            alt_annotations, axis_label
+        )
+        alt_annot = Dataset2D(
+            xr.merge(annotations_with_only_dask, join=join, compat="override")
+        )

     X = concat_Xs(adatas, reindexers, axis=axis, fill_value=fill_value)
diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py
index 9ee0c6542..b4728277b 100644
--- a/tests/test_read_backed_experimental.py
+++ b/tests/test_read_backed_experimental.py
@@ -166,12 +166,16 @@ def test_unconsolidated(tmp_path, mtx_format):


 @pytest.mark.parametrize("join", ["outer", "inner"])
-def test_concat_simple(tmp_path, join: Literal["outer", "inner"]):
+@pytest.mark.parametrize("are_vars_different", [True, False])
+def test_concat_simple(
+    tmp_path, join: Literal["outer", "inner"], are_vars_different: bool
+):
     from anndata.experimental.backed._compat import Dataset

     lazy_adatas = []
     adatas = []
     stores: list[AccessTrackingStore] = []
+    var_indices = []
     M = 1000
     N = 50
     n_datasets = 2
@@ -179,7 +183,11 @@ def test_concat_simple(
         orig_path = tmp_path / f"orig_{dataset_index}.zarr"
         orig_path.mkdir()
         obs_names = pd.Index(f"cell_{dataset_index}_{i}" for i in range(M))
-        var_names = pd.Index(f"gene_{i}" for i in range(N))
+        var_names = pd.Index(
+            f"gene_{i}{f'_{dataset_index}_ds' if are_vars_different and (i % 2) else ''}"
+            for i in range(N)
+        )
+        var_indices.append(var_names)
         obs = gen_typed_df(M, obs_names)
         var = gen_typed_df(N, var_names)
         orig = AnnData(
@@ -200,16 +208,36 @@ def test_concat_simple(
     assert_equal(
         *correct_extension_dtype_differences(
             concated_remote.obs.to_pandas(), obs_memory
         )
     )
+    # check non-different variables, taken from first annotation. all others are null so incomparable
+    pd_index = pd.Index(filter(lambda x: not x.endswith("ds"), var_indices[0]))
+    var_df = adatas[0][:, pd_index].var.copy()
+    var_df.index.name = "var_names"
+    remote_df_corrected, _ = correct_extension_dtype_differences(
+        concated_remote[:, pd_index].var.to_pandas(), var_df
+    )
+    # TODO: xr.merge always upcasts to float due to NA and you can't downcast?
+    for col in remote_df_corrected.columns:
+        dtype = remote_df_corrected[col].dtype
+        if dtype in [np.float64, np.float32]:
+            var_df[col] = var_df[col].astype(dtype)
+    assert_equal(remote_df_corrected, var_df)

From af4520a7ba002071d90e5d1be5cbba4160bc1461 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 30 Sep 2024 15:46:16 +0200
Subject: [PATCH 248/348] (fix): notebook

---
 docs/tutorials/notebooks | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/tutorials/notebooks b/docs/tutorials/notebooks
index 9e186c5c6..1f572cf23 160000
--- a/docs/tutorials/notebooks
+++ b/docs/tutorials/notebooks
@@ -1 +1 @@
-Subproject commit 9e186c5c694793bb04ea1397721d154d6e0b7069
+Subproject commit 1f572cf230a4cd88e0abab7c990a5af6a0fb52af

From eefbee62aafe098d9a68c6c3b629c2a6b17e6c22 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 30 Sep 2024 16:13:23 +0200
Subject: [PATCH 249/348] (fix): NAs in sparse dask

---
 src/anndata/_core/merge.py             |  9 ++++++++-
 tests/test_concatenate.py              | 23 +++++++++++++++++++++++
 tests/test_read_backed_experimental.py |  2 ++
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py
index 315b8687e..421e781fb 100644
--- a/src/anndata/_core/merge.py
+++ b/src/anndata/_core/merge.py
@@ -727,7 +727,14 @@ def default_fill_value(els):

     This is largely due to backwards compat, and might not be the ideal solution.
     """
-    if any(isinstance(el, (sparse.spmatrix, SpArray)) for el in els):
+    if any(
+        isinstance(el, (sparse.spmatrix, SpArray))
+        or (
+            isinstance(el, DaskArray)
+            and isinstance(el._meta, (sparse.spmatrix, SpArray))
+        )
+        for el in els
+    ):
         return 0
     else:
         return np.nan
diff --git a/tests/test_concatenate.py b/tests/test_concatenate.py
index 08fe2cd44..f1f585e5c 100644
--- a/tests/test_concatenate.py
+++ b/tests/test_concatenate.py
@@ -1533,6 +1533,29 @@ def test_concat_different_types_dask(merge_strategy, array_type):
     assert_equal(result2, target2)


+def test_concat_dask_sparse_matches_memory(join_type, merge_strategy):
+    import dask.array as da
+    from scipy import sparse
+
+    import anndata as ad
+
+    X = sparse.random(50, 20, density=0.5, format="csr")
+    X_dask = da.from_array(X, chunks=(5, 20))
+    var_names_1 = [f"gene_{i}" for i in range(20)]
+    var_names_2 = [f"gene_{i}{'_foo' if (i%2) else ''}" for i in range(20, 40)]
+
+    ad1 = ad.AnnData(X=X, var=pd.DataFrame(index=var_names_1))
+    ad2 = ad.AnnData(X=X, var=pd.DataFrame(index=var_names_2))
+
+    ad1_dask = ad.AnnData(X=X_dask, var=pd.DataFrame(index=var_names_1))
+    ad2_dask = ad.AnnData(X=X_dask, var=pd.DataFrame(index=var_names_2))
+
+    res_in_memory = ad.concat([ad1, ad2], join=join_type, merge=merge_strategy)
+    res_dask = ad.concat([ad1_dask, ad2_dask], join=join_type, merge=merge_strategy)
+
+    assert_equal(res_in_memory, res_dask)
+
+
 def test_outer_concat_with_missing_value_for_df():
     # https://github.com/scverse/anndata/issues/901
     # TODO: Extend this test to cover all cases of missing values
diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py
index b4728277b..f95baea15 100644
--- a/tests/test_read_backed_experimental.py
+++ b/tests/test_read_backed_experimental.py
@@ -241,3 +241,5 @@ def test_concat_simple(
         if dtype in [np.float64, np.float32]:
             var_df[col] = var_df[col].astype(dtype)
     assert_equal(remote_df_corrected, var_df)
+
+    assert_equal(concated_remote.X, concatenated_memory.X)
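
What the new `_meta` check above buys, sketched on its own; the shapes and density mirror the test but are otherwise arbitrary:

    import dask.array as da
    from scipy import sparse

    X_dask = da.from_array(sparse.random(50, 20, density=0.5, format="csr"), chunks=(5, 20))

    # the meta records the chunk container, so dask-wrapped sparse data
    # keeps the sparse default fill value of 0 rather than np.nan
    assert isinstance(X_dask._meta, sparse.spmatrix)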
From fa7358ff86f96ee44fc6f8782883efe16f2870f0 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 30 Sep 2024 16:14:04 +0200
Subject: [PATCH 250/348] (chore): add `X` tracker

---
 tests/test_read_backed_experimental.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py
index f95baea15..335c1d1c3 100644
--- a/tests/test_read_backed_experimental.py
+++ b/tests/test_read_backed_experimental.py
@@ -199,7 +199,7 @@ def test_concat_simple(
         )
         orig.write_zarr(orig_path)
         store = AccessTrackingStore(orig_path)
-        store.initialize_key_trackers(["obs/int64", "var/int64"])
+        store.initialize_key_trackers(["obs/int64", "var/int64", "X"])
         lazy_adatas += [read_lazy(store)]
         adatas += [orig]
         stores += [store]
@@ -207,6 +207,7 @@ def test_concat_simple(
     assert isinstance(concated_remote.obs, Dataset)
     for i in range(n_datasets):
         stores[i].assert_access_count("obs/int64", 0)
+        stores[i].assert_access_count("X", 0)
         stores[i].assert_access_count("var/int64", 0)
     concatenated_memory = ad.concat(adatas, join=join)
     # account for differences

From 4db8c1b9a45981765c2ac0a925d8a814dc92b145 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 30 Sep 2024 16:15:57 +0200
Subject: [PATCH 251/348] (chore): add more robust matrix type tests

---
 tests/test_read_backed_experimental.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py
index 335c1d1c3..5c41933b5 100644
--- a/tests/test_read_backed_experimental.py
+++ b/tests/test_read_backed_experimental.py
@@ -168,7 +168,7 @@ def test_unconsolidated(tmp_path, mtx_format):
 @pytest.mark.parametrize("join", ["outer", "inner"])
 @pytest.mark.parametrize("are_vars_different", [True, False])
 def test_concat_simple(
-    tmp_path, join: Literal["outer", "inner"], are_vars_different: bool
+    tmp_path, join: Literal["outer", "inner"], are_vars_different: bool, mtx_format
 ):
     from anndata.experimental.backed._compat import Dataset

@@ -193,9 +193,7 @@ def test_concat_simple(
         orig = AnnData(
             obs=obs,
             var=var,
-            X=sparse.csr_matrix(
-                np.random.binomial(100, 0.005, (M, N)).astype(np.float32)
-            ),
+            X=mtx_format(np.random.binomial(100, 0.005, (M, N)).astype(np.float32)),
         )
         orig.write_zarr(orig_path)
         store = AccessTrackingStore(orig_path)

From eefbee62aafe098d9a68c6c3b629c2a6b17e6c22 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 30 Sep 2024 16:23:23 +0200
Subject: [PATCH 252/348] (fix): ok now notebooks?

---
 docs/tutorials/notebooks | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/tutorials/notebooks b/docs/tutorials/notebooks
index 1f572cf23..37efc5077 160000
--- a/docs/tutorials/notebooks
+++ b/docs/tutorials/notebooks
@@ -1 +1 @@
-Subproject commit 1f572cf230a4cd88e0abab7c990a5af6a0fb52af
+Subproject commit 37efc507729b91f9969cb955042183e6d4d1eef2

From 39f28381594cc0f6d5422dd3cc94403882bc06c9 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 1 Oct 2024 11:31:36 +0200
Subject: [PATCH 253/348] (feat): fix additional index load

---
 src/anndata/_io/specs/lazy_methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 24100f657..809fcc238 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -210,7 +210,7 @@ def _gen_xarray_dict_iterator_from_elems(
         )
         elif k == index_key:
             data_array = xr.DataArray(
-                v, coords=[v], dims=[index_label], name=index_label
+                index, coords=[index], dims=[index_label], name=index_label
             )
             data_array_name = index_label
         else:

From dcca71136e5ce56b85ea3e2dd3ee2baa9b255b5c Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 1 Oct 2024 11:32:18 +0200
Subject: [PATCH 254/348] (feat): no-load index

---
 src/anndata/_io/specs/lazy_methods.py      | 26 ++++++++++++++++++----
 src/anndata/_io/specs/registry.py          | 20 ++++++++++++++---
 src/anndata/experimental/backed/_io.py     |  6 +++++
 src/anndata/experimental/backed/_xarray.py |  2 +-
 src/anndata/tests/helpers.py               |  6 ++++-
 tests/test_read_backed_experimental.py     | 15 ++++++++++++-
 6 files changed, 65 insertions(+), 10 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 809fcc238..f90fd96b2 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -7,6 +7,7 @@

 import h5py
 import numpy as np
+import pandas as pd
 from scipy import sparse

 import anndata as ad
@@ -185,6 +186,9 @@ def read_zarr_array(
     return da.from_zarr(elem, chunks=chunks)


+DUMMY_RANGE_INDEX_KEY = "_anndata_dummy_range_index"
+
+
 def _gen_xarray_dict_iterator_from_elems(
     elem_dict: dict[str, LazyDataStructures],
     index_label: str,
@@ -216,6 +220,11 @@ def _gen_xarray_dict_iterator_from_elems(
     else:
         raise ValueError(f"Could not read {k}: {v} from into xarray Dataset2D")
     yield data_array_name, data_array
+    if index_key == DUMMY_RANGE_INDEX_KEY:
+        yield (
+            index_label,
+            xr.DataArray(index, coords=[index], dims=[index_label], name=index_label),
+        )


 @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0"))
@@ -224,6 +233,7 @@ def read_dataframe(
     elem: H5Group | ZarrGroup,
     *,
     _reader: LazyReader,
+    use_range_index: bool = False,
 ) -> Dataset2D:
     from anndata.experimental.backed._xarray import Dataset2D

@@ -232,13 +242,21 @@ def read_dataframe(
         for k in [*elem.attrs["column-order"], elem.attrs["_index"]]
     }
     elem_name = get_elem_name(elem)
-    index_label = f'{elem_name.replace("/", "")}_names'
-    index_key = elem.attrs["_index"]
-    index = elem_dict[index_key].compute()  # no sense in reading this in multiple times
+    label_based_indexing_key = f'{elem_name.replace("/", "")}_names'
+    if not use_range_index:
+        index_label = label_based_indexing_key
+        index_key = elem.attrs["_index"]
+        index = elem_dict[
+            index_key
+        ].compute()  # no sense in reading this in multiple times
+    else:
+        index_label = DUMMY_RANGE_INDEX_KEY
+        index_key = DUMMY_RANGE_INDEX_KEY
+        index = pd.RangeIndex(len(elem_dict[elem.attrs["_index"]]))
     elem_xarray_dict = dict(
         _gen_xarray_dict_iterator_from_elems(elem_dict, index_label, index_key, index)
     )
-    return Dataset2D(elem_xarray_dict)
+    return Dataset2D(elem_xarray_dict, attrs={"indexing_key": label_based_indexing_key})


 @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0"))
diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py
index 14e9511a8..9b3001066 100644
--- a/src/anndata/_io/specs/registry.py
+++ b/src/anndata/_io/specs/registry.py
@@ -293,6 +293,7 @@ def read_elem(
         elem: StorageType,
         modifiers: frozenset[str] = frozenset(),
         chunks: tuple[int, ...] | None = None,
+        **kwargs,
     ) -> LazyDataStructures:
         """Read a dask element from a store. See exported function for more details."""

@@ -303,8 +304,21 @@ def read_elem(
         if self.callback is not None:
             msg = "Dask reading does not use a callback. Ignoring callback."
             warnings.warn(msg, stacklevel=2)
+        read_params = inspect.signature(read_func).parameters
+        has_extra_args = False
+        for kwarg in kwargs:
+            if kwarg not in read_params:
+                msg = (
+                    f"Keyword argument {kwarg} passed to read_elem_lazy is not supported by the "
+                    "registered read function."
+                )
+                raise ValueError(msg)
+            has_extra_args = True
         if "chunks" in inspect.signature(read_func).parameters:
-            return read_func(elem, chunks=chunks)
+            has_extra_args = True
+            kwargs["chunks"] = chunks
+        if has_extra_args:
+            return read_func(elem, **kwargs)
         return read_func(elem)


@@ -385,7 +399,7 @@ def read_elem(elem: StorageType) -> RWAble:


 def read_elem_lazy(
-    elem: StorageType, chunks: tuple[int, ...] | None = None
+    elem: StorageType, chunks: tuple[int, ...] | None = None, **kwargs
 ) -> LazyDataStructures:
     """
     Read an element from a store lazily.
@@ -406,7 +420,7 @@ def read_elem_lazy(
     -------
     DaskArray
     """
-    return LazyReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks)
+    return LazyReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks, **kwargs)
diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py
index b339290e3..610cd2a26 100644
--- a/src/anndata/experimental/backed/_io.py
+++ b/src/anndata/experimental/backed/_io.py
@@ -21,6 +21,7 @@

 def read_lazy(
     store: str | Path | MutableMapping | ZarrGroup | h5py.Dataset,
+    use_range_index: bool = False,
 ) -> AnnData:
     """
     Lazily read in on-disk/in-cloud AnnData stores, including `obs` and `var`.
@@ -30,6 +31,9 @@ def read_lazy(
     ----------
     store
         A store-like object to be read in.  If :class:`zarr.hierarchy.Group`, it is best for it to be consolidated.
+    use_range_index
+        Whether or not to use a range index for the :class:`xr.Dataset` so as not to load the `index` into memory.
+        If `True`, the real `index` will be loaded as `{obs,var}_names` in the object but not be one of the `coords`.

     Returns
     -------
@@ -89,6 +93,8 @@ def callback(func, elem_name: str, elem, iospec):
             }
             or "nullable" in iospec.encoding_type
         ):
+            if "dataframe" == iospec.encoding_type:
+                return read_elem_lazy(elem, use_range_index=use_range_index)
             return read_elem_lazy(elem)
         elif iospec.encoding_type in {"awkward-array"}:
             return read_dispatched(elem, None)
diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py
index b2f68f9d3..8c1edde56 100644
--- a/src/anndata/experimental/backed/_xarray.py
+++ b/src/anndata/experimental/backed/_xarray.py
@@ -86,7 +86,7 @@

 @_subset.register(Dataset2D)
 def _(a: Dataset2D, subset_idx: Index):
-    key = get_index_dim(a)
+    key = a.attrs["indexing_key"]
     # xarray seems to have some code looking for a second entry in tuples
     if isinstance(subset_idx, tuple) and len(subset_idx) == 1:
         subset_idx = subset_idx[0]
diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py
index dce089a65..2ae0fdd17 100644
--- a/src/anndata/tests/helpers.py
+++ b/src/anndata/tests/helpers.py
@@ -1092,7 +1092,11 @@ def reset_key_trackers(self):
         self.initialize_key_trackers(self._access_count.keys())

     def assert_access_count(self, key: str, count: int):
-        assert self.get_access_count(key) == count, self.get_subkeys_accessed(key)
+        keys_accessed = self.get_subkeys_accessed(key)
+        access_count = self.get_access_count(key)
+        assert (
+            self.get_access_count(key) == count
+        ), f"Found {access_count} accesses at {keys_accessed}"


 def get_multiindex_columns_df(shape: tuple[int, int]) -> pd.DataFrame:
diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py
index 5c41933b5..98e7b82e3 100644
--- a/tests/test_read_backed_experimental.py
+++ b/tests/test_read_backed_experimental.py
@@ -49,7 +49,9 @@ def adata_remote_orig(


 @pytest.fixture
-def adata_remote_with_store_tall_skinny(tmp_path_factory, mtx_format):
+def adata_remote_with_store_tall_skinny(
+    tmp_path_factory, mtx_format
+) -> tuple[AnnData, AccessTrackingStore]:
     orig_path = tmp_path_factory.mktemp("orig.zarr")
     M = 1000000  # forces zarr to chunk `obs` columns multiple ways - that way 1 access to `int64` below is actually only one access
     N = 5
@@ -104,6 +106,17 @@ def test_access_count_obs_var(adata_remote_with_store_tall_skinny):
     store.assert_access_count("var/int64", 0)


+def test_access_count_index(adata_remote_with_store_tall_skinny):
+    _, store = adata_remote_with_store_tall_skinny
+    store.reset_key_trackers()
+    store.initialize_key_trackers(["obs/_index"])
+    read_lazy(store, use_range_index=True)
+    store.assert_access_count("obs/_index", 0)
+    read_lazy(store)
+    # 16 is number of chunks
+    store.assert_access_count("obs/_index", 16)
+
+
 def test_access_count_dtype(adata_remote_with_store_tall_skinny):
     remote, store = adata_remote_with_store_tall_skinny
     store.initialize_key_trackers(["obs/cat/categories"])
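
A sketch of the new flag's effect at this point in the series (it is renamed in a later patch); the store path is illustrative:

    from anndata.experimental import read_lazy

    adata = read_lazy("adata.zarr", use_range_index=True)  # obs/_index is never read
    adata = read_lazy("adata.zarr")                        # default: real index, one read per chunk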
If :class:`zarr.hierarchy.Group`, it is best for it to be consolidated. use_range_index - Whether or not to use a range index for the :class:`xr.Dataset` so as not to load the `index` into memory. + Whether or not to use a range index for the `{obs,var}` :class:`xr.Dataset` so as not to load the `index` into memory. If `True`, the real `index` will be loaded as `{obs,var}_names` in the object but not be one of the `coords`. Returns @@ -93,7 +93,7 @@ def callback(func, elem_name: str, elem, iospec): } or "nullable" in iospec.encoding_type ): - if "dataframe" == iospec.encoding_type: + if "dataframe" == iospec.encoding_type and elem_name in {"obs", "var"}: return read_elem_lazy(elem, use_range_index=use_range_index) return read_elem_lazy(elem) elif iospec.encoding_type in {"awkward-array"}: From b25e8bab6f940de99c82f37feb01ee642507d2c3 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Oct 2024 12:18:24 +0200 Subject: [PATCH 256/348] (chore): add range index testing --- tests/test_read_backed_experimental.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py index 98e7b82e3..9c7c4cd72 100644 --- a/tests/test_read_backed_experimental.py +++ b/tests/test_read_backed_experimental.py @@ -37,15 +37,20 @@ def dskfmt(request): return request.param +@pytest.fixture(params=[True, False], scope="session") +def use_range_index(request): + return request.param + + @pytest.fixture(scope="session") def adata_remote_orig( - tmp_path_factory, dskfmt: str, mtx_format + tmp_path_factory, dskfmt: str, mtx_format, use_range_index: bool ) -> tuple[AnnData, AnnData]: + """Create remote fixtures, one without a range index and the other with""" orig_path = tmp_path_factory.mktemp(f"orig.{dskfmt}") orig = gen_adata((1000, 1000), mtx_format) orig.write_zarr(orig_path) - remote = read_lazy(orig_path) - return remote, orig + return read_lazy(orig_path, use_range_index=use_range_index), orig @pytest.fixture From 870a4f28784ac818ec03226fc53b96f31c795291 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Oct 2024 12:18:57 +0200 Subject: [PATCH 257/348] (fix): ensure `{obs,var}_names` always exists on dataset --- src/anndata/_io/specs/lazy_methods.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index f90fd96b2..39cfca216 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -256,7 +256,10 @@ def read_dataframe( elem_xarray_dict = dict( _gen_xarray_dict_iterator_from_elems(elem_dict, index_label, index_key, index) ) - return Dataset2D(elem_xarray_dict, attrs={"indexing_key": label_based_indexing_key}) + ds = Dataset2D(elem_xarray_dict, attrs={"indexing_key": label_based_indexing_key}) + if use_range_index: + return ds.rename_vars({elem.attrs["_index"]: label_based_indexing_key}) + return ds @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) From c2681e013a05754badfaf84685fcd7edacc4329d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Oct 2024 12:19:22 +0200 Subject: [PATCH 258/348] (fix): only use `use_range_index` on `{obs,var}` --- src/anndata/experimental/backed/_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index 682f9e7d5..1bc588c21 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ 
-93,7 +93,7 @@ def callback(func, elem_name: str, elem, iospec): } or "nullable" in iospec.encoding_type ): - if "dataframe" == iospec.encoding_type and elem_name in {"obs", "var"}: + if "dataframe" == iospec.encoding_type and elem_name in {"/obs", "/var"}: return read_elem_lazy(elem, use_range_index=use_range_index) return read_elem_lazy(elem) elif iospec.encoding_type in {"awkward-array"}: From e7a915b38752234a451a86b0a6e0e7dffc93a983 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Oct 2024 12:32:34 +0200 Subject: [PATCH 259/348] (fix): don't check uniqueness to prevent index load + rename variable --- src/anndata/experimental/backed/_io.py | 14 ++++++++------ tests/test_read_backed_experimental.py | 8 ++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index 1bc588c21..427ecf77b 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -11,6 +11,7 @@ from anndata._io.specs.registry import read_elem_lazy from ..._core.anndata import AnnData +from ..._settings import settings from .. import read_dispatched if TYPE_CHECKING: @@ -21,7 +22,7 @@ def read_lazy( store: str | Path | MutableMapping | ZarrGroup | h5py.Dataset, - use_range_index: bool = False, + load_annotation_index: bool = True, ) -> AnnData: """ Lazily read in on-disk/in-cloud AnnData stores, including `obs` and `var`. @@ -31,9 +32,9 @@ def read_lazy( ---------- store A store-like object to be read in. If :class:`zarr.hierarchy.Group`, it is best for it to be consolidated. - use_range_index - Whether or not to use a range index for the `{obs,var}` :class:`xr.Dataset` so as not to load the `index` into memory. - If `True`, the real `index` will be loaded as `{obs,var}_names` in the object but not be one of the `coords`. + load_annotation_index + Whether or not to use a range index for the `{obs,var}` :class:`xr.Dataset` so as not to load the index into memory. + If `False`, the real `index` will be inserted as `{obs,var}_names` in the object but not be one of the `coords` thereby preventing read operations. 
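+
+        A rough usage sketch (the store path here is hypothetical, not something
+        this patch ships)::
+
+            import anndata as ad
+
+            adata = ad.experimental.read_lazy("adata.zarr", load_annotation_index=False)
+            adata.obs.index  # dummy range index, no store reads
+            adata.obs["obs_names"]  # the real on-disk index, still lazy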
Returns ------- @@ -94,12 +95,13 @@ def callback(func, elem_name: str, elem, iospec): or "nullable" in iospec.encoding_type ): if "dataframe" == iospec.encoding_type and elem_name in {"/obs", "/var"}: - return read_elem_lazy(elem, use_range_index=use_range_index) + return read_elem_lazy(elem, use_range_index=not load_annotation_index) return read_elem_lazy(elem) elif iospec.encoding_type in {"awkward-array"}: return read_dispatched(elem, None) return func(elem) - adata = read_dispatched(f, callback=callback) + with settings.override(check_uniqueness=load_annotation_index): + adata = read_dispatched(f, callback=callback) return adata diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py index 9c7c4cd72..bf707ef06 100644 --- a/tests/test_read_backed_experimental.py +++ b/tests/test_read_backed_experimental.py @@ -38,19 +38,19 @@ def dskfmt(request): @pytest.fixture(params=[True, False], scope="session") -def use_range_index(request): +def load_annotation_index(request): return request.param @pytest.fixture(scope="session") def adata_remote_orig( - tmp_path_factory, dskfmt: str, mtx_format, use_range_index: bool + tmp_path_factory, dskfmt: str, mtx_format, load_annotation_index: bool ) -> tuple[AnnData, AnnData]: """Create remote fixtures, one without a range index and the other with""" orig_path = tmp_path_factory.mktemp(f"orig.{dskfmt}") orig = gen_adata((1000, 1000), mtx_format) orig.write_zarr(orig_path) - return read_lazy(orig_path, use_range_index=use_range_index), orig + return read_lazy(orig_path, load_annotation_index=load_annotation_index), orig @pytest.fixture @@ -115,7 +115,7 @@ def test_access_count_index(adata_remote_with_store_tall_skinny): _, store = adata_remote_with_store_tall_skinny store.reset_key_trackers() store.initialize_key_trackers(["obs/_index"]) - read_lazy(store, use_range_index=True) + read_lazy(store, load_annotation_index=False) store.assert_access_count("obs/_index", 0) read_lazy(store) # 16 is number of chunks From f93499def1dad40f814d9ef99437c27b93e24989 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Oct 2024 12:32:50 +0200 Subject: [PATCH 260/348] (fix): check for presence of indexing key --- src/anndata/experimental/backed/_xarray.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py index 8c1edde56..17dfafb75 100644 --- a/src/anndata/experimental/backed/_xarray.py +++ b/src/anndata/experimental/backed/_xarray.py @@ -34,6 +34,9 @@ def index(self) -> pd.Index: ------- The index of the of the dataframe as resolved from :attr:`~xarray.Dataset.coords`. 
""" + if "indexing_key" in self.attrs: + key = self.attrs["indexing_key"] + return pd.Index(self[key].data) coord = list(self.coords.keys())[0] return pd.Index(self.coords[coord].data) @@ -120,5 +123,9 @@ def _remove_unused_categories_xr( @to_memory.register(Dataset2D) def to_memory(ds: Dataset2D, copy=False): df = ds.to_dataframe() + if "indexing_key" in ds.attrs: + index_key = ds.attrs["indexing_key"] + if df.index.name != index_key: + df.set_index(index_key, inplace=True) df.index.name = None # matches old AnnData object return df From 63a4515c72cb90c6f1daf21a64b7d498a7a12f8d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Oct 2024 13:00:36 +0200 Subject: [PATCH 261/348] always return `index` object --- src/anndata/experimental/backed/_xarray.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py index 17dfafb75..a42ca59b2 100644 --- a/src/anndata/experimental/backed/_xarray.py +++ b/src/anndata/experimental/backed/_xarray.py @@ -22,7 +22,7 @@ def get_index_dim(ds: DataArray) -> Hashable: assert ( len(ds.sizes) == 1 ), f"xarray Dataset should not have more than 1 dims, found {len(ds)}" - return list(ds.sizes.keys())[0] + return list(ds.indexes.keys())[0] class Dataset2D(Dataset): @@ -34,15 +34,12 @@ def index(self) -> pd.Index: ------- The index of the of the dataframe as resolved from :attr:`~xarray.Dataset.coords`. """ - if "indexing_key" in self.attrs: - key = self.attrs["indexing_key"] - return pd.Index(self[key].data) - coord = list(self.coords.keys())[0] - return pd.Index(self.coords[coord].data) + coord = get_index_dim(self) + return self.indexes[coord] @index.setter def index(self, val) -> None: - coord = list(self.coords.keys())[0] + coord = get_index_dim(self) self.coords[coord] = val @property From b1c8c22b6528873c64ce6e2740a7ee6350742f8d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Oct 2024 13:01:55 +0200 Subject: [PATCH 262/348] (fix): remove unnecessary check? --- src/anndata/_core/index.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/anndata/_core/index.py b/src/anndata/_core/index.py index b17d8f1d7..72a075456 100644 --- a/src/anndata/_core/index.py +++ b/src/anndata/_core/index.py @@ -52,10 +52,11 @@ def _normalize_index( ) -> slice | int | np.ndarray: # ndarray of int or bool from ..experimental.backed._compat import xr - if not isinstance(index, pd.RangeIndex): - msg = "Don’t call _normalize_index with non-categorical/string names" - assert index.dtype != float, msg - assert index.dtype != int, msg + # TODO: why is this here? All tests pass without it and it seems at the minimum not strict enough. 
+ # if not isinstance(index, pd.RangeIndex): + # msg = "Don’t call _normalize_index with non-categorical/string names and non-range index" + # assert index.dtype != float, msg + # assert index.dtype != int, msg # the following is insanely slow for sequences, # we replaced it using pandas below From d082a353e1790cb37f3241a23d27202a27eeffc0 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Oct 2024 13:04:57 +0200 Subject: [PATCH 263/348] (chore): update notebook --- docs/tutorials/notebooks | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/notebooks b/docs/tutorials/notebooks index 37efc5077..9b5990755 160000 --- a/docs/tutorials/notebooks +++ b/docs/tutorials/notebooks @@ -1 +1 @@ -Subproject commit 37efc507729b91f9969cb955042183e6d4d1eef2 +Subproject commit 9b59907550a25af615424a9f6a645bc034b82eeb From e81d155d0b1faa5ee90baf29154ed48c5aa6a042 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Oct 2024 13:58:27 +0200 Subject: [PATCH 264/348] (fix): docstring class --- src/anndata/experimental/backed/_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index 427ecf77b..018e9a5c5 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -33,7 +33,7 @@ def read_lazy( store A store-like object to be read in. If :class:`zarr.hierarchy.Group`, it is best for it to be consolidated. load_annotation_index - Whether or not to use a range index for the `{obs,var}` :class:`xr.Dataset` so as not to load the index into memory. + Whether or not to use a range index for the `{obs,var}` :class:`xarray.Dataset` so as not to load the index into memory. If `False`, the real `index` will be inserted as `{obs,var}_names` in the object but not be one of the `coords` thereby preventing read operations. Returns From c4d014659993c413b4c173e6c70c27a335492ff5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Oct 2024 14:25:03 +0200 Subject: [PATCH 265/348] (fix): explicit 1d chunking for `concat` --- src/anndata/_core/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 421e781fb..f1fa27d13 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -1083,7 +1083,7 @@ def get_chunk(block_info=None): # TODO: get good chunk size? 
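    # Illustration (the numbers are hypothetical, nothing here is asserted by
    # the patch): for col.shape[0] == 2_500 and the hard-coded size of 1_000,
    # the layout helper should yield (1000, 1000, 500). dask expects one such
    # tuple per dimension, hence the extra `(...,)` wrapping for this 1-D
    # column, and hands each block's (start, stop) to `get_chunk` via `block_info`.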
return da.map_blocks( get_chunk, - chunks=compute_chunk_layout_for_axis_size(1000, col.shape[0]), + chunks=(compute_chunk_layout_for_axis_size(1000, col.shape[0]),), meta=np.array([], dtype=dtype), ) From b34ac0ab9718c60bacb29e7ad19bb1b1feaa96a8 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Oct 2024 14:33:10 +0200 Subject: [PATCH 266/348] (chore): change `concat` test name --- tests/test_read_backed_experimental.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_backed_experimental.py index bf707ef06..65470295e 100644 --- a/tests/test_read_backed_experimental.py +++ b/tests/test_read_backed_experimental.py @@ -185,7 +185,7 @@ def test_unconsolidated(tmp_path, mtx_format): @pytest.mark.parametrize("join", ["outer", "inner"]) @pytest.mark.parametrize("are_vars_different", [True, False]) -def test_concat_simple( +def test_concat( tmp_path, join: Literal["outer", "inner"], are_vars_different: bool, mtx_format ): from anndata.experimental.backed._compat import Dataset From 1f4ab9241a3690e99f7af0123422c224946b272e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Oct 2024 14:34:03 +0200 Subject: [PATCH 267/348] (chore): rename test file --- tests/{test_read_backed_experimental.py => test_read_lazy.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{test_read_backed_experimental.py => test_read_lazy.py} (100%) diff --git a/tests/test_read_backed_experimental.py b/tests/test_read_lazy.py similarity index 100% rename from tests/test_read_backed_experimental.py rename to tests/test_read_lazy.py From d25f5593214604c64c690e6ca69988a2893427da Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Oct 2024 14:34:39 +0200 Subject: [PATCH 268/348] (chore): rename notebook --- docs/tutorials/notebooks | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/notebooks b/docs/tutorials/notebooks index 9b5990755..0af6cf336 160000 --- a/docs/tutorials/notebooks +++ b/docs/tutorials/notebooks @@ -1 +1 @@ -Subproject commit 9b59907550a25af615424a9f6a645bc034b82eeb +Subproject commit 0af6cf3363aed1cafd317516c8393136ee6287ae From c661d390109b4bc28aebf29335eb48718d62d706 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Oct 2024 14:37:47 +0200 Subject: [PATCH 269/348] (chore): clarify `load_annotation_index` --- src/anndata/experimental/backed/_io.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index 018e9a5c5..9aaf1ae4e 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -35,6 +35,7 @@ def read_lazy( load_annotation_index Whether or not to use a range index for the `{obs,var}` :class:`xarray.Dataset` so as not to load the index into memory. If `False`, the real `index` will be inserted as `{obs,var}_names` in the object but not be one of the `coords` thereby preventing read operations. + Access to `adata.obs.index` will also only give the dummy index, and not the "real" index that is file-backed. 
Returns ------- From bff63cb9bb03172b744d91de7354c1e45ae464b9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Oct 2024 16:27:55 +0200 Subject: [PATCH 270/348] (fix): allow concatenation along arbitrary index + indexing test --- src/anndata/_core/merge.py | 45 ++++++++++++-------- tests/test_read_lazy.py | 84 ++++++++++++++++++++++++++++++++------ 2 files changed, 100 insertions(+), 29 deletions(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index f1fa27d13..98e413eef 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -5,12 +5,13 @@ from __future__ import annotations import typing +import warnings from collections import OrderedDict from collections.abc import Callable, Mapping, MutableSet from functools import partial, reduce, singledispatch from itertools import repeat from operator import and_, or_, sub -from typing import Literal, TypeVar, cast +from typing import Literal, TypeVar from warnings import warn import numpy as np @@ -1090,16 +1091,14 @@ def get_chunk(block_info=None): def make_xarray_extension_dtypes_dask( annotations: Iterable[Dataset2D], - axis_label: Literal["obs_names", "var_names"], ): new_annotations = [] for a in annotations: extension_cols = [] for col in a.columns: - if col != axis_label: - if pd.api.types.is_extension_array_dtype(a[col]): - extension_cols += [col] + if pd.api.types.is_extension_array_dtype(a[col]): + extension_cols += [col] new_annotations += [ a.copy( data={ @@ -1114,18 +1113,34 @@ def make_xarray_extension_dtypes_dask( return new_annotations +def get_attrs(annotations: Iterable[Dataset2D]) -> dict: + index_names = np.unique([a.index.name for a in annotations]) + assert len(index_names) == 1, "All annotations must have the same index name." + if any(a.index.dtype == "int64" for a in annotations): + msg = "Concatenating with a pandas numeric index among the indices. Index may likely not be unique." + warnings.warn(msg, UserWarning) + index_keys = [ + a.attrs["indexing_key"] for a in annotations if "indexing_key" in a.attrs + ] + attrs = {} + if len(np.unique(index_keys)) == 1: + attrs["indexing_key"] = index_keys[0] + return attrs + + def concat_dataset2d_on_annot_axis( annotations: Iterable[Dataset2D], - axis_label: Literal["obs_names", "var_names"], join: Join_T, ): from anndata.experimental.backed._compat import xr from anndata.experimental.backed._xarray import Dataset2D - annotations_with_only_dask = make_xarray_extension_dtypes_dask( - annotations, axis_label + annotations_with_only_dask = make_xarray_extension_dtypes_dask(annotations) + attrs = get_attrs(annotations_with_only_dask) + index_name = np.unique([a.index.name for a in annotations])[0] + return Dataset2D( + xr.concat(annotations_with_only_dask, join=join, dim=index_name), attrs=attrs ) - return Dataset2D(xr.concat(annotations_with_only_dask, join=join, dim=axis_label)) def concat( @@ -1400,8 +1415,7 @@ def concat( ignore_index=True, ) else: - axis_label = cast(Literal["obs_names", "var_names"], f"{axis_name}_names") - concat_annot = concat_dataset2d_on_annot_axis(annotations, axis_label, join) + concat_annot = concat_dataset2d_on_annot_axis(annotations, join) concat_annot.index = concat_indices if label is not None: concat_annot[label] = label_col @@ -1425,12 +1439,11 @@ def concat( else: # TODO: figure out mapping of our merge to theirs instead of just taking first, although this appears to be # the only "lazy" setting so I'm not sure we really want that. 
-        axis_label = cast(Literal["obs_names", "var_names"], f"{alt_axis_name}_names")
-        annotations_with_only_dask = make_xarray_extension_dtypes_dask(
-            alt_annotations, axis_label
-        )
+        annotations_with_only_dask = make_xarray_extension_dtypes_dask(alt_annotations)
+        attrs = get_attrs(annotations_with_only_dask)
         alt_annot = Dataset2D(
-            xr.merge(annotations_with_only_dask, join=join, compat="override")
+            xr.merge(annotations_with_only_dask, join=join, compat="override"),
+            attrs=attrs,
         )

     X = concat_Xs(adatas, reindexers, axis=axis, fill_value=fill_value)
diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py
index 65470295e..ce86c84f5 100644
--- a/tests/test_read_lazy.py
+++ b/tests/test_read_lazy.py
@@ -42,6 +42,11 @@ def dskfmt(request):
     return request.param


+@pytest.fixture(params=["outer", "inner"], scope="session")
+def join(request):
+    return request.param
+
+
 @pytest.fixture(scope="session")
 def adata_remote_orig(
     tmp_path_factory, dskfmt: str, mtx_format, load_annotation_index: bool
 ) -> tuple[AnnData, AnnData]:
@@ -58,7 +63,7 @@ def adata_remote_with_store_tall_skinny(
     tmp_path_factory, mtx_format
 ) -> tuple[AnnData, AccessTrackingStore]:
     orig_path = tmp_path_factory.mktemp("orig.zarr")
-    M = 1000000  # forces zarr to chunk `obs` columns multiple ways - that way 1 access to `int64` below is actually only one access
+    M = 100_000  # large enough that zarr splits each `obs` column into several chunks, so the single access to `int64` below reads exactly one chunk
     N = 5
     obs_names = pd.Index(f"cell{i}" for i in range(M))
     var_names = pd.Index(f"gene{i}" for i in range(N))
@@ -104,8 +109,9 @@ def test_access_count_obs_var(adata_remote_with_store_tall_skinny):
     store.assert_access_count("obs/int64", 0)
     store.assert_access_count("var/int64", 0)
     # all codes read in for subset (from 4 chunks)
-    store.assert_access_count("obs/cat/codes", 4)
-    remote[0:10, :].obs["int64"][0:10].compute()
+    store.assert_access_count("obs/cat/codes", 1)
+    # only one chunk needed for 0:10 subset
+    remote[0:10, :].obs["int64"].compute()
     store.assert_access_count("obs/int64", 1)
     # .zmetadata handles .zarray so simple access does not cause any read
     store.assert_access_count("var/int64", 0)
@@ -118,8 +124,8 @@ def test_access_count_index(adata_remote_with_store_tall_skinny):
     read_lazy(store, load_annotation_index=False)
     store.assert_access_count("obs/_index", 0)
     read_lazy(store)
-    # 16 is number of chunks
-    store.assert_access_count("obs/_index", 16)
+    # one access per chunk of the index (4 chunks at this size)
+    store.assert_access_count("obs/_index", 4)


 def test_access_count_dtype(adata_remote_with_store_tall_skinny):
@@ -183,6 +189,15 @@ def test_unconsolidated(tmp_path, mtx_format):
     store.assert_access_count("obs/.zgroup", 1)


+# remote has object dtype, so convert back for integers, booleans, etc.
+def correct_extension_dtype_differences(remote: pd.DataFrame, memory: pd.DataFrame):
+    for col in memory.columns:
+        dtype = memory[col].dtype
+        if pd.api.types.is_extension_array_dtype(dtype):
+            remote[col] = remote[col].astype(dtype)
+    return remote, memory
+
+
 @pytest.mark.parametrize("join", ["outer", "inner"])
 @pytest.mark.parametrize("are_vars_different", [True, False])
 def test_concat(
     tmp_path, join: Literal["outer", "inner"], are_vars_different: bool, mtx_format
@@ -232,14 +247,6 @@ def test_concat(
     obs_memory = concatenated_memory.obs
     obs_memory.index.name = "obs_names"

-    # remote has object dtype, need to convert back for integers booleans etc.
- def correct_extension_dtype_differences(remote: pd.DataFrame, memory: pd.DataFrame): - for col in memory.columns: - dtype = memory[col].dtype - if pd.api.types.is_extension_array_dtype(dtype): - remote[col] = remote[col].astype(dtype) - return remote, memory - assert_equal( *correct_extension_dtype_differences( concated_remote.obs.to_pandas(), obs_memory @@ -260,3 +267,54 @@ def correct_extension_dtype_differences(remote: pd.DataFrame, memory: pd.DataFra assert_equal(remote_df_corrected, var_df) assert_equal(concated_remote.X, concatenated_memory.X) + + +def test_concat_without_annotation_index(adata_remote_with_store_tall_skinny, join): + _, store = adata_remote_with_store_tall_skinny + remote = read_lazy(store, load_annotation_index=False) + orig = remote.to_memory() + with pytest.warns(UserWarning, match=r"Concatenating with a pandas numeric"): + remote_concatenated = ad.concat([remote, remote], join=join) + orig_concatenated = ad.concat([orig, orig], join=join) + in_memory_remote_concatenated = remote_concatenated.to_memory() + corrected_remote_obs, corrected_memory_obs = correct_extension_dtype_differences( + in_memory_remote_concatenated.obs, orig_concatenated.obs + ) + assert_equal(corrected_remote_obs, corrected_memory_obs) + assert_equal(in_memory_remote_concatenated.X, orig_concatenated.X) + assert all(in_memory_remote_concatenated.var_names == orig_concatenated.var_names) + + +@pytest.mark.parametrize( + "index", + [ + pytest.param( + slice(95_000, 105_000), + id="slice", + ), + pytest.param( + np.arange(95_000, 105_000), + id="consecutive integer array", + ), + pytest.param( + np.random.choice(np.arange(80_000, 110_000), 500), + id="random integer array", + ), + pytest.param( + np.random.choice([True, False], 200_000), + id="boolean array", + ), + ], +) +def test_concat_subsets(adata_remote_with_store_tall_skinny, join, index): + remote, _ = adata_remote_with_store_tall_skinny + orig = remote.to_memory() + remote_concatenated = ad.concat([remote, remote], join=join)[index] + orig_concatenated = ad.concat([orig, orig], join=join)[index] + in_memory_remote_concatenated = remote_concatenated.to_memory() + corrected_remote_obs, corrected_memory_obs = correct_extension_dtype_differences( + in_memory_remote_concatenated.obs, orig_concatenated.obs + ) + assert_equal(corrected_remote_obs, corrected_memory_obs) + assert_equal(in_memory_remote_concatenated.X, orig_concatenated.X) + assert all(in_memory_remote_concatenated.var_names == orig_concatenated.var_names) From 784ea9b05b19494ca72369eb79edd656d87fff89 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Oct 2024 16:41:05 +0200 Subject: [PATCH 271/348] (fix): notebook path in docs --- docs/tutorials/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index 9fb5f3283..29bc1f1fc 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -14,5 +14,5 @@ notebooks/anncollection-annloader notebooks/anndata_dask_array notebooks/awkward-arrays notebooks/{read,write}_dispatched -notebooks/read_backed_experimental +notebooks/read_lazy ``` From 489cc8df0a81dbe98271b4ef1bb2d7744793f2a9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 2 Oct 2024 14:05:55 +0200 Subject: [PATCH 272/348] (fix): don't copy column in dataset concat --- src/anndata/_core/merge.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 98e413eef..119866ca9 100644 --- a/src/anndata/_core/merge.py +++ 
b/src/anndata/_core/merge.py @@ -1071,13 +1071,11 @@ def make_dask_col_from_extension_dtype(col): from anndata._io.specs.lazy_methods import compute_chunk_layout_for_axis_size - new_col = col.copy() - def get_chunk(block_info=None): idx = tuple( slice(start, stop) for start, stop in block_info[None]["array-location"] ) - return np.array(new_col.data[idx].array) + return np.array(col.data[idx].array) # TODO: fix dtype dtype = "object" From 24f11be5daf0bd4043d6553d24ca4b9d345aa69d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 2 Oct 2024 14:11:40 +0200 Subject: [PATCH 273/348] (fix): actually test h5 --- tests/test_read_lazy.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index ce86c84f5..0b5634777 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -52,9 +52,12 @@ def adata_remote_orig( tmp_path_factory, dskfmt: str, mtx_format, load_annotation_index: bool ) -> tuple[AnnData, AnnData]: """Create remote fixtures, one without a range index and the other with""" - orig_path = tmp_path_factory.mktemp(f"orig.{dskfmt}") + if dskfmt == "h5ad": + orig_path = tmp_path_factory.mktemp("h5ad_file_dir") / f"orig.{dskfmt}" + else: + orig_path = tmp_path_factory.mktemp(f"orig.{dskfmt}") orig = gen_adata((1000, 1000), mtx_format) - orig.write_zarr(orig_path) + getattr(orig, f"write_{dskfmt}")(orig_path) return read_lazy(orig_path, load_annotation_index=load_annotation_index), orig From 57bcfd9aebb8d8f62f62791df4f82bea32385a65 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 2 Oct 2024 14:25:32 +0200 Subject: [PATCH 274/348] (feat): add h5ad concat support --- src/anndata/_core/merge.py | 8 ++++++++ tests/test_read_lazy.py | 16 ++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 119866ca9..93d4b7793 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -771,10 +771,18 @@ def np_bool_to_pd_bool_array(df: pd.DataFrame): # TODO: concat for xarray def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): + from anndata.experimental.backed._compat import Dataset + arrays = list(arrays) if fill_value is None: fill_value = default_fill_value(arrays) + if any(isinstance(a, Dataset) for a in arrays): + if not all(isinstance(a, Dataset) for a in arrays): + raise NotImplementedError( + "Cannot concatenate a Dataset with other array types." 
+            )
+        return concat_dataset2d_on_annot_axis(arrays, join="outer")
     if any(isinstance(a, pd.DataFrame) for a in arrays):
         # TODO: This is hacky, 0 is a sentinel for outer_concat_aligned_mapping
         if not all(
diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py
index 0b5634777..ee396cede 100644
--- a/tests/test_read_lazy.py
+++ b/tests/test_read_lazy.py
@@ -56,7 +56,7 @@ def adata_remote_orig(
         orig_path = tmp_path_factory.mktemp("h5ad_file_dir") / f"orig.{dskfmt}"
     else:
         orig_path = tmp_path_factory.mktemp(f"orig.{dskfmt}")
-    orig = gen_adata((1000, 1000), mtx_format)
+    orig = gen_adata((1000, 1100), mtx_format)
     getattr(orig, f"write_{dskfmt}")(orig_path)
     return read_lazy(orig_path, load_annotation_index=load_annotation_index), orig

@@ -292,26 +292,26 @@ def test_concat_without_annotation_index(adata_remote_with_store_tall_skinny, jo
     "index",
     [
         pytest.param(
-            slice(95_000, 105_000),
+            slice(500, 1500),
             id="slice",
         ),
         pytest.param(
-            np.arange(95_000, 105_000),
+            np.arange(950, 1050),
             id="consecutive integer array",
         ),
         pytest.param(
-            np.random.choice(np.arange(80_000, 110_000), 500),
+            np.random.choice(np.arange(800, 1100), 500),
             id="random integer array",
         ),
         pytest.param(
-            np.random.choice([True, False], 200_000),
+            np.random.choice([True, False], 2000),
             id="boolean array",
         ),
+        pytest.param(slice(None), id="full"),
     ],
 )
-def test_concat_subsets(adata_remote_with_store_tall_skinny, join, index):
-    remote, _ = adata_remote_with_store_tall_skinny
-    orig = remote.to_memory()
+def test_concat_subsets(adata_remote_orig, join, index):
+    remote, orig = adata_remote_orig
     remote_concatenated = ad.concat([remote, remote], join=join)[index]
     orig_concatenated = ad.concat([orig, orig], join=join)[index]
     in_memory_remote_concatenated = remote_concatenated.to_memory()

From 8b07f43c29d83447c210195f41b4ca2f0b12d48d Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 2 Oct 2024 14:52:43 +0200
Subject: [PATCH 275/348] (fix): add docstring example

---
 src/anndata/experimental/backed/_io.py | 47 ++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py
index 9aaf1ae4e..eaba44d93 100644
--- a/src/anndata/experimental/backed/_io.py
+++ b/src/anndata/experimental/backed/_io.py
@@ -40,6 +40,53 @@ def read_lazy(
     Returns
     -------
     A lazily read-in :class:`~anndata.AnnData` object.
+
+    Examples
+    --------
+
+    Preparing example objects
+
+    >>> import anndata as ad
+    >>> import httpx
+    >>> import scanpy as sc
+    >>> base_url = "https://datasets.cellxgene.cziscience.com"
+    >>> def get_cellxgene_data(id_: str):
+    ...     out_path = sc.settings.datasetdir / f"{id_}.h5ad"
+    ...     if out_path.exists():
+    ...         return out_path
+    ...     file_url = f"{base_url}/{id_}.h5ad"
+    ...     sc.settings.datasetdir.mkdir(parents=True, exist_ok=True)
+    ...     with httpx.stream("GET", file_url) as r, out_path.open("wb") as f:
+    ...         r.raise_for_status()
+    ...         for data in r.iter_bytes():
+    ...             f.write(data)
+    ...
return out_path + >>> path_b_cells = get_cellxgene_data("a93eab58-3d82-4b61-8a2f-d7666dcdb7c4") + >>> path_fetal = get_cellxgene_data("d170ff04-6da0-4156-a719-f8e1bbefbf53") + >>> b_cells_adata = ad.experimental.read_lazy(path_b_cells) + >>> fetal_adata = ad.experimental.read_lazy(path_fetal) + >>> print(b_cells_adata) + AnnData object with n_obs × n_vars = 146 × 33452 + obs: 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'organism_ontology_term_id', 'sample_uuid', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'suspension_uuid', 'suspension_type', 'library_uuid', 'assay_ontology_term_id', 'mapped_reference_annotation', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'disease_ontology_term_id', 'sex_ontology_term_id', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'Phase', 'sample', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage' + var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype' + uns: 'default_embedding', 'schema_version', 'title' + obsm: 'X_harmony', 'X_pca', 'X_umap' + >>> print(fetal_adata) + AnnData object with n_obs × n_vars = 344 × 15585 + obs: 'nCount_Spatial', 'nFeature_Spatial', 'Cluster', 'adult_pred_type', 'adult_pred_value', 'fetal_pred_type', 'fetal_pred_value', 'pDCs', 'Cell Cycle', 'Type 3 ILCs', 'DCs', 'Mast', 'Monocytes', 'Naive T-Cells', 'Venous (CP) 1', 'Venous (M) 2', 'Arterial (L)', 'Endothelium G2M-phase', 'Venous (CP) 2', 'Arterial (CP)', 'Arterial (M)', 'Endothelium S-phase', 'Proximal Progenitor', 'Proximal Mature Enterocytes', 'BEST4_OTOP2 Cells', 'Proximal TA', 'Proximal Early Enterocytes', 'Proximal Enterocytes', 'Proximal Stem Cells', 'EECs', 'Distal Enterocytes', 'Goblets', 'Distal TA', 'Distal Absorptive', 'Distal Stem Cells', 'Secretory Progenitors', 'Distal Mature Enterocytes', 'S1', 'S1 COL6A5+', 'S4 CCL21+', 'Proximal S2 (2)', 'S1 IFIT3+', 'Distal S2', 'Fibroblasts S-phase', 'Proximal S2 (1)', 'S3 Progenitor', 'Fibroblasts G2M-phase', 'S4 CXCL14+', 'Fibroblast Progenitor', 'S3 Transitional', 'Erythroid', 'S3 EBF+', 'S3 HAND1+', 'Pericytes G2M-phase', 'Pericyte Progenitors', 'Undifferentiated Pericytes', 'ICC PDGFRA+', 'MYOCD+ Muscularis', 'Muscularis S-phase', 'Muscularis G2M-phase', 'HOXP+ Proximal Muscularis', 'FOXF2+ Distal Muscularis', 'FOXF2- Muscularis', 'MORN5+ Distal Muscularis', 'Myofibroblast Progenitors', 'Myofibroblasts', 'Mesothelium SOX6+', 'Myofibroblasts S-phase', 'Myofibroblasts G2M-phase', 'Glial Progenitors', 'Excitory Motor Neuron', 'Interneuron', 'Differentiating Submucosal Glial', 'Inhibitory Motor Neuron Precursor', 'Neuroendocrine (1)', 'max', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'development_stage_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'cell_type_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'donor_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage' + var: 'sct.detection_rate', 'sct.gmean', 'sct.variance', 'sct.residual_mean', 'sct.residual_variance', 'sct.variable', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype' + uns: 'adult_pred_mat', 'fetal_pred_mat', 'schema_version', 'title' + obsm: 'X_pca', 'X_spatial', 'X_umap' + layers: 'counts', 'scale.data' + + This functionality is compatible with :func:`anndata.concat` + + >>> 
ad.concat([b_cells_adata, fetal_adata], join="outer") + AnnData object with n_obs × n_vars = 490 × 33452 + obs: 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'organism_ontology_term_id', 'sample_uuid', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'suspension_uuid', 'suspension_type', 'library_uuid', 'assay_ontology_term_id', 'mapped_reference_annotation', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'disease_ontology_term_id', 'sex_ontology_term_id', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'Phase', 'sample', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'nCount_Spatial', 'nFeature_Spatial', 'Cluster', 'adult_pred_type', 'adult_pred_value', 'fetal_pred_type', 'fetal_pred_value', 'pDCs', 'Cell Cycle', 'Type 3 ILCs', 'DCs', 'Mast', 'Monocytes', 'Naive T-Cells', 'Venous (CP) 1', 'Venous (M) 2', 'Arterial (L)', 'Endothelium G2M-phase', 'Venous (CP) 2', 'Arterial (CP)', 'Arterial (M)', 'Endothelium S-phase', 'Proximal Progenitor', 'Proximal Mature Enterocytes', 'BEST4_OTOP2 Cells', 'Proximal TA', 'Proximal Early Enterocytes', 'Proximal Enterocytes', 'Proximal Stem Cells', 'EECs', 'Distal Enterocytes', 'Goblets', 'Distal TA', 'Distal Absorptive', 'Distal Stem Cells', 'Secretory Progenitors', 'Distal Mature Enterocytes', 'S1', 'S1 COL6A5+', 'S4 CCL21+', 'Proximal S2 (2)', 'S1 IFIT3+', 'Distal S2', 'Fibroblasts S-phase', 'Proximal S2 (1)', 'S3 Progenitor', 'Fibroblasts G2M-phase', 'S4 CXCL14+', 'Fibroblast Progenitor', 'S3 Transitional', 'Erythroid', 'S3 EBF+', 'S3 HAND1+', 'Pericytes G2M-phase', 'Pericyte Progenitors', 'Undifferentiated Pericytes', 'ICC PDGFRA+', 'MYOCD+ Muscularis', 'Muscularis S-phase', 'Muscularis G2M-phase', 'HOXP+ Proximal Muscularis', 'FOXF2+ Distal Muscularis', 'FOXF2- Muscularis', 'MORN5+ Distal Muscularis', 'Myofibroblast Progenitors', 'Myofibroblasts', 'Mesothelium SOX6+', 'Myofibroblasts S-phase', 'Myofibroblasts G2M-phase', 'Glial Progenitors', 'Excitory Motor Neuron', 'Interneuron', 'Differentiating Submucosal Glial', 'Inhibitory Motor Neuron Precursor', 'Neuroendocrine (1)', 'max' + var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'sct.detection_rate', 'sct.gmean', 'sct.variance', 'sct.residual_mean', 'sct.residual_variance', 'sct.variable' + obsm: 'X_harmony', 'X_pca', 'X_umap', 'X_spatial' + layers: 'counts', 'scale.data' """ is_h5_store = isinstance(store, (h5py.Dataset, h5py.File)) is_h5 = ( From cb125bfc426c922beeddb446d886e44efb240c65 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 2 Oct 2024 16:09:49 +0200 Subject: [PATCH 276/348] (fix): catch warnings --- tests/test_read_lazy.py | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index ee396cede..162226076 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -1,5 +1,6 @@ from __future__ import annotations +from contextlib import contextmanager from importlib.util import find_spec from typing import TYPE_CHECKING @@ -272,22 +273,6 @@ def test_concat( assert_equal(concated_remote.X, concatenated_memory.X) -def test_concat_without_annotation_index(adata_remote_with_store_tall_skinny, join): - _, store = adata_remote_with_store_tall_skinny - remote = read_lazy(store, load_annotation_index=False) - orig = remote.to_memory() - with pytest.warns(UserWarning, match=r"Concatenating with a pandas numeric"): - 
remote_concatenated = ad.concat([remote, remote], join=join) - orig_concatenated = ad.concat([orig, orig], join=join) - in_memory_remote_concatenated = remote_concatenated.to_memory() - corrected_remote_obs, corrected_memory_obs = correct_extension_dtype_differences( - in_memory_remote_concatenated.obs, orig_concatenated.obs - ) - assert_equal(corrected_remote_obs, corrected_memory_obs) - assert_equal(in_memory_remote_concatenated.X, orig_concatenated.X) - assert all(in_memory_remote_concatenated.var_names == orig_concatenated.var_names) - - @pytest.mark.parametrize( "index", [ @@ -307,13 +292,29 @@ def test_concat_without_annotation_index(adata_remote_with_store_tall_skinny, jo np.random.choice([True, False], 2000), id="boolean array", ), - pytest.param(slice(None), id="full"), + pytest.param(slice(None), id="full slice"), + pytest.param(None, id="No index"), ], ) -def test_concat_subsets(adata_remote_orig, join, index): +def test_concat_full_and_subsets(adata_remote_orig, join, index, load_annotation_index): remote, orig = adata_remote_orig - remote_concatenated = ad.concat([remote, remote], join=join)[index] - orig_concatenated = ad.concat([orig, orig], join=join)[index] + + @contextmanager + def empty_context(): + yield + + maybe_warning_context = ( + pytest.warns(UserWarning, match=r"Concatenating with a pandas numeric") + if not load_annotation_index + else empty_context() + ) + with maybe_warning_context: + remote_concatenated = ad.concat([remote, remote], join=join) + if index is not None: + remote_concatenated = remote_concatenated[index] + orig_concatenated = ad.concat([orig, orig], join=join) + if index is not None: + orig_concatenated = orig_concatenated[index] in_memory_remote_concatenated = remote_concatenated.to_memory() corrected_remote_obs, corrected_memory_obs = correct_extension_dtype_differences( in_memory_remote_concatenated.obs, orig_concatenated.obs From e14f53ffa55cda7b9f62e1183bb000e47ea9863b Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 2 Oct 2024 16:35:48 +0200 Subject: [PATCH 277/348] (fix): format --- src/anndata/_core/merge.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 93d4b7793..2142df882 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -1095,9 +1095,7 @@ def get_chunk(block_info=None): ) -def make_xarray_extension_dtypes_dask( - annotations: Iterable[Dataset2D], -): +def make_xarray_extension_dtypes_dask(annotations: Iterable[Dataset2D]): new_annotations = [] for a in annotations: From 3c5641c1549fecfe7cbaef835bf97625d326d343 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 2 Oct 2024 16:59:56 +0200 Subject: [PATCH 278/348] (fix): threaded tests annotation index --- src/anndata/_io/specs/lazy_methods.py | 8 ++++++- tests/test_read_lazy.py | 33 +++++++++++++++++---------- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 39cfca216..6d31bf174 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re from contextlib import contextmanager from functools import partial from pathlib import Path @@ -242,7 +243,12 @@ def read_dataframe( for k in [*elem.attrs["column-order"], elem.attrs["_index"]] } elem_name = get_elem_name(elem) - label_based_indexing_key = f'{elem_name.replace("/", "")}_names' + # remove end for obsm/varm + obs_var_match = 
re.findall(r"(obs|var)", elem_name) + if not len(obs_var_match): + label_based_indexing_key = "index" + else: + label_based_indexing_key = f"{obs_var_match[0]}_names" if not use_range_index: index_label = label_based_indexing_key index_key = elem.attrs["_index"] diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index 162226076..02ef2463a 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -21,7 +21,7 @@ ) if TYPE_CHECKING: - from typing import Literal + from typing import Callable, Literal @pytest.fixture( @@ -48,10 +48,12 @@ def join(request): return request.param +# TODO: why does `read_lazy().to_memory()` cause `Dataset2D.to_memory()` to lose index name in +# multi-threaded tests when only opened once i.e., without this Callable? @pytest.fixture(scope="session") def adata_remote_orig( tmp_path_factory, dskfmt: str, mtx_format, load_annotation_index: bool -) -> tuple[AnnData, AnnData]: +) -> tuple[Callable[[], AnnData], AnnData]: """Create remote fixtures, one without a range index and the other with""" if dskfmt == "h5ad": orig_path = tmp_path_factory.mktemp("h5ad_file_dir") / f"orig.{dskfmt}" @@ -59,13 +61,15 @@ def adata_remote_orig( orig_path = tmp_path_factory.mktemp(f"orig.{dskfmt}") orig = gen_adata((1000, 1100), mtx_format) getattr(orig, f"write_{dskfmt}")(orig_path) - return read_lazy(orig_path, load_annotation_index=load_annotation_index), orig + return lambda: read_lazy( + orig_path, load_annotation_index=load_annotation_index + ), orig @pytest.fixture def adata_remote_with_store_tall_skinny( tmp_path_factory, mtx_format -) -> tuple[AnnData, AccessTrackingStore]: +) -> tuple[Callable[[], AnnData], AccessTrackingStore]: orig_path = tmp_path_factory.mktemp("orig.zarr") M = 100_000 # forces zarr to chunk `obs` columns multiple ways - that way 1 access to `int64` below is actually only one access N = 5 @@ -80,8 +84,7 @@ def adata_remote_with_store_tall_skinny( ) orig.write_zarr(orig_path) store = AccessTrackingStore(orig_path) - remote = read_lazy(store) - return remote, store + return lambda: read_lazy(store), store pytestmark = pytest.mark.skipif( @@ -90,7 +93,8 @@ def adata_remote_with_store_tall_skinny( def test_access_count_obs_var(adata_remote_with_store_tall_skinny): - remote, store = adata_remote_with_store_tall_skinny + remote_generator, store = adata_remote_with_store_tall_skinny + remote = remote_generator() store.initialize_key_trackers( ["obs/cat/codes", "obs/cat/categories", "obs/int64", "var/int64", "X"] ) @@ -133,7 +137,8 @@ def test_access_count_index(adata_remote_with_store_tall_skinny): def test_access_count_dtype(adata_remote_with_store_tall_skinny): - remote, store = adata_remote_with_store_tall_skinny + remote_generator, store = adata_remote_with_store_tall_skinny + remote = remote_generator() store.initialize_key_trackers(["obs/cat/categories"]) store.assert_access_count("obs/cat/categories", 0) # This should only cause categories to be read in once @@ -144,13 +149,15 @@ def test_access_count_dtype(adata_remote_with_store_tall_skinny): def test_to_memory(adata_remote_orig): - remote, orig = adata_remote_orig + remote_generator, orig = adata_remote_orig + remote = remote_generator() remote_to_memory = remote.to_memory() assert_equal(remote_to_memory, orig) def test_view_to_memory(adata_remote_orig): - remote, orig = adata_remote_orig + remote_generator, orig = adata_remote_orig + remote = remote_generator() subset_obs = orig.obs["obs_cat"] == "a" assert_equal(orig[subset_obs, :], remote[subset_obs, :].to_memory()) @@ -159,7 
+166,8 @@ def test_view_to_memory(adata_remote_orig): def test_view_of_view_to_memory(adata_remote_orig): - remote, orig = adata_remote_orig + remote_generator, orig = adata_remote_orig + remote = remote_generator() subset_obs = (orig.obs["obs_cat"] == "a") | (orig.obs["obs_cat"] == "b") subsetted_adata = orig[subset_obs, :] subset_subset_obs = subsetted_adata.obs["obs_cat"] == "b" @@ -297,7 +305,8 @@ def test_concat( ], ) def test_concat_full_and_subsets(adata_remote_orig, join, index, load_annotation_index): - remote, orig = adata_remote_orig + remote_generator, orig = adata_remote_orig + remote = remote_generator() @contextmanager def empty_context(): From dbe09ca857fcbd588881f38f10da355fb296e746 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 2 Oct 2024 17:26:49 +0200 Subject: [PATCH 279/348] (fix): remove xarray test from minimum deps --- .azure-pipelines.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index ffb7e5dc4..725eeb79e 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -76,6 +76,10 @@ jobs: displayName: "PyTest" condition: eq(variables['TEST_TYPE'], 'standard') + - script: pytest -k "not anndata.experimental.backed" + displayName: "PyTest (minimum)" + condition: eq(variables['DEPENDENCIES_VERSION'], 'minimum') + - script: pytest --cov --cov-report=xml --cov-context=test displayName: "PyTest (coverage)" condition: eq(variables['TEST_TYPE'], 'coverage') From 67fc5469cad220779ca3508972a6984adb06e562 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 2 Oct 2024 17:59:43 +0200 Subject: [PATCH 280/348] (fix): skip experimental backed tests if xarray not installed --- .azure-pipelines.yml | 2 +- src/testing/anndata/_pytest.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index 725eeb79e..d2d4fb81e 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -76,7 +76,7 @@ jobs: displayName: "PyTest" condition: eq(variables['TEST_TYPE'], 'standard') - - script: pytest -k "not anndata.experimental.backed" + - script: pytest displayName: "PyTest (minimum)" condition: eq(variables['DEPENDENCIES_VERSION'], 'minimum') diff --git a/src/testing/anndata/_pytest.py b/src/testing/anndata/_pytest.py index 5b0fd60e0..ecca3348d 100644 --- a/src/testing/anndata/_pytest.py +++ b/src/testing/anndata/_pytest.py @@ -9,6 +9,7 @@ from __future__ import annotations +import importlib import re import warnings from typing import TYPE_CHECKING, cast @@ -55,6 +56,9 @@ def _doctest_env( from anndata.utils import import_name assert isinstance(request.node.parent, pytest.Module) + if "experimental/backed" in str(request.node.path): + if importlib.util.find_spec("xarray") is None: + pytest.skip("xarray not installed") # request.node.parent is either a DoctestModule or a DoctestTextFile. # Only DoctestModule has a .obj attribute (the imported module). 
if request.node.parent.obj: @@ -63,7 +67,6 @@ def _doctest_env( if warning_detail := getattr(func, "__deprecated", None): cat, msg, _ = warning_detail warnings.filterwarnings("ignore", category=cat, message=re.escape(msg)) - old_dd, settings.datasetdir = settings.datasetdir, cache.mkdir("scanpy-data") with chdir(tmp_path): yield From 41a933546ee88c7ed83c8cd65d08c2ec25f2c7c1 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 2 Oct 2024 18:07:12 +0200 Subject: [PATCH 281/348] (chore): remove todo --- src/anndata/_core/merge.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 2142df882..b7dc887e0 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -769,7 +769,6 @@ def np_bool_to_pd_bool_array(df: pd.DataFrame): return df -# TODO: concat for xarray def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): from anndata.experimental.backed._compat import Dataset From 58e595bb9c9e362d96acfa6620b304921de1b73d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Oct 2024 12:20:41 +0200 Subject: [PATCH 282/348] (feat): add `uns` reading --- src/anndata/experimental/backed/_io.py | 9 +++++++-- tests/test_read_lazy.py | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index eaba44d93..726a9e99b 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -17,6 +17,9 @@ if TYPE_CHECKING: from collections.abc import MutableMapping + from anndata._io.specs.registry import IOSpec + from anndata._types import Read, StorageType + from ...compat import ZarrGroup @@ -26,7 +29,7 @@ def read_lazy( ) -> AnnData: """ Lazily read in on-disk/in-cloud AnnData stores, including `obs` and `var`. - No array data should need to be read into memory with the exception of :class:`awkward.Array` and some older-encoding string arrays. + No array data should need to be read into memory with the exception of :class:`awkward.Array`, scalars, and some older-encoding arrays. 
Parameters ---------- @@ -110,7 +113,7 @@ def read_lazy( else: f = h5py.File(store, mode="r") - def callback(func, elem_name: str, elem, iospec): + def callback(func: Read, /, elem_name: str, elem: StorageType, *, iospec: IOSpec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): cols = [ "obs", @@ -147,6 +150,8 @@ def callback(func, elem_name: str, elem, iospec): return read_elem_lazy(elem) elif iospec.encoding_type in {"awkward-array"}: return read_dispatched(elem, None) + elif iospec.encoding_type == "dict": + return {k: read_dispatched(v, callback=callback) for k, v in elem.items()} return func(elem) with settings.override(check_uniqueness=load_annotation_index): diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index 02ef2463a..3c5c5c911 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -11,6 +11,7 @@ import anndata as ad from anndata import AnnData +from anndata.compat import DaskArray from anndata.experimental import read_lazy from anndata.tests.helpers import ( AccessTrackingStore, @@ -151,6 +152,7 @@ def test_access_count_dtype(adata_remote_with_store_tall_skinny): def test_to_memory(adata_remote_orig): remote_generator, orig = adata_remote_orig remote = remote_generator() + assert isinstance(remote.uns["nested"]["nested_further"]["array"], DaskArray) remote_to_memory = remote.to_memory() assert_equal(remote_to_memory, orig) From 8875374dfe4a6ba8c8df5de514f5fb34d9c84ba1 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Oct 2024 12:31:30 +0200 Subject: [PATCH 283/348] (feat): add `Raw` reading + tests --- src/anndata/experimental/backed/_io.py | 4 +--- tests/test_read_lazy.py | 8 +++++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index 726a9e99b..d7bd3f48e 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -114,7 +114,7 @@ def read_lazy( f = h5py.File(store, mode="r") def callback(func: Read, /, elem_name: str, elem: StorageType, *, iospec: IOSpec): - if iospec.encoding_type == "anndata" or elem_name.endswith("/"): + if iospec.encoding_type in {"anndata", "raw"} or elem_name.endswith("/"): cols = [ "obs", "var", @@ -131,8 +131,6 @@ def callback(func: Read, /, elem_name: str, elem: StorageType, *, iospec: IOSpec elem.items() if has_keys else [(k, elem[k]) for k in cols if k in elem] ) return AnnData(**{k: read_dispatched(v, callback) for k, v in iter_object}) - elif elem_name.startswith("/raw"): - return None elif ( iospec.encoding_type in { diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index 3c5c5c911..47a0120db 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -61,6 +61,7 @@ def adata_remote_orig( else: orig_path = tmp_path_factory.mktemp(f"orig.{dskfmt}") orig = gen_adata((1000, 1100), mtx_format) + orig.raw = gen_adata((1000, 1100), mtx_format) getattr(orig, f"write_{dskfmt}")(orig_path) return lambda: read_lazy( orig_path, load_annotation_index=load_annotation_index @@ -83,6 +84,7 @@ def adata_remote_with_store_tall_skinny( var=var, X=mtx_format(np.random.binomial(100, 0.005, (M, N)).astype(np.float32)), ) + orig.raw = orig.copy() orig.write_zarr(orig_path) store = AccessTrackingStore(orig_path) return lambda: read_lazy(store), store @@ -97,13 +99,16 @@ def test_access_count_obs_var(adata_remote_with_store_tall_skinny): remote_generator, store = adata_remote_with_store_tall_skinny remote = remote_generator() store.initialize_key_trackers( - 
["obs/cat/codes", "obs/cat/categories", "obs/int64", "var/int64", "X"] + ["obs/cat/codes", "obs/cat/categories", "obs/int64", "var/int64", "X", "raw"] ) # a series of methods that should __not__ read in any data remote.X # the initial (non-subset) access to `X` should not read in data remote.shape remote.var remote.obs + remote.raw + remote.raw.var + remote.raw.X remote.obs["int64"] remote.obs["int64"] remote.obs["cat"] @@ -124,6 +129,7 @@ def test_access_count_obs_var(adata_remote_with_store_tall_skinny): store.assert_access_count("obs/int64", 1) # .zmetadata handles .zarray so simple access does not cause any read store.assert_access_count("var/int64", 0) + store.assert_access_count("raw", 0) def test_access_count_index(adata_remote_with_store_tall_skinny): From 4c991d4ae7908aec2c40c3bcbaabbc469718a856 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Oct 2024 12:41:17 +0200 Subject: [PATCH 284/348] (chore): make `test-full` shorter --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3f01d5796..4d5a7e65e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,7 +84,7 @@ doc = [ "anndata[dev-doc]", ] dev-doc = ["towncrier>=24.8.0"] # release notes tool -test-full = ["anndata[test]", "anndata[lazy]"] +test-full = ["anndata[test,lazy]"] test = [ "loompy>=3.0.5", "pytest>=8.2", From 50cdc66e7b79571944871c1501474d6c28da2790 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Oct 2024 12:47:29 +0200 Subject: [PATCH 285/348] (fix): stricter type checking --- src/anndata/_core/merge.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index b7dc887e0..3b3688ea7 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -770,16 +770,16 @@ def np_bool_to_pd_bool_array(df: pd.DataFrame): def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): - from anndata.experimental.backed._compat import Dataset + from anndata.experimental.backed._xarray import Dataset2D arrays = list(arrays) if fill_value is None: fill_value = default_fill_value(arrays) - if any(isinstance(a, Dataset) for a in arrays): - if not all(isinstance(a, Dataset) for a in arrays): + if any(isinstance(a, Dataset2D) for a in arrays): + if not all(isinstance(a, Dataset2D) for a in arrays): raise NotImplementedError( - "Cannot concatenate a Dataset with other array types." + "Cannot concatenate a Dataset2D with other array types." 
) return concat_dataset2d_on_annot_axis(arrays, join="outer") if any(isinstance(a, pd.DataFrame) for a in arrays): From eb881a925f1440c7be701ad7e876d9a847f21d0c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Oct 2024 13:46:25 +0200 Subject: [PATCH 286/348] (fix): dtype casting for concat --- src/anndata/_core/merge.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 3b3688ea7..833b9e757 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -1073,7 +1073,7 @@ def concat_Xs(adatas, reindexers, axis, fill_value): return concat_arrays(Xs, reindexers, axis=axis, fill_value=fill_value) -def make_dask_col_from_extension_dtype(col): +def make_dask_col_from_extension_dtype(col, use_only_object_dtype: bool = False): import dask.array as da from anndata._io.specs.lazy_methods import compute_chunk_layout_for_axis_size @@ -1084,17 +1084,22 @@ def get_chunk(block_info=None): ) return np.array(col.data[idx].array) - # TODO: fix dtype - dtype = "object" + if col.dtype == "category" or use_only_object_dtype: + dtype = "object" + else: + dtype = col.dtype.numpy_dtype # TODO: get good chunk size? return da.map_blocks( get_chunk, chunks=(compute_chunk_layout_for_axis_size(1000, col.shape[0]),), meta=np.array([], dtype=dtype), + dtype=dtype, ) -def make_xarray_extension_dtypes_dask(annotations: Iterable[Dataset2D]): +def make_xarray_extension_dtypes_dask( + annotations: Iterable[Dataset2D], use_only_object_dtype=False +): new_annotations = [] for a in annotations: @@ -1106,7 +1111,9 @@ def make_xarray_extension_dtypes_dask(annotations: Iterable[Dataset2D]): a.copy( data={ **{ - col: make_dask_col_from_extension_dtype(a[col]) + col: make_dask_col_from_extension_dtype( + a[col], use_only_object_dtype + ) for col in extension_cols }, **{col: a[col] for col in a.columns if col not in extension_cols}, @@ -1442,7 +1449,10 @@ def concat( else: # TODO: figure out mapping of our merge to theirs instead of just taking first, although this appears to be # the only "lazy" setting so I'm not sure we really want that. - annotations_with_only_dask = make_xarray_extension_dtypes_dask(alt_annotations) + # Because of xarray's merge upcasting, it's safest to simply assume that all dtypes are objects. 
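+        # (Illustration with made-up values: an outer join over partially
+        # overlapping indices introduces missing entries, and xarray, like
+        # pandas, then promotes int64 columns to float64 with NaN; object
+        # dtype columns pass through unchanged.)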
+ annotations_with_only_dask = make_xarray_extension_dtypes_dask( + alt_annotations, use_only_object_dtype=True + ) attrs = get_attrs(annotations_with_only_dask) alt_annot = Dataset2D( xr.merge(annotations_with_only_dask, join=join, compat="override"), From fe1f0a6ca22ef493b42a87bd3eb160bf9764659f Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Oct 2024 13:56:08 +0200 Subject: [PATCH 287/348] (chore): separate into two cleaner unit tests --- src/anndata/experimental/backed/_xarray.py | 2 +- tests/test_read_lazy.py | 78 +++++++++++++++------- 2 files changed, 56 insertions(+), 24 deletions(-) diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py index a42ca59b2..0f6acc834 100644 --- a/src/anndata/experimental/backed/_xarray.py +++ b/src/anndata/experimental/backed/_xarray.py @@ -66,7 +66,7 @@ def __init__(self, ds): self._ds = ds def __getitem__(self, idx): - coord = list(self._ds.coords.keys())[0] + coord = get_index_dim(self._ds) return self._ds.isel(**{coord: idx}) return IlocGetter(self) diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index 47a0120db..6c758443b 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -220,18 +220,16 @@ def correct_extension_dtype_differences(remote: pd.DataFrame, memory: pd.DataFra @pytest.mark.parametrize("join", ["outer", "inner"]) @pytest.mark.parametrize("are_vars_different", [True, False]) -def test_concat( - tmp_path, join: Literal["outer", "inner"], are_vars_different: bool, mtx_format +def test_concat_access_count( + tmp_path, join: Literal["outer", "inner"], are_vars_different: bool ): - from anndata.experimental.backed._compat import Dataset - lazy_adatas = [] adatas = [] stores: list[AccessTrackingStore] = [] var_indices = [] M = 1000 N = 50 - n_datasets = 2 + n_datasets = 3 for dataset_index in range(n_datasets): orig_path = tmp_path / f"orig_{dataset_index}.zarr" orig_path.mkdir() @@ -246,16 +244,15 @@ def test_concat( orig = AnnData( obs=obs, var=var, - X=mtx_format(np.random.binomial(100, 0.005, (M, N)).astype(np.float32)), + X=np.random.binomial(100, 0.005, (M, N)).astype(np.float32), ) orig.write_zarr(orig_path) store = AccessTrackingStore(orig_path) - store.initialize_key_trackers(["obs/int64", "var/int64", "X"]) + store.initialize_key_trackers(["obs/int64", "X", "var/int64"]) lazy_adatas += [read_lazy(store)] adatas += [orig] stores += [store] concated_remote = ad.concat(lazy_adatas, join=join) - assert isinstance(concated_remote.obs, Dataset) for i in range(n_datasets): stores[i].assert_access_count("obs/int64", 0) stores[i].assert_access_count("X", 0) @@ -269,24 +266,46 @@ def test_concat( assert_equal( *correct_extension_dtype_differences( - concated_remote.obs.to_pandas(), obs_memory + concated_remote[:M].obs.to_pandas(), concatenated_memory[:M].obs ) ) - # check non-different variables, taken from first annotation. all others are null so incomparable - pd_index = pd.Index(filter(lambda x: not x.endswith("ds"), var_indices[0])) - var_df = adatas[0][:, pd_index].var.copy() - var_df.index.name = "var_names" - remote_df_corrected, _ = correct_extension_dtype_differences( - concated_remote[:, pd_index].var.to_pandas(), var_df - ) - # TODO:xr.merge always upcasts to float due to NA and you can't downcast? 
- for col in remote_df_corrected.columns: - dtype = remote_df_corrected[col].dtype - if dtype in [np.float64, np.float32]: - var_df[col] = var_df[col].astype(dtype) - assert_equal(remote_df_corrected, var_df) + # check access count for the stores - only the first should be accessed + stores[0].assert_access_count("obs/int64", 1) + for i in range(1, n_datasets): + stores[i].assert_access_count("obs/int64", 0) + + # subsetting should not read data into memory + concated_remote[:M].X + for i in range(n_datasets): + stores[i].assert_access_count("X", 0) - assert_equal(concated_remote.X, concatenated_memory.X) + # check non-different variables, taken from first annotation. + pd_index_overlapping = pd.Index( + filter(lambda x: not x.endswith("ds"), var_indices[0]) + ) + var_df_overlapping = adatas[0][:, pd_index_overlapping].var.copy() + test_cases = [(pd_index_overlapping, var_df_overlapping, 0)] + if are_vars_different and join == "outer": + # check a set of unique variables from the first object since we only take from there if different + pd_index_only_ds_0 = pd.Index(filter(lambda x: "0_ds" in x, var_indices[1])) + var_df_only_ds_0 = adatas[0][:, pd_index_only_ds_0].var.copy() + test_cases.append((pd_index_only_ds_0, var_df_only_ds_0, 0)) + for pd_index, var_df, store_idx in test_cases: + var_df.index.name = "var_names" + remote_df = concated_remote[:, pd_index].var.to_pandas() + remote_df_corrected, _ = correct_extension_dtype_differences(remote_df, var_df) + # TODO: xr.merge always upcasts to float due to NA and you can't downcast? + for col in remote_df_corrected.columns: + dtype = remote_df_corrected[col].dtype + if dtype in [np.float64, np.float32]: + var_df[col] = var_df[col].astype(dtype) + assert_equal(remote_df_corrected, var_df) + + stores[store_idx].assert_access_count("var/int64", 1) + for store in stores: + if store != stores[store_idx]: + store.assert_access_count("var/int64", 0) + stores[store_idx].reset_key_trackers() @pytest.mark.parametrize( "index", [ pytest.param( slice(500), id="slice", ), pytest.param( np.arange(950), id="consecutive integer array", ), pytest.param( np.random.choice(np.arange(800), 500), id="random integer array", ), pytest.param( np.random.choice([True, False], 1000), id="boolean array", ), pytest.param(slice(None), id="full slice"), + pytest.param("a", id="categorical_subset"), pytest.param(None, id="No index"), ], ) def test_concat_full_and_subsets(adata_remote_orig, join, index, load_annotation_index): + from anndata.experimental.backed._xarray import Dataset2D + remote_generator, orig = adata_remote_orig remote = remote_generator() @@ -328,7 +350,17 @@ def empty_context(): with maybe_warning_context: remote_concatenated = ad.concat([remote, remote], join=join) if index is not None: + if np.isscalar(index) and index == "a": + index = remote_concatenated.obs["obs_cat"] == "a" remote_concatenated = remote_concatenated[index] + assert isinstance(remote_concatenated.obs, Dataset2D) + # check preservation of non-categorical dtypes on the concat axis + assert remote_concatenated.obs["int64"].dtype == "int64" + assert remote_concatenated.obs["uint8"].dtype == "uint8" + assert remote_concatenated.obs["nullable-int"].dtype == "int32" + assert remote_concatenated.obs["float64"].dtype == "float64" + assert remote_concatenated.obs["bool"].dtype == "bool" + assert remote_concatenated.obs["nullable-bool"].dtype == "bool" orig_concatenated = ad.concat([orig, orig], join=join) if index is not None: orig_concatenated = orig_concatenated[index] From c0c0c6c57ffddfd6a11b4b5109c2ef4bcb874885 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Oct 2024 13:58:29 +0200 Subject: [PATCH 288/348] (fix): typing of `make_xarray_extension_dtypes_dask` --- src/anndata/_core/merge.py | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 833b9e757..ab9b145e9 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -1098,7 +1098,7 @@ def get_chunk(block_info=None): def make_xarray_extension_dtypes_dask( - annotations: Iterable[Dataset2D], use_only_object_dtype=False + annotations: Iterable[Dataset2D], use_only_object_dtype: bool = False ): new_annotations = [] From fc720113ae0012d7773dc9229ee1466088f98406 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Oct 2024 14:03:38 +0200 Subject: [PATCH 289/348] (chore): remove comment --- src/anndata/_core/storage.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/anndata/_core/storage.py b/src/anndata/_core/storage.py index 7563de941..0a68e8676 100644 --- a/src/anndata/_core/storage.py +++ b/src/anndata/_core/storage.py @@ -26,7 +26,6 @@ def coerce_array( allow_array_like: bool = False, ): try: - # Needs to be done here to prevent circular imports, and StorageType is immutable from anndata.experimental.backed._xarray import Dataset2D except ImportError: From 562817ddccbdee178c61e4d8d2358f90afec0ffa Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Oct 2024 14:44:52 +0200 Subject: [PATCH 290/348] (fix): clean up compat --- src/anndata/_core/index.py | 4 ++-- src/anndata/_core/merge.py | 12 ++++++------ src/anndata/_core/storage.py | 2 +- src/anndata/_io/specs/lazy_methods.py | 18 +++++++++--------- src/anndata/_io/specs/registry.py | 2 +- src/anndata/experimental/backed/_compat.py | 12 +++--------- .../experimental/backed/_lazy_arrays.py | 3 ++- src/anndata/experimental/backed/_xarray.py | 14 +++++++++++--- tests/test_read_lazy.py | 2 +- 9 files changed, 36 insertions(+), 33 deletions(-) diff --git a/src/anndata/_core/index.py b/src/anndata/_core/index.py index 72a075456..ae3653196 100644 --- a/src/anndata/_core/index.py +++ b/src/anndata/_core/index.py @@ -50,7 +50,7 @@ def _normalize_index( | pd.Index, index: pd.Index, ) -> slice | int | np.ndarray: # ndarray of int or bool - from ..experimental.backed._compat import xr + from ..experimental.backed._compat import DataArray # TODO: why is this here? All tests pass without it and it seems at the minimum not strict enough. # if not isinstance(index, pd.RangeIndex): @@ -110,7 +110,7 @@ def name_idx(i): "are not valid obs/ var names or indices." 
) return positions # np.ndarray[int] - elif isinstance(indexer, xr.DataArray): + elif isinstance(indexer, DataArray): if isinstance(indexer.data, DaskArray): return indexer.data.compute() return indexer.data diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index ab9b145e9..d9f266309 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -42,7 +42,7 @@ from pandas.api.extensions import ExtensionDtype - from anndata.experimental.backed._xarray import Dataset2D + from anndata.experimental.backed._compat import Dataset2D Join_T = Literal["inner", "outer"] @@ -770,7 +770,7 @@ def np_bool_to_pd_bool_array(df: pd.DataFrame): def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): - from anndata.experimental.backed._xarray import Dataset2D + from anndata.experimental.backed._compat import Dataset2D arrays = list(arrays) if fill_value is None: @@ -1142,8 +1142,8 @@ def concat_dataset2d_on_annot_axis( annotations: Iterable[Dataset2D], join: Join_T, ): - from anndata.experimental.backed._compat import xr - from anndata.experimental.backed._xarray import Dataset2D + from anndata.experimental.backed._compat import Dataset2D + from anndata.experimental.backed._compat import xarray as xr annotations_with_only_dask = make_xarray_extension_dtypes_dask(annotations) attrs = get_attrs(annotations_with_only_dask) @@ -1357,8 +1357,8 @@ def concat( {'a': 1, 'b': 2, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}} """ - from anndata.experimental.backed._compat import xr - from anndata.experimental.backed._xarray import Dataset2D + from anndata.experimental.backed._compat import Dataset2D + from anndata.experimental.backed._compat import xarray as xr # Argument normalization merge = resolve_merge_strategy(merge) diff --git a/src/anndata/_core/storage.py b/src/anndata/_core/storage.py index 0a68e8676..047ef4f02 100644 --- a/src/anndata/_core/storage.py +++ b/src/anndata/_core/storage.py @@ -26,7 +26,7 @@ def coerce_array( allow_array_like: bool = False, ): try: - from anndata.experimental.backed._xarray import Dataset2D + from anndata.experimental.backed._compat import Dataset2D except ImportError: class Dataset2D: diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 6d31bf174..f08ecc538 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -21,9 +21,8 @@ from collections.abc import Callable, Generator, Iterator, Mapping, Sequence from typing import Literal, ParamSpec, TypeVar - from anndata.experimental.backed._compat import xr + from anndata.experimental.backed._compat import DataArray, Dataset2D from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray - from anndata.experimental.backed._xarray import Dataset2D from ..._core.sparse_dataset import _CSCDataset, _CSRDataset from ..._types import ArrayStorageType, StorageType @@ -195,26 +194,27 @@ def _gen_xarray_dict_iterator_from_elems( index_label: str, index_key: str, index: np.NDArray, -) -> Iterator[tuple[str, xr.DataArray]]: - from anndata.experimental.backed._compat import xr +) -> Iterator[tuple[str, DataArray]]: + from anndata.experimental.backed._compat import DataArray + from anndata.experimental.backed._compat import xarray as xr from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray for k, v in elem_dict.items(): data_array_name = k if isinstance(v, DaskArray) and k != index_key: - data_array = xr.DataArray(v, coords=[index], dims=[index_label], name=k) + data_array 
= DataArray(v, coords=[index], dims=[index_label], name=k) elif isinstance(v, (CategoricalArray, MaskedArray)) and k != index_key: variable = xr.Variable( data=xr.core.indexing.LazilyIndexedArray(v), dims=[index_label] ) - data_array = xr.DataArray( + data_array = DataArray( variable, coords=[index], dims=[index_label], name=k, ) elif k == index_key: - data_array = xr.DataArray( + data_array = DataArray( index, coords=[index], dims=[index_label], name=index_label ) data_array_name = index_label @@ -224,7 +224,7 @@ def _gen_xarray_dict_iterator_from_elems( if index_key == DUMMY_RANGE_INDEX_KEY: yield ( index_label, - xr.DataArray(index, coords=[index], dims=[index_label], name=index_label), + DataArray(index, coords=[index], dims=[index_label], name=index_label), ) @@ -236,7 +236,7 @@ def read_dataframe( _reader: LazyReader, use_range_index: bool = False, ) -> Dataset2D: - from anndata.experimental.backed._xarray import Dataset2D + from anndata.experimental.backed._compat import Dataset2D elem_dict = { k: _reader.read_elem(elem[k]) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 9b3001066..a93ac61a9 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -24,8 +24,8 @@ WriteCallback, _WriteInternal, ) + from anndata.experimental.backed._compat import Dataset2D from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray - from anndata.experimental.backed._xarray import Dataset2D from anndata.typing import RWAble T = TypeVar("T") diff --git a/src/anndata/experimental/backed/_compat.py b/src/anndata/experimental/backed/_compat.py index 037306698..634aa76c5 100644 --- a/src/anndata/experimental/backed/_compat.py +++ b/src/anndata/experimental/backed/_compat.py @@ -10,9 +10,9 @@ def __repr__(self) -> str: try: - import xarray as xr + import xarray except ImportError: - xr = None + xarray = None try: @@ -33,10 +33,4 @@ def __repr__(self) -> str: return "mock BackendArray" -try: - from xarray import Dataset -except ImportError: - - class Dataset: - def __repr__(self) -> str: - return "mock Dataset" +from ._xarray import Dataset, Dataset2D # noqa: F401 diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 34948f327..a99f55824 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -10,7 +10,8 @@ from anndata.compat import H5Array, ZarrArray from ..._settings import settings -from ._compat import BackendArray, DataArray, ZarrArrayWrapper, xr +from ._compat import BackendArray, DataArray, ZarrArrayWrapper +from ._compat import xarray as xr if TYPE_CHECKING: from anndata._core.index import Index diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py index 0f6acc834..4aa9adbdb 100644 --- a/src/anndata/experimental/backed/_xarray.py +++ b/src/anndata/experimental/backed/_xarray.py @@ -8,17 +8,25 @@ from ..._core.file_backing import to_memory from ..._core.index import _subset from ..._core.views import as_view -from ._compat import Dataset + +try: + from xarray import Dataset +except ImportError: + + class Dataset: + def __repr__(self) -> str: + return "mock Dataset" + if TYPE_CHECKING: from collections.abc import Hashable, Iterable from typing import Any, Literal from ..._core.index import Index - from ._compat import DataArray + from ._compat import xarray as xr -def get_index_dim(ds: DataArray) -> Hashable: +def get_index_dim(ds: 
xr.DataArray) -> Hashable: assert ( len(ds.sizes) == 1 ), f"xarray Dataset should not have more than 1 dim, found {len(ds.sizes)}" diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index 6c758443b..bb787a7c4 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -333,7 +333,7 @@ def test_concat_access_count( ], ) def test_concat_full_and_subsets(adata_remote_orig, join, index, load_annotation_index): - from anndata.experimental.backed._xarray import Dataset2D + from anndata.experimental.backed._compat import Dataset2D remote_generator, orig = adata_remote_orig remote = remote_generator() From 9ef7cf5c91c20c31f242c98fb99c576e8b6edabe Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Oct 2024 14:59:54 +0200 Subject: [PATCH 291/348] (chore): xarray raises import error for `read_lazy` --- src/anndata/experimental/backed/_io.py | 6 ++++++ tests/test_readwrite.py | 8 ++++++++ 2 files changed, 14 insertions(+) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index d7bd3f48e..26619fdfc 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -91,6 +91,12 @@ def read_lazy( obsm: 'X_harmony', 'X_pca', 'X_umap', 'X_spatial' layers: 'counts', 'scale.data' """ + try: + import xarray  # noqa: F401 + except ImportError: + raise ImportError( + "xarray is required to use the `read_lazy` function. Please install xarray." + ) is_h5_store = isinstance(store, (h5py.Dataset, h5py.File)) is_h5 = ( isinstance(store, (Path, str)) and Path(store).suffix == ".h5ad" diff --git a/tests/test_readwrite.py b/tests/test_readwrite.py index 04d20d272..67921da0e 100644 --- a/tests/test_readwrite.py +++ b/tests/test_readwrite.py @@ -847,3 +847,11 @@ def test_h5py_attr_limit(tmp_path): np.ones((5, N)), index=a.obs_names, columns=[str(i) for i in range(N)] ) a.write(tmp_path / "tmp.h5ad") + + +@pytest.mark.skipif( + find_spec("xarray"), reason="Xarray is installed so `read_lazy` will not error" +) +def test_read_lazy_import_error(): + with pytest.raises(ImportError, match="xarray"): + ad.experimental.read_lazy("test.zarr") From 69b6cc1919c2c8de255bee09fe61bc996f7829ba Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Mon, 14 Oct 2024 10:25:15 +0200 Subject: [PATCH 292/348] (fix) ignore some uncovered lines --- src/anndata/_core/aligned_df.py | 2 +- src/anndata/_core/merge.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/anndata/_core/aligned_df.py b/src/anndata/_core/aligned_df.py index 9fc6efb5b..ea66a7fda 100644 --- a/src/anndata/_core/aligned_df.py +++ b/src/anndata/_core/aligned_df.py @@ -23,7 +23,7 @@ def _gen_dataframe( source: Literal["X", "shape"], attr: Literal["obs", "var"], length: int | None = None, -) -> pd.DataFrame: +) -> pd.DataFrame: # pragma: no cover raise ValueError(f"Cannot convert {type(anno)} to {attr} DataFrame") diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index d9f266309..bd2bee129 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -4,14 +4,13 @@ from __future__ import annotations -import typing import warnings from collections import OrderedDict from collections.abc import Callable, Mapping, MutableSet from functools import partial, reduce, singledispatch from itertools import repeat from operator import and_, or_, sub -from typing import Literal, TypeVar +from typing import TYPE_CHECKING, Literal, TypeVar from warnings import warn import numpy as np @@ -36,7 +35,7 @@ from .anndata import AnnData from .index import _subset, make_slice -if typing.TYPE_CHECKING: +if TYPE_CHECKING: from collections.abc import Collection, Iterable, Sequence from typing import Any @@ -1154,7 +1153,7 @@ def concat_dataset2d_on_annot_axis( def concat( - adatas: Collection[AnnData] | typing.Mapping[str, AnnData], + adatas: Collection[AnnData] | Mapping[str, AnnData], *, axis: Literal["obs", 0, "var", 1] = "obs", join: Join_T = "inner", From 3dfefc495004de129befaa1896c72a83b83bd76f Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Mon, 14 Oct 2024 10:27:58 +0200 Subject: [PATCH 293/348] use existing warn import --- src/anndata/_core/merge.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index bd2bee129..7117fb5a8 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -4,7 +4,6 @@ from __future__ import annotations -import warnings from collections import OrderedDict from collections.abc import Callable, Mapping, MutableSet from functools import partial, reduce, singledispatch @@ -1127,7 +1126,7 @@ def get_attrs(annotations: Iterable[Dataset2D]) -> dict: assert len(index_names) == 1, "All annotations must have the same index name." if any(a.index.dtype == "int64" for a in annotations): msg = "Concatenating with a pandas numeric index among the indices. Index may likely not be unique." 
- warnings.warn(msg, UserWarning) + warn(msg, UserWarning) index_keys = [ a.attrs["indexing_key"] for a in annotations if "indexing_key" in a.attrs ] From 1eb440ef4c5eedde308de84e9015aac628f672d9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 16 Oct 2024 12:04:07 +0200 Subject: [PATCH 294/348] merge --- .azure-pipelines.yml | 6 +- .github/workflows/test-gpu.yml | 12 +++- docs/release-notes/0.11.0rc3.md | 6 ++ docs/tutorials/notebooks | 2 +- hatch.toml | 2 +- pyproject.toml | 5 +- src/anndata/_core/aligned_mapping.py | 21 ++++--- src/anndata/_core/anndata.py | 14 ++--- src/anndata/_core/index.py | 10 ++-- src/anndata/_core/merge.py | 16 +++--- src/anndata/_core/raw.py | 8 +-- src/anndata/_core/sparse_dataset.py | 2 +- src/anndata/_io/h5ad.py | 4 +- src/anndata/_io/read.py | 2 +- src/anndata/_io/utils.py | 7 ++- src/anndata/_types.py | 8 +-- src/anndata/compat/__init__.py | 24 ++------ src/anndata/experimental/merge.py | 2 +- .../multi_files/_anncollection.py | 4 +- src/anndata/tests/helpers.py | 4 +- src/anndata/typing.py | 55 +++++++++---------- tests/test_backed_hdf5.py | 4 +- tests/test_concatenate.py | 6 +- tests/test_io_dispatched.py | 2 +- tests/test_readwrite.py | 10 ++-- 25 files changed, 119 insertions(+), 117 deletions(-) create mode 100644 docs/release-notes/0.11.0rc3.md diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index d2d4fb81e..4464d6d66 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -18,14 +18,14 @@ jobs: python.version: "3.12" RUN_COVERAGE: yes TEST_TYPE: "coverage" - Python3.9: - python.version: "3.9" + Python3.10: + python.version: "3.10" PreRelease: python.version: "3.12" DEPENDENCIES_VERSION: "pre-release" TEST_TYPE: "strict-warning" minimum_versions: - python.version: "3.9" + python.version: "3.10" DEPENDENCIES_VERSION: "minimum" TEST_TYPE: "coverage" steps: diff --git a/.github/workflows/test-gpu.yml b/.github/workflows/test-gpu.yml index 4283ac780..02f681c4d 100644 --- a/.github/workflows/test-gpu.yml +++ b/.github/workflows/test-gpu.yml @@ -51,10 +51,20 @@ jobs: - name: Nvidia SMI sanity check run: nvidia-smi + - name: Install yq + run: | + sudo snap install yq + + - name: Extract max Python version from classifiers + run: | + classifiers=$(yq .project.classifiers pyproject.toml -oy | grep --only-matching --perl-regexp '(?<=Python :: )(\d\.\d+)') + max_version=$(echo "$classifiers" | sort -V | tail -1) + echo "max_python_version=$max_version" >> $GITHUB_ENV + - name: Install Python uses: actions/setup-python@v5 with: - python-version: "3.x" + python-version: ${{ env.max_python_version }} - name: Install UV uses: hynek/setup-cached-uv@v2 diff --git a/docs/release-notes/0.11.0rc3.md b/docs/release-notes/0.11.0rc3.md new file mode 100644 index 000000000..417b003cb --- /dev/null +++ b/docs/release-notes/0.11.0rc3.md @@ -0,0 +1,6 @@ +(v0.11.0rc3)= +### 0.11.0rc3 {small}`2024-10-14` + +### Breaking changes + +- Drop support for `python` 3.9 {user}`ilan-gold` ({pr}`1712`) diff --git a/docs/tutorials/notebooks b/docs/tutorials/notebooks index 0af6cf336..9e186c5c6 160000 --- a/docs/tutorials/notebooks +++ b/docs/tutorials/notebooks @@ -1 +1 @@ -Subproject commit 0af6cf3363aed1cafd317516c8393136ee6287ae +Subproject commit 9e186c5c694793bb04ea1397721d154d6e0b7069 diff --git a/hatch.toml b/hatch.toml index ecbe889d4..738056567 100644 --- a/hatch.toml +++ b/hatch.toml @@ -21,7 +21,7 @@ overrides.matrix.deps.env-vars = [ { key = "UV_RESOLUTION", value = "lowest-direct", if = ["min"] }, ] overrides.matrix.deps.python = [ - { if = ["min"], value = 
"3.9" }, + { if = ["min"], value = "3.10" }, { if = ["stable", "pre"], value = "3.12" }, ] diff --git a/pyproject.toml b/pyproject.toml index 4d5a7e65e..e70237591 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = ["hatchling", "hatch-vcs"] [project] name = "anndata" description = "Annotated data." -requires-python = ">=3.9" +requires-python = ">=3.10" license = "BSD-3-Clause" authors = [ { name = "Philipp Angerer" }, @@ -29,7 +29,6 @@ classifiers = [ "Operating System :: Microsoft :: Windows", "Operating System :: POSIX :: Linux", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -43,7 +42,7 @@ dependencies = [ "numpy>=1.23", # https://github.com/scverse/anndata/issues/1434 "scipy >1.8", - "h5py>=3.1", + "h5py>=3.6", "exceptiongroup; python_version<'3.11'", "natsort", "packaging>=20.0", diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index e2f6e4352..9df5ac977 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -5,7 +5,7 @@ from collections.abc import MutableMapping, Sequence from copy import copy from dataclasses import dataclass -from typing import TYPE_CHECKING, Generic, TypeVar, Union +from typing import TYPE_CHECKING, Generic, TypeVar import numpy as np import pandas as pd @@ -33,10 +33,10 @@ from .raw import Raw -OneDIdx = Union[Sequence[int], Sequence[bool], slice] +OneDIdx = Sequence[int] | Sequence[bool] | slice TwoDIdx = tuple[OneDIdx, OneDIdx] # TODO: pd.DataFrame only allowed in AxisArrays? -Value = Union[pd.DataFrame, spmatrix, np.ndarray] +Value = pd.DataFrame | spmatrix | np.ndarray P = TypeVar("P", bound="AlignedMappingBase") """Parent mapping an AlignedView is based on.""" @@ -376,9 +376,14 @@ class PairwiseArraysView(AlignedView[PairwiseArraysBase, OneDIdx], PairwiseArray PairwiseArraysBase._actual_class = PairwiseArrays -AlignedMapping = Union[ - AxisArrays, AxisArraysView, Layers, LayersView, PairwiseArrays, PairwiseArraysView -] +AlignedMapping = ( + AxisArrays + | AxisArraysView + | Layers + | LayersView + | PairwiseArrays + | PairwiseArraysView +) T = TypeVar("T", bound=AlignedMapping) """Pair of types to be aligned.""" @@ -408,9 +413,7 @@ def fget(self) -> Callable[[], None]: def fake(): ... - fake.__annotations__ = { - "return": Union[self.cls._actual_class, self.cls._view_class] - } + fake.__annotations__ = {"return": self.cls._actual_class | self.cls._view_class} return fake def __get__(self, obj: None | AnnData, objtype: type | None = None) -> T: diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index a11a709ec..46c286f9e 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -271,12 +271,12 @@ def _init_as_view(self, adata_ref: AnnData, oidx: Index, vidx: Index): "that is, you cannot make a view of a view." 
) self._is_view = True - if isinstance(oidx, (int, np.integer)): + if isinstance(oidx, int | np.integer): if not (-adata_ref.n_obs <= oidx < adata_ref.n_obs): raise IndexError(f"Observation index `{oidx}` is out of range.") oidx += adata_ref.n_obs * (oidx < 0) oidx = slice(oidx, oidx + 1, 1) - if isinstance(vidx, (int, np.integer)): + if isinstance(vidx, int | np.integer): if not (-adata_ref.n_vars <= vidx < adata_ref.n_vars): raise IndexError(f"Variable index `{vidx}` is out of range.") vidx += adata_ref.n_vars * (vidx < 0) @@ -405,7 +405,7 @@ def _init_as_actual( # as in readwrite.read_10x_h5 if X.dtype != np.dtype(dtype): X = X.astype(dtype) - elif isinstance(X, (ZarrArray, DaskArray)): + elif isinstance(X, ZarrArray | DaskArray): X = X.astype(dtype) else: # is np.ndarray or a subclass, convert to true np.ndarray X = np.asarray(X, dtype) @@ -762,16 +762,14 @@ def _prep_dim_index(self, value, attr: str) -> pd.Index: raise ValueError( f"Length of passed value for {attr}_names is {len(value)}, but this AnnData has shape: {self.shape}" ) - if isinstance(value, pd.Index) and not isinstance( - value.name, (str, type(None)) - ): + if isinstance(value, pd.Index) and not isinstance(value.name, str | type(None)): raise ValueError( f"AnnData expects .{attr}.index.name to be a string or None, " f"but you passed a name of type {type(value.name).__name__!r}" ) else: value = pd.Index(value) - if not isinstance(value.name, (str, type(None))): + if not isinstance(value.name, str | type(None)): value.name = None if ( len(value) > 0 @@ -1977,7 +1975,7 @@ def chunk_X( if isinstance(select, int): select = select if select < self.n_obs else self.n_obs choice = np.random.choice(self.n_obs, select, replace) - elif isinstance(select, (np.ndarray, Sequence)): + elif isinstance(select, np.ndarray | Sequence): choice = np.asarray(select) else: raise ValueError("select should be int or array") diff --git a/src/anndata/_core/index.py b/src/anndata/_core/index.py index ae3653196..b5b8b1bd2 100644 --- a/src/anndata/_core/index.py +++ b/src/anndata/_core/index.py @@ -73,25 +73,25 @@ def name_idx(i): stop = None if stop is None else stop + 1 step = indexer.step return slice(start, stop, step) - elif isinstance(indexer, (np.integer, int)): + elif isinstance(indexer, np.integer | int): return indexer elif isinstance(indexer, str): return index.get_loc(indexer) # int elif isinstance( - indexer, (Sequence, np.ndarray, pd.Index, spmatrix, np.matrix, SpArray) + indexer, Sequence | np.ndarray | pd.Index | spmatrix | np.matrix | SpArray ): if hasattr(indexer, "shape") and ( (indexer.shape == (index.shape[0], 1)) or (indexer.shape == (1, index.shape[0])) ): - if isinstance(indexer, (spmatrix, SpArray)): + if isinstance(indexer, spmatrix | SpArray): indexer = indexer.toarray() indexer = np.ravel(indexer) - if not isinstance(indexer, (np.ndarray, pd.Index)): + if not isinstance(indexer, np.ndarray | pd.Index): indexer = np.array(indexer) if len(indexer) == 0: indexer = indexer.astype(int) - if issubclass(indexer.dtype.type, (np.integer, np.floating)): + if issubclass(indexer.dtype.type, np.integer | np.floating): return indexer # Might not work for range indexes elif issubclass(indexer.dtype.type, np.bool_): if indexer.shape != index.shape: diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 7117fb5a8..45f831af3 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -177,7 +177,7 @@ def equal_sparse(a, b) -> bool: xp = array_api_compat.array_namespace(a.data) - if isinstance(b, 
(CupySparseMatrix, sparse.spmatrix, SpArray)): + if isinstance(b, CupySparseMatrix | sparse.spmatrix | SpArray): if isinstance(a, CupySparseMatrix): # Comparison broken for CSC matrices # https://github.com/cupy/cupy/issues/7757 @@ -209,7 +209,7 @@ def equal_awkward(a, b) -> bool: def as_sparse(x, use_sparse_array=False): - if not isinstance(x, (sparse.spmatrix, SpArray)): + if not isinstance(x, sparse.spmatrix | SpArray): if CAN_USE_SPARSE_ARRAY and use_sparse_array: return sparse.csr_array(x) return sparse.csr_matrix(x) @@ -541,7 +541,7 @@ def apply(self, el, *, axis, fill_value=None): return el if isinstance(el, pd.DataFrame): return self._apply_to_df(el, axis=axis, fill_value=fill_value) - elif isinstance(el, (sparse.spmatrix, SpArray, CupySparseMatrix)): + elif isinstance(el, sparse.spmatrix | SpArray | CupySparseMatrix): return self._apply_to_sparse(el, axis=axis, fill_value=fill_value) elif isinstance(el, AwkArray): return self._apply_to_awkward(el, axis=axis, fill_value=fill_value) @@ -727,10 +727,10 @@ def default_fill_value(els): This is largely due to backwards compat, and might not be the ideal solution. """ if any( - isinstance(el, (sparse.spmatrix, SpArray)) + isinstance(el, sparse.spmatrix | SpArray) or ( isinstance(el, DaskArray) - and isinstance(el._meta, (sparse.spmatrix, SpArray)) + and isinstance(el._meta, sparse.spmatrix | SpArray) ) for el in els ): @@ -812,7 +812,7 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): import cupyx.scipy.sparse as cpsparse if not all( - isinstance(a, (CupySparseMatrix, CupyArray)) or 0 in a.shape for a in arrays + isinstance(a, CupySparseMatrix | CupyArray) or 0 in a.shape for a in arrays ): raise NotImplementedError( "Cannot concatenate a cupy array with other array types." 
@@ -839,7 +839,7 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): ], axis=axis, ) - elif any(isinstance(a, (sparse.spmatrix, SpArray)) for a in arrays): + elif any(isinstance(a, sparse.spmatrix | SpArray) for a in arrays): sparse_stack = (sparse.vstack, sparse.hstack)[axis] use_sparse_array = any(issubclass(type(a), SpArray) for a in arrays) return sparse_stack( @@ -998,7 +998,7 @@ def concat_pairwise_mapping( els = [ m.get(k, sparse_class((s, s), dtype=bool)) for m, s in zip(mappings, shapes) ] - if all(isinstance(el, (CupySparseMatrix, CupyArray)) for el in els): + if all(isinstance(el, CupySparseMatrix | CupyArray) for el in els): result[k] = _cp_block_diag(els, format="csr") elif all(isinstance(el, DaskArray) for el in els): result[k] = _dask_block_diag(els) diff --git a/src/anndata/_core/raw.py b/src/anndata/_core/raw.py index 7237c06b4..d138440b5 100644 --- a/src/anndata/_core/raw.py +++ b/src/anndata/_core/raw.py @@ -40,7 +40,7 @@ def __init__( # construct manually if adata.isbacked == (X is None): # Move from GPU to CPU since it's large and not always used - if isinstance(X, (CupyArray, CupySparseMatrix)): + if isinstance(X, CupyArray | CupySparseMatrix): self._X = X.get() else: self._X = X @@ -51,7 +51,7 @@ def __init__( self.varm = varm elif X is None: # construct from adata # Move from GPU to CPU since it's large and not always used - if isinstance(adata.X, (CupyArray, CupySparseMatrix)): + if isinstance(adata.X, CupyArray | CupySparseMatrix): self._X = adata.X.get() else: self._X = adata.X.copy() @@ -124,9 +124,9 @@ def __getitem__(self, index): oidx, vidx = self._normalize_indices(index) # To preserve two dimensional shape - if isinstance(vidx, (int, np.integer)): + if isinstance(vidx, int | np.integer): vidx = slice(vidx, vidx + 1, 1) - if isinstance(oidx, (int, np.integer)): + if isinstance(oidx, int | np.integer): oidx = slice(oidx, oidx + 1, 1) if not self._adata.isbacked: diff --git a/src/anndata/_core/sparse_dataset.py b/src/anndata/_core/sparse_dataset.py index 91bc23cc7..ae6b47c7f 100644 --- a/src/anndata/_core/sparse_dataset.py +++ b/src/anndata/_core/sparse_dataset.py @@ -343,7 +343,7 @@ def _get_group_format(group: GroupStorageType) -> str: def is_sparse_indexing_overridden(format: Literal["csr", "csc"], row, col): major_indexer, minor_indexer = (row, col) if format == "csr" else (col, row) return isinstance(minor_indexer, slice) and ( - (isinstance(major_indexer, (int, np.integer))) + (isinstance(major_indexer, int | np.integer)) or (isinstance(major_indexer, slice)) or (isinstance(major_indexer, np.ndarray) and major_indexer.ndim == 1) ) diff --git a/src/anndata/_io/h5ad.py b/src/anndata/_io/h5ad.py index 36429403d..edf4977cc 100644 --- a/src/anndata/_io/h5ad.py +++ b/src/anndata/_io/h5ad.py @@ -82,14 +82,14 @@ def write_h5ad( f.attrs.setdefault("encoding-version", "0.1.0") if "X" in as_dense and isinstance( - adata.X, (sparse.spmatrix, BaseCompressedSparseDataset) + adata.X, sparse.spmatrix | BaseCompressedSparseDataset ): write_sparse_as_dense(f, "X", adata.X, dataset_kwargs=dataset_kwargs) elif not (adata.isbacked and Path(adata.filename) == Path(filepath)): # If adata.isbacked, X should already be up to date write_elem(f, "X", adata.X, dataset_kwargs=dataset_kwargs) if "raw/X" in as_dense and isinstance( - adata.raw.X, (sparse.spmatrix, BaseCompressedSparseDataset) + adata.raw.X, sparse.spmatrix | BaseCompressedSparseDataset ): write_sparse_as_dense( f, "raw/X", adata.raw.X, dataset_kwargs=dataset_kwargs diff --git 
a/src/anndata/_io/read.py b/src/anndata/_io/read.py index fd886f370..f22cff351 100644 --- a/src/anndata/_io/read.py +++ b/src/anndata/_io/read.py @@ -337,7 +337,7 @@ def read_text( dtype Numpy data type. """ - if not isinstance(filename, (PathLike, str, bytes)): + if not isinstance(filename, PathLike | str | bytes): return _read_text(filename, delimiter, first_column_names, dtype) filename = Path(filename) diff --git a/src/anndata/_io/utils.py b/src/anndata/_io/utils.py index ee7aa23d0..f8bdb01c7 100644 --- a/src/anndata/_io/utils.py +++ b/src/anndata/_io/utils.py @@ -1,6 +1,7 @@ from __future__ import annotations from functools import wraps +from itertools import pairwise from typing import TYPE_CHECKING, cast from warnings import warn @@ -8,16 +9,16 @@ from packaging.version import Version from .._core.sparse_dataset import BaseCompressedSparseDataset -from ..compat import add_note, pairwise +from ..compat import add_note if TYPE_CHECKING: from collections.abc import Callable - from typing import Literal, Union + from typing import Literal from .._types import StorageType from ..compat import H5Group, ZarrGroup - Storage = Union[StorageType, BaseCompressedSparseDataset] + Storage = StorageType | BaseCompressedSparseDataset # For allowing h5py v3 # https://github.com/scverse/anndata/issues/442 diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 6d9a1e9bf..d81f6aaf4 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Protocol, TypeVar, Union +from typing import TYPE_CHECKING, Protocol, TypeVar from .compat import ( H5Array, @@ -32,9 +32,9 @@ "StorageType", ] -ArrayStorageType: TypeAlias = Union[ZarrArray, H5Array] -GroupStorageType: TypeAlias = Union[ZarrGroup, H5Group] -StorageType: TypeAlias = Union[ArrayStorageType, GroupStorageType] +ArrayStorageType: TypeAlias = ZarrArray | H5Array +GroupStorageType: TypeAlias = ZarrGroup | H5Group +StorageType: TypeAlias = ArrayStorageType | GroupStorageType # NOTE: If you change these, be sure to update `autodoc_type_aliases` in docs/conf.py! 
ContravariantRWAble = TypeVar("ContravariantRWAble", bound=RWAble, contravariant=True) diff --git a/src/anndata/compat/__init__.py b/src/anndata/compat/__init__.py index 78bd484db..93c86141a 100644 --- a/src/anndata/compat/__init__.py +++ b/src/anndata/compat/__init__.py @@ -10,7 +10,7 @@ from importlib.util import find_spec from inspect import Parameter, signature from pathlib import Path -from typing import TYPE_CHECKING, TypeVar, Union +from typing import TYPE_CHECKING, TypeVar from warnings import warn import h5py @@ -46,8 +46,8 @@ class Empty: pass -Index1D = Union[slice, int, str, np.int64, np.ndarray] -Index = Union[Index1D, tuple[Index1D, Index1D], scipy.sparse.spmatrix, SpArray] +Index1D = slice | int | str | np.int64 | np.ndarray +Index = Index1D | tuple[Index1D, Index1D] | scipy.sparse.spmatrix | SpArray H5Group = h5py.Group H5Array = h5py.Dataset H5File = h5py.File @@ -75,18 +75,6 @@ def __exit__(self, *_exc_info) -> None: os.chdir(self._old_cwd.pop()) -if sys.version_info >= (3, 10): - from itertools import pairwise -else: - - def pairwise(iterable): - from itertools import tee - - a, b = tee(iterable) - next(b, None) - return zip(a, b) - - ############################# # Optional deps ############################# @@ -288,7 +276,7 @@ def _to_fixed_length_strings(value: np.ndarray) -> np.ndarray: return value.astype(new_dtype) -Group_T = TypeVar("Group_T", bound=Union[ZarrGroup, h5py.Group]) +Group_T = TypeVar("Group_T", bound=ZarrGroup | h5py.Group) # TODO: This is a workaround for https://github.com/scverse/anndata/issues/874 @@ -319,7 +307,7 @@ def _clean_uns(adata: AnnData): # noqa: F821 continue name = cats_name.replace("_categories", "") # fix categories with a single category - if isinstance(cats, (str, int)): + if isinstance(cats, str | int): cats = [cats] for ann in [adata.obs, adata.var]: if name not in ann: @@ -344,7 +332,7 @@ def _move_adj_mtx(d): for k in ("distances", "connectivities"): if ( (k in n) - and isinstance(n[k], (scipy.sparse.spmatrix, np.ndarray)) + and isinstance(n[k], scipy.sparse.spmatrix | np.ndarray) and len(n[k].shape) == 2 ): warn( diff --git a/src/anndata/experimental/merge.py b/src/anndata/experimental/merge.py index 9690420ec..21a678e2c 100644 --- a/src/anndata/experimental/merge.py +++ b/src/anndata/experimental/merge.py @@ -352,7 +352,7 @@ def _write_concat_sequence( ) write_elem(output_group, output_path, df) elif all( - isinstance(a, (pd.DataFrame, BaseCompressedSparseDataset, H5Array, ZarrArray)) + isinstance(a, pd.DataFrame | BaseCompressedSparseDataset | H5Array | ZarrArray) for a in arrays ): _write_concat_arrays( diff --git a/src/anndata/experimental/multi_files/_anncollection.py b/src/anndata/experimental/multi_files/_anncollection.py index 31b27c879..c5f427f6d 100644 --- a/src/anndata/experimental/multi_files/_anncollection.py +++ b/src/anndata/experimental/multi_files/_anncollection.py @@ -3,7 +3,7 @@ import warnings from collections.abc import Callable, Mapping from functools import reduce -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -584,7 +584,7 @@ def attrs_keys(self): DictCallable = dict[str, Callable] -ConvertType = Union[Callable, dict[str, Union[Callable, DictCallable]]] +ConvertType = Callable | dict[str, Callable | DictCallable] class AnnCollection(_ConcatViewMixin, _IterateViewMixin): diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index 2ae0fdd17..5a8df6460 100644 --- a/src/anndata/tests/helpers.py +++ 
b/src/anndata/tests/helpers.py @@ -37,8 +37,8 @@ from anndata.utils import asarray if TYPE_CHECKING: - from collections.abc import Collection - from typing import Callable, Literal, TypeGuard, TypeVar + from collections.abc import Callable, Collection + from typing import Literal, TypeGuard, TypeVar DT = TypeVar("DT") diff --git a/src/anndata/typing.py b/src/anndata/typing.py index b3a5517f4..d13927bad 100644 --- a/src/anndata/typing.py +++ b/src/anndata/typing.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -32,38 +32,35 @@ """1D or 2D index an :class:`~anndata.AnnData` object can be sliced with.""" -ArrayDataStructureType: TypeAlias = Union[ - np.ndarray, - ma.MaskedArray, - sparse.csr_matrix, - sparse.csc_matrix, - SpArray, - AwkArray, - H5Array, - ZarrArray, - ZappyArray, - abc.CSRDataset, - abc.CSCDataset, - DaskArray, - CupyArray, - CupySparseMatrix, -] +ArrayDataStructureType: TypeAlias = ( + np.ndarray + | ma.MaskedArray + | sparse.csr_matrix + | sparse.csc_matrix + | SpArray + | AwkArray + | H5Array + | ZarrArray + | ZappyArray + | abc.CSRDataset + | abc.CSCDataset + | DaskArray + | CupyArray + | CupySparseMatrix +) -InMemoryArrayOrScalarType: TypeAlias = Union[ - pd.DataFrame, np.number, str, ArrayDataStructureType -] +InMemoryArrayOrScalarType: TypeAlias = ( + pd.DataFrame | np.number | str | ArrayDataStructureType +) -AxisStorable: TypeAlias = Union[ - InMemoryArrayOrScalarType, dict[str, "AxisStorable"], list["AxisStorable"] -] +AxisStorable: TypeAlias = ( + InMemoryArrayOrScalarType | dict[str, "AxisStorable"] | list["AxisStorable"] +) """A serializable object, excluding :class:`anndata.AnnData` objects i.e., something that can be stored in `uns` or `obsm`.""" -RWAble: TypeAlias = Union[ - AxisStorable, - AnnData, - pd.Categorical, - pd.api.extensions.ExtensionArray, -] +RWAble: TypeAlias = ( + AxisStorable | AnnData | pd.Categorical | pd.api.extensions.ExtensionArray +) """A superset of :type:`anndata.typing.AxisStorable` (i.e., including :class:`anndata.AnnData`) which is everything can be read/written by :func:`anndata.io.read_elem` and :func:`anndata.io.write_elem`.""" diff --git a/tests/test_backed_hdf5.py b/tests/test_backed_hdf5.py index 6cb449e28..19b4ca44d 100644 --- a/tests/test_backed_hdf5.py +++ b/tests/test_backed_hdf5.py @@ -200,8 +200,8 @@ def test_backed_raw_subset(tmp_path, array_type, subset_func, subset_func2): var_idx = subset_func2(mem_adata.var_names) if ( array_type is asarray - and isinstance(obs_idx, (list, np.ndarray, sparse.spmatrix, SpArray)) - and isinstance(var_idx, (list, np.ndarray, sparse.spmatrix, SpArray)) + and isinstance(obs_idx, list | np.ndarray | sparse.spmatrix | SpArray) + and isinstance(var_idx, list | np.ndarray | sparse.spmatrix | SpArray) ): pytest.xfail( "Fancy indexing does not work with multiple arrays on a h5py.Dataset" diff --git a/tests/test_concatenate.py b/tests/test_concatenate.py index f1f585e5c..7d83c73fc 100644 --- a/tests/test_concatenate.py +++ b/tests/test_concatenate.py @@ -497,19 +497,19 @@ def get_obs_els(adata): adata1.obsm = { k: v for k, v in adata1.obsm.items() - if not isinstance(v, (pd.DataFrame, AwkArray)) + if not isinstance(v, pd.DataFrame | AwkArray) } adata2 = gen_adata((10, 5)) adata2.obsm = { k: v[:, : v.shape[1] // 2] for k, v in adata2.obsm.items() - if not isinstance(v, (pd.DataFrame, AwkArray)) + if not isinstance(v, pd.DataFrame | AwkArray) } adata3 = gen_adata((7, 3)) 
adata3.obsm = { k: v[:, : v.shape[1] // 3] for k, v in adata3.obsm.items() - if not isinstance(v, (pd.DataFrame, AwkArray)) + if not isinstance(v, pd.DataFrame | AwkArray) } # remove AwkArrays from adata.var, as outer joins are not yet implemented for them for tmp_ad in [adata1, adata2, adata3]: diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py index 79bf729e4..0bbbf285a 100644 --- a/tests/test_io_dispatched.py +++ b/tests/test_io_dispatched.py @@ -96,7 +96,7 @@ def set_copy(d, **kwargs): # TODO: Should the passed path be absolute? path = "/" + store.path + "/" + k if hasattr(elem, "shape") and not isinstance( - elem, (sparse.spmatrix, SpArray, ad.AnnData) + elem, sparse.spmatrix | SpArray | ad.AnnData ): if re.match(r"^/((X)|(layers)).*", path): chunks = (M, N) diff --git a/tests/test_readwrite.py b/tests/test_readwrite.py index 67921da0e..1da55a0cd 100644 --- a/tests/test_readwrite.py +++ b/tests/test_readwrite.py @@ -163,16 +163,16 @@ def test_readwrite_kitchensink(tmp_path, storage, typ, backing_h5ad, dataset_kwa if isinstance(adata_src.raw.X, SpArray): assert isinstance(adata.raw.X, sparse.spmatrix) else: - assert isinstance(adata_src.raw.X, (type(adata.raw.X), DaskArray)) + assert isinstance(adata_src.raw.X, type(adata.raw.X) | DaskArray) assert isinstance( - adata_src.uns["uns4"]["c"], (type(adata.uns["uns4"]["c"]), DaskArray) + adata_src.uns["uns4"]["c"], type(adata.uns["uns4"]["c"]) | DaskArray ) - assert isinstance(adata_src.varm, (type(adata.varm), DaskArray)) + assert isinstance(adata_src.varm, type(adata.varm) | DaskArray) assert_equal(adata.raw.X, adata_src.raw.X) pd.testing.assert_frame_equal(adata.raw.var, adata_src.raw.var) - assert isinstance(adata.uns["uns4"]["a"], (int, np.integer)) - assert isinstance(adata_src.uns["uns4"]["a"], (int, np.integer)) + assert isinstance(adata.uns["uns4"]["a"], int | np.integer) + assert isinstance(adata_src.uns["uns4"]["a"], int | np.integer) assert_equal(adata, adata_src) From fe77a5c35851696465f99149c3e96c493e59a84c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 16 Oct 2024 15:00:36 +0200 Subject: [PATCH 295/348] (chore): add awkward `nitpick_ignore` comment --- docs/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/conf.py b/docs/conf.py index 4919036ab..865dad0d4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -108,6 +108,7 @@ ("py:class", "numpy.ma.core.MaskedArray"), ("py:class", "dask.array.core.Array"), ("py:class", "awkward.highlevel.Array"), + # https://github.com/sphinx-doc/sphinx/issues/10591 ("py:class", "awkward.Array"), ("py:class", "anndata._core.sparse_dataset.BaseCompressedSparseDataset"), ("py:obj", "numpy._typing._array_like._ScalarType_co"), From a654421975d0a2b8ad78fbc856309fa959042a10 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 16 Oct 2024 15:27:05 +0200 Subject: [PATCH 296/348] (refactor): use generator for new datasets --- src/anndata/_core/merge.py | 58 ++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 45f831af3..aa6f56bd0 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -35,7 +35,7 @@ from .index import _subset, make_slice if TYPE_CHECKING: - from collections.abc import Collection, Iterable, Sequence + from collections.abc import Collection, Generator, Iterable, Sequence from typing import Any from pandas.api.extensions import ExtensionDtype @@ -1097,28 +1097,36 @@ def get_chunk(block_info=None): def 
make_xarray_extension_dtypes_dask( annotations: Iterable[Dataset2D], use_only_object_dtype: bool = False -): - new_annotations = [] +) -> Generator[Dataset2D, None, None]: + """ + Creates a generator of Dataset2D objects with dask arrays in place of :class:`pandas.api.extensions.ExtensionArray` dtype columns. + Parameters + ---------- + annotations + The datasets to be altered + use_only_object_dtype, optional + Whether or not to cast all :class:`pandas.api.extensions.ExtensionArray` dtypes to `object` type, by default False + + Yields + ------ + An altered dataset. + """ for a in annotations: - extension_cols = [] - for col in a.columns: - if pd.api.types.is_extension_array_dtype(a[col]): - extension_cols += [col] - new_annotations += [ - a.copy( - data={ - **{ - col: make_dask_col_from_extension_dtype( - a[col], use_only_object_dtype - ) - for col in extension_cols - }, - **{col: a[col] for col in a.columns if col not in extension_cols}, - } - ) - ] - return new_annotations + extension_cols = set( + filter(lambda col: pd.api.types.is_extension_array_dtype(a[col]), a.columns) + ) + + yield a.copy( + data={ + name: ( + make_dask_col_from_extension_dtype(col, use_only_object_dtype) + if name in extension_cols + else col + ) + for name, col in a.items() + } + ) def get_attrs(annotations: Iterable[Dataset2D]) -> dict: @@ -1143,7 +1151,7 @@ def concat_dataset2d_on_annot_axis( from anndata.experimental.backed._compat import Dataset2D from anndata.experimental.backed._compat import xarray as xr - annotations_with_only_dask = make_xarray_extension_dtypes_dask(annotations) + annotations_with_only_dask = list(make_xarray_extension_dtypes_dask(annotations)) attrs = get_attrs(annotations_with_only_dask) index_name = np.unique([a.index.name for a in annotations])[0] return Dataset2D( @@ -1448,8 +1456,10 @@ def concat( # TODO: figure out mapping of our merge to theirs instead of just taking first, although this appears to be # the only "lazy" setting so I'm not sure we really want that. # Because of xarray's merge upcasting, it's safest to simply assume that all dtypes are objects. 
- annotations_with_only_dask = make_xarray_extension_dtypes_dask( - alt_annotations, use_only_object_dtype=True + annotations_with_only_dask = list( + make_xarray_extension_dtypes_dask( + alt_annotations, use_only_object_dtype=True + ) ) attrs = get_attrs(annotations_with_only_dask) alt_annot = Dataset2D( From 015bdcaf1682141a49a3767912dc0f1330c3c12e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 16 Oct 2024 15:27:34 +0200 Subject: [PATCH 297/348] (chore): docs + types in `merge.py` --- src/anndata/_core/merge.py | 44 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index aa6f56bd0..bc41346b7 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -40,7 +40,7 @@ from pandas.api.extensions import ExtensionDtype - from anndata.experimental.backed._compat import Dataset2D + from anndata.experimental.backed._compat import DataArray, Dataset2D Join_T = Literal["inner", "outer"] @@ -1071,7 +1071,23 @@ def concat_Xs(adatas, reindexers, axis, fill_value): return concat_arrays(Xs, reindexers, axis=axis, fill_value=fill_value) -def make_dask_col_from_extension_dtype(col, use_only_object_dtype: bool = False): +def make_dask_col_from_extension_dtype( + col: DataArray, use_only_object_dtype: bool = False +) -> DaskArray: + """ + Creates a dask array from an :class:`pandas.api.extensions.ExtensionArray` dtype :class:`xarray.DataArray`. + + Parameters + ---------- + col + The column to be converted + use_only_object_dtype, optional + Whether or not to cast all :class:`pandas.api.extensions.ExtensionArray` dtypes to `object` type, by default False + + Returns + ------- + A :class:`dask.Array` representation of the column. + """ import dask.array as da from anndata._io.specs.lazy_methods import compute_chunk_layout_for_axis_size @@ -1130,6 +1146,17 @@ def make_xarray_extension_dtypes_dask( def get_attrs(annotations: Iterable[Dataset2D]) -> dict: + """Generate the `attrs` from `annotations`. + + Parameters + ---------- + annotations + The datasets with `attrs`. + + Returns + ------- + `attrs`. + """ index_names = np.unique([a.index.name for a in annotations]) assert len(index_names) == 1, "All annotations must have the same index name." if any(a.index.dtype == "int64" for a in annotations): @@ -1148,6 +1175,19 @@ def concat_dataset2d_on_annot_axis( annotations: Iterable[Dataset2D], join: Join_T, ): + """Create a concatenated dataset from a list of :class:`~anndata.experimental.backed._xarray.Dataset2D` objects. + + Parameters + ---------- + annotations + The :class:`~anndata.experimental.backed._xarray.Dataset2D` objects to be concatenated. 
+ join + Type of join operation. + + Returns + ------- + Concatenated :class:`~anndata.experimental.backed._xarray.Dataset2D` + """ from anndata.experimental.backed._compat import Dataset2D from anndata.experimental.backed._compat import xarray as xr From 7201bad7683d9d8ab36e407490356217345e15ee Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 16 Oct 2024 15:28:06 +0200 Subject: [PATCH 298/348] (refactor): use set for `index_name` in `merge.py` --- src/anndata/_core/merge.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index bc41346b7..959f91e89 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -1194,6 +1194,7 @@ def concat_dataset2d_on_annot_axis( annotations_with_only_dask = list(make_xarray_extension_dtypes_dask(annotations)) attrs = get_attrs(annotations_with_only_dask) index_name = np.unique([a.index.name for a in annotations])[0] + [index_name] = {a.index.name for a in annotations} return Dataset2D( xr.concat(annotations_with_only_dask, join=join, dim=index_name), attrs=attrs ) From 5d813c0e77ecb6b521cadf1702023032acc5d101 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 16 Oct 2024 15:48:44 +0200 Subject: [PATCH 299/348] (refactor): comprehension for `{alt_}annotations_in_memory` --- src/anndata/_core/merge.py | 28 ++++++++--------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 959f91e89..e4af0cf6f 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -1456,18 +1456,12 @@ def concat( are_any_annotations_dataframes = any( isinstance(a, pd.DataFrame) for a in annotations ) - are_annotations_mixed_type = are_any_annotations_dataframes and any( - isinstance(a, Dataset2D) for a in annotations - ) if are_any_annotations_dataframes: - annotations_in_memory = annotations.copy() - if are_annotations_mixed_type: - for i, a in enumerate(annotations): - annotations_in_memory[i] = ( - a.to_pandas() if isinstance(a, Dataset2D) else a - ) + annotations_in_memory = ( + a.to_pandas() if isinstance(a, Dataset2D) else a for a in annotations + ) concat_annot = pd.concat( - unify_dtypes(a for a in annotations_in_memory), + unify_dtypes(annotations_in_memory), join=join, ignore_index=True, ) @@ -1482,17 +1476,11 @@ def concat( are_any_alt_annotations_dataframes = any( isinstance(a, pd.DataFrame) for a in alt_annotations ) - are_alt_annotations_mixed_type = are_any_alt_annotations_dataframes and any( - isinstance(a, Dataset2D) for a in alt_annotations - ) if are_any_alt_annotations_dataframes: - alt_annotations_in_memory = alt_annotations.copy() - if are_alt_annotations_mixed_type: - for i, a in enumerate(alt_annotations): - alt_annotations_in_memory[i] = ( - a.to_pandas() if isinstance(a, Dataset2D) else a - ) - alt_annot = merge_dataframes(alt_annotations, alt_indices, merge) + alt_annotations_in_memory = [ + a.to_pandas() if isinstance(a, Dataset2D) else a for a in alt_annotations + ] + alt_annot = merge_dataframes(alt_annotations_in_memory, alt_indices, merge) else: # TODO: figure out mapping of our merge to theirs instead of just taking first, although this appears to be # the only "lazy" setting so I'm not sure we really want that.
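For orientation between these refactors: a minimal usage sketch of the lazy concatenation path the patches above build, assuming xarray is installed; `a.zarr` and `b.zarr` are hypothetical placeholder stores previously written with `AnnData.write_zarr`.

import anndata as ad

# read_lazy keeps obs/var as lazy Dataset2D objects instead of pandas DataFrames
adatas = [ad.experimental.read_lazy(p) for p in ("a.zarr", "b.zarr")]
# on the concat axis, extension-dtype columns become dask-backed, so the
# concatenated annotations stay lazy rather than being read into memory
merged = ad.concat(adatas, join="outer")
subset = merged[:100]  # subsetting triggers no reads from the underlying stores
obs_df = merged.obs.to_pandas()  # explicitly materialize the concatenated obs

This mirrors what `test_concat_access_count` asserts with its `AccessTrackingStore`s: concatenation and subsetting alone should leave the on-disk access counts at zero.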
From b50e8ad89ae675fad9ad702e5a8c18b102688eb9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 16 Oct 2024 15:51:01 +0200 Subject: [PATCH 300/348] (chore): types in `lazy_methods.py` --- src/anndata/_io/specs/lazy_methods.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index f08ecc538..b6d00fcc8 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -18,7 +18,7 @@ from .registry import _LAZY_REGISTRY, IOSpec if TYPE_CHECKING: - from collections.abc import Callable, Generator, Iterator, Mapping, Sequence + from collections.abc import Callable, Generator, Mapping, Sequence from typing import Literal, ParamSpec, TypeVar from anndata.experimental.backed._compat import DataArray, Dataset2D @@ -194,7 +194,7 @@ def _gen_xarray_dict_iterator_from_elems( index_label: str, index_key: str, index: np.NDArray, -) -> Iterator[tuple[str, DataArray]]: +) -> Generator[tuple[str, DataArray]]: from anndata.experimental.backed._compat import DataArray from anndata.experimental.backed._compat import xarray as xr from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray @@ -203,7 +203,7 @@ def _gen_xarray_dict_iterator_from_elems( data_array_name = k if isinstance(v, DaskArray) and k != index_key: data_array = DataArray(v, coords=[index], dims=[index_label], name=k) - elif isinstance(v, (CategoricalArray, MaskedArray)) and k != index_key: + elif isinstance(v, CategoricalArray | MaskedArray) and k != index_key: variable = xr.Variable( data=xr.core.indexing.LazilyIndexedArray(v), dims=[index_label] ) From 3ca669cf32eee60e156b3d9e00154f0ca7ed9ce5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 16 Oct 2024 16:01:52 +0200 Subject: [PATCH 301/348] (chore): `lazy_methods.py` index handling made clearer --- src/anndata/_io/specs/lazy_methods.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index b6d00fcc8..9a32f8ef6 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -243,12 +243,14 @@ def read_dataframe( for k in [*elem.attrs["column-order"], elem.attrs["_index"]] } elem_name = get_elem_name(elem) - # remove end for obsm/varm - obs_var_match = re.findall(r"(obs|var)", elem_name) - if not len(obs_var_match): + # Determine whether we can use label-based indexing, i.e., whether the elem is `obs` or `var` + obs_var_matches = re.findall(r"(obs|var)", elem_name) + if len(obs_var_matches) != 1: label_based_indexing_key = "index" else: - label_based_indexing_key = f"{obs_var_match[0]}_names" + label_based_indexing_key = f"{obs_var_matches[0]}_names" + # If we are not using a range index, the underlying on-disk label for the index + # could be different from {obs,var}_names - otherwise we use a dummy value. 
if not use_range_index: index_label = label_based_indexing_key index_key = elem.attrs["_index"] From 7ea20df47c35bf9582a70c95e89c1f41c2f7ed6a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 16 Oct 2024 16:02:10 +0200 Subject: [PATCH 302/348] (chore): move comment --- src/anndata/_io/specs/lazy_methods.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 9a32f8ef6..e64881274 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -254,9 +254,8 @@ def read_dataframe( if not use_range_index: index_label = label_based_indexing_key index_key = elem.attrs["_index"] - index = elem_dict[ - index_key - ].compute() # no sense in reading this in multiple times + # no sense in reading this in multiple times + index = elem_dict[index_key].compute() else: index_label = DUMMY_RANGE_INDEX_KEY index_key = DUMMY_RANGE_INDEX_KEY From 91fdb909f05686fee795adb26c74ee52bfb6218e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 16 Oct 2024 16:02:52 +0200 Subject: [PATCH 303/348] (chore): dedupe `read_params` usage --- src/anndata/_io/specs/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index a93ac61a9..315c7011d 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -314,7 +314,7 @@ def read_elem( ) raise ValueError(msg) has_extra_args = True - if "chunks" in inspect.signature(read_func).parameters: + if "chunks" in read_params: has_extra_args = True kwargs["chunks"] = chunks if has_extra_args: From a71dad895359fe33c7a0782b73fb790f47f5bd00 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 16 Oct 2024 16:03:41 +0200 Subject: [PATCH 304/348] (chore): `**kwargs` usage doesn't affect call when empty --- src/anndata/_io/specs/registry.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 315c7011d..d6c8d734d 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -305,7 +305,6 @@ def read_elem( msg = "Dask reading does not use a callback. Ignoring callback." warnings.warn(msg, stacklevel=2) read_params = inspect.signature(read_func).parameters - has_extra_args = False for kwarg in kwargs: if kwarg not in read_params: msg = ( @@ -313,13 +312,9 @@ def read_elem( "registered read function." 
) raise ValueError(msg) - has_extra_args = True if "chunks" in read_params: - has_extra_args = True kwargs["chunks"] = chunks - if has_extra_args: - return read_func(elem, **kwargs) - return read_func(elem) + return read_func(elem, **kwargs) class Writer: From 0ac61c6fcdeeb2dd33b7205f4933cd38858a514d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 16 Oct 2024 16:13:41 +0200 Subject: [PATCH 305/348] (fix): clean up `_lazy_arrays.py` typing --- .../experimental/backed/_lazy_arrays.py | 35 ++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index a99f55824..ce85f1056 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -14,6 +14,8 @@ from ._compat import xarray as xr if TYPE_CHECKING: + from typing import Literal + from anndata._core.index import Index @@ -41,10 +43,14 @@ def __getitem__(self, key): ) -class CategoricalArray(BackendArray): +class CategoricalArray(BackendArray, Generic[K]): + _codes: ZarrOrHDF5Wrapper[K] + _categories: ZarrArray | H5Array + shape: tuple[int, ...] + def __init__( self, - codes: ZarrArray | H5Array, + codes: K, categories: ZarrArray | H5Array, ordered: bool, *args, @@ -52,8 +58,7 @@ def __init__( ): self._categories = categories self._ordered = ordered - self._categories_cache = None - self._codes = ZarrOrHDF5Wrapper[type(codes)](codes) + self._codes = ZarrOrHDF5Wrapper(codes) self.shape = self._codes.shape @cached_property @@ -80,11 +85,16 @@ def dtype(self): return pd.CategoricalDtype(categories=self._categories, ordered=self._ordered) -class MaskedArray(BackendArray): +class MaskedArray(BackendArray, Generic[K]): + _mask: ZarrOrHDF5Wrapper[K] + _values: ZarrOrHDF5Wrapper[K] + _dtype_str: Literal["nullable-integer", "nullable-boolean"] + shape: tuple[int, ...] 
+
     def __init__(
         self,
         values: ZarrArray | H5Array,
-        dtype_str: str,
+        dtype_str: Literal["nullable-integer", "nullable-boolean"],
         mask: ZarrArray | H5Array | None = None,
     ):
         self._mask = ZarrOrHDF5Wrapper(mask)
@@ -98,13 +108,12 @@ def __getitem__(self, key) -> xr.core.extension_array.PandasExtensionArray:
         mask = self._mask[key]
         if self._dtype_str == "nullable-integer":
             # numpy does not support nan ints
-            return xr.core.extension_array.PandasExtensionArray(
-                pd.arrays.IntegerArray(values, mask=mask)
-            )
+            extension_array = pd.arrays.IntegerArray(values, mask=mask)
         elif self._dtype_str == "nullable-boolean":
-            return xr.core.extension_array.PandasExtensionArray(
-                pd.arrays.BooleanArray(values, mask=mask)
-            )
+            extension_array = pd.arrays.BooleanArray(values, mask=mask)
+        else:
+            raise ValueError(f"Invalid dtype_str {self._dtype_str}")
+        return xr.core.extension_array.PandasExtensionArray(extension_array)
     return xr.core.extension_array.PandasExtensionArray(pd.array(values))

     @cached_property
@@ -116,6 +125,8 @@ def dtype(self):
             ).dtype
         elif self._dtype_str == "nullable-boolean":
             return pd.BooleanDtype()
+        else:
+            raise ValueError(f"Invalid dtype_str {self._dtype_str}")


 @_subset.register(DataArray)

From af5c2feed6762c4b6a4e2c42ae78d644200cb566 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 16 Oct 2024 16:13:52 +0200
Subject: [PATCH 306/348] (fix): no assert, raise ValueError

---
 src/anndata/experimental/backed/_xarray.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py
index 4aa9adbdb..b13d6ef87 100644
--- a/src/anndata/experimental/backed/_xarray.py
+++ b/src/anndata/experimental/backed/_xarray.py
@@ -27,9 +27,9 @@ def __repr__(self) -> str:


 def get_index_dim(ds: xr.DataArray) -> Hashable:
-    assert (
-        len(ds.sizes) == 1
-    ), f"xarray Dataset should not have more than 1 dims, found {len(ds)}"
+    if len(ds.sizes) != 1:
+        msg = f"xarray Dataset should not have more than 1 dim, found {len(ds.sizes)}"
+        raise ValueError(msg)
     return list(ds.indexes.keys())[0]

From edac279bdbce23d28156a562a26abe7362ebec31 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 16 Oct 2024 17:06:47 +0200
Subject: [PATCH 307/348] (fix): use `get` instead of membership check + no
 in-place

---
 src/anndata/experimental/backed/_xarray.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py
index b13d6ef87..e7394894a 100644
--- a/src/anndata/experimental/backed/_xarray.py
+++ b/src/anndata/experimental/backed/_xarray.py
@@ -128,9 +128,8 @@ def _remove_unused_categories_xr(
 @to_memory.register(Dataset2D)
 def to_memory(ds: Dataset2D, copy=False):
     df = ds.to_dataframe()
-    if "indexing_key" in ds.attrs:
-        index_key = ds.attrs["indexing_key"]
-        if df.index.name != index_key:
-            df.set_index(index_key, inplace=True)
+    index_key = ds.attrs.get("indexing_key", None)
+    if df.index.name != index_key and index_key is not None:
+        df = df.set_index(index_key)
     df.index.name = None  # matches old AnnData object
     return df

From cdd9b89ad5b8d6c6ad2bead2cf0224086fed0abc Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 16 Oct 2024 17:08:25 +0200
Subject: [PATCH 308/348] (chore): pytest mark to beginning + `dskfmt` ->
 `diskfmt` + thread safety using `worker_id`

---
 tests/test_read_lazy.py | 79 +++++++++++++++++++++++------------------
 1 file changed, 45 insertions(+), 34 deletions(-)

diff --git a/tests/test_read_lazy.py 
b/tests/test_read_lazy.py index bb787a7c4..da2dcd7aa 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -22,7 +22,12 @@ ) if TYPE_CHECKING: - from typing import Callable, Literal + from pathlib import Path + from typing import Literal + +pytestmark = pytest.mark.skipif( + not find_spec("xarray"), reason="Xarray is not installed" +) @pytest.fixture( @@ -35,7 +40,7 @@ def mtx_format(request): @pytest.fixture(params=["zarr", "h5ad"], scope="session") -def dskfmt(request): +def diskfmt(request): return request.param @@ -52,27 +57,38 @@ def join(request): # TODO: why does `read_lazy().to_memory()` cause `Dataset2D.to_memory()` to lose index name in # multi-threaded tests when only opened once i.e., without this Callable? @pytest.fixture(scope="session") -def adata_remote_orig( - tmp_path_factory, dskfmt: str, mtx_format, load_annotation_index: bool -) -> tuple[Callable[[], AnnData], AnnData]: +def adata_remote_orig_with_path( + tmp_path_factory, + diskfmt: str, + mtx_format, + load_annotation_index: bool, + worker_id: str = "serial", +) -> tuple[AnnData, AnnData]: """Create remote fixtures, one without a range index and the other with""" - if dskfmt == "h5ad": - orig_path = tmp_path_factory.mktemp("h5ad_file_dir") / f"orig.{dskfmt}" + file_name = f"orig_{worker_id}.{diskfmt}" + if diskfmt == "h5ad": + orig_path = tmp_path_factory.mktemp("h5ad_file_dir") / file_name else: - orig_path = tmp_path_factory.mktemp(f"orig.{dskfmt}") + orig_path = tmp_path_factory.mktemp(file_name) orig = gen_adata((1000, 1100), mtx_format) - orig.raw = gen_adata((1000, 1100), mtx_format) - getattr(orig, f"write_{dskfmt}")(orig_path) - return lambda: read_lazy( - orig_path, load_annotation_index=load_annotation_index - ), orig + orig.raw = orig.copy() + getattr(orig, f"write_{diskfmt}")(orig_path) + return orig_path, orig @pytest.fixture -def adata_remote_with_store_tall_skinny( - tmp_path_factory, mtx_format -) -> tuple[Callable[[], AnnData], AccessTrackingStore]: - orig_path = tmp_path_factory.mktemp("orig.zarr") +def adata_remote_orig(adata_remote_orig_with_path, load_annotation_index): + orig_path, orig = adata_remote_orig_with_path + return read_lazy(orig_path, load_annotation_index=load_annotation_index), orig + + +@pytest.fixture(scope="session") +def adata_remote_with_store_tall_skinny_path( + tmp_path_factory, + mtx_format, + worker_id: str = "serial", +) -> Path: + orig_path = tmp_path_factory.mktemp(f"orig_{worker_id}.zarr") M = 100_000 # forces zarr to chunk `obs` columns multiple ways - that way 1 access to `int64` below is actually only one access N = 5 obs_names = pd.Index(f"cell{i}" for i in range(M)) @@ -86,18 +102,18 @@ def adata_remote_with_store_tall_skinny( ) orig.raw = orig.copy() orig.write_zarr(orig_path) - store = AccessTrackingStore(orig_path) - return lambda: read_lazy(store), store + return orig_path -pytestmark = pytest.mark.skipif( - not find_spec("xarray"), reason="Xarray is not installed" -) +@pytest.fixture +def adata_remote_with_store_tall_skinny(adata_remote_with_store_tall_skinny_path): + store = AccessTrackingStore(adata_remote_with_store_tall_skinny_path) + remote = read_lazy(store) + return remote, store def test_access_count_obs_var(adata_remote_with_store_tall_skinny): - remote_generator, store = adata_remote_with_store_tall_skinny - remote = remote_generator() + remote, store = adata_remote_with_store_tall_skinny store.initialize_key_trackers( ["obs/cat/codes", "obs/cat/categories", "obs/int64", "var/int64", "X", "raw"] ) @@ -144,8 +160,7 @@ def 
test_access_count_index(adata_remote_with_store_tall_skinny):


 def test_access_count_dtype(adata_remote_with_store_tall_skinny):
-    remote_generator, store = adata_remote_with_store_tall_skinny
-    remote = remote_generator()
+    remote, store = adata_remote_with_store_tall_skinny
     store.initialize_key_trackers(["obs/cat/categories"])
     store.assert_access_count("obs/cat/categories", 0)
     # This should only cause categories to be read in once
@@ -156,16 +171,14 @@ def test_access_count_dtype(adata_remote_with_store_tall_skinny):


 def test_to_memory(adata_remote_orig):
-    remote_generator, orig = adata_remote_orig
-    remote = remote_generator()
+    remote, orig = adata_remote_orig
     assert isinstance(remote.uns["nested"]["nested_further"]["array"], DaskArray)
     remote_to_memory = remote.to_memory()
     assert_equal(remote_to_memory, orig)


 def test_view_to_memory(adata_remote_orig):
-    remote_generator, orig = adata_remote_orig
-    remote = remote_generator()
+    remote, orig = adata_remote_orig
     subset_obs = orig.obs["obs_cat"] == "a"
     assert_equal(orig[subset_obs, :], remote[subset_obs, :].to_memory())

@@ -174,8 +187,7 @@ def test_view_of_view_to_memory(adata_remote_orig):
-    remote_generator, orig = adata_remote_orig
-    remote = remote_generator()
+    remote, orig = adata_remote_orig
     subset_obs = (orig.obs["obs_cat"] == "a") | (orig.obs["obs_cat"] == "b")
     subsetted_adata = orig[subset_obs, :]
     subset_subset_obs = subsetted_adata.obs["obs_cat"] == "b"
@@ -335,8 +347,7 @@ def test_concat_full_and_subsets(adata_remote_orig, join, index, load_annotation_index):
     from anndata.experimental.backed._compat import Dataset2D

-    remote_generator, orig = adata_remote_orig
-    remote = remote_generator()
+    remote, orig = adata_remote_orig

     @contextmanager
     def empty_context():

From e07426aef96b786809bcae18d61ce1a7c9013655 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 16 Oct 2024 17:14:29 +0200
Subject: [PATCH 309/348] (fix): remove resetting key trackers

---
 tests/test_read_lazy.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py
index da2dcd7aa..6dc594356 100644
--- a/tests/test_read_lazy.py
+++ b/tests/test_read_lazy.py
@@ -150,7 +150,6 @@ def test_access_count_obs_var(adata_remote_with_store_tall_skinny):

 def test_access_count_index(adata_remote_with_store_tall_skinny):
     _, store = adata_remote_with_store_tall_skinny
-    store.reset_key_trackers()
     store.initialize_key_trackers(["obs/_index"])
     read_lazy(store, load_annotation_index=False)
     store.assert_access_count("obs/_index", 0)

From 8278e0f71ed2a5602346b833a60a69a96a222b90 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 16 Oct 2024 17:15:29 +0200
Subject: [PATCH 310/348] (fix): adata only has 4 chunks in test, update
 comment

---
 tests/test_read_lazy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py
index 6dc594356..74b2ff3fe 100644
--- a/tests/test_read_lazy.py
+++ b/tests/test_read_lazy.py
@@ -154,7 +154,7 @@ def test_access_count_index(adata_remote_with_store_tall_skinny):
     read_lazy(store, load_annotation_index=False)
     store.assert_access_count("obs/_index", 0)
     read_lazy(store)
-    # 8 is number of chunks
+    # 4 is number of chunks
     store.assert_access_count("obs/_index", 4)

From 30d1bb12a310a8406ca76409c9ed8cc4370229fa Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 16 Oct 2024 17:48:06 +0200
Subject: [PATCH 311/348] (chore): better use of arrange-act-assert

---
src/anndata/experimental/backed/_io.py | 33 ++++++----- tests/test_read_lazy.py | 78 ++++++++++++++++---------- 2 files changed, 66 insertions(+), 45 deletions(-) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index 26619fdfc..e623f9818 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -22,6 +22,19 @@ from ...compat import ZarrGroup +ANNDATA_ELEMS = [ + "obs", + "var", + "obsm", + "varm", + "obsp", + "varp", + "layers", + "X", + "raw", + "uns", +] + def read_lazy( store: str | Path | MutableMapping | ZarrGroup | h5py.Dataset, @@ -97,9 +110,9 @@ def read_lazy( raise ImportError( "xarray is required to use the `read_lazy` function. Please install xarray." ) - is_h5_store = isinstance(store, (h5py.Dataset, h5py.File)) + is_h5_store = isinstance(store, h5py.Dataset | h5py.File) is_h5 = ( - isinstance(store, (Path, str)) and Path(store).suffix == ".h5ad" + isinstance(store, Path | str) and Path(store).suffix == ".h5ad" ) or is_h5_store has_keys = True # true if consolidated or h5ad @@ -121,20 +134,10 @@ def read_lazy( def callback(func: Read, /, elem_name: str, elem: StorageType, *, iospec: IOSpec): if iospec.encoding_type in {"anndata", "raw"} or elem_name.endswith("/"): - cols = [ - "obs", - "var", - "obsm", - "varm", - "obsp", - "varp", - "layers", - "X", - "raw", - "uns", - ] iter_object = ( - elem.items() if has_keys else [(k, elem[k]) for k in cols if k in elem] + elem.items() + if has_keys + else [(k, elem[k]) for k in ANNDATA_ELEMS if k in elem] ) return AnnData(**{k: read_dispatched(v, callback) for k, v in iter_object}) elif ( diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index 74b2ff3fe..8e814cb0b 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -13,6 +13,7 @@ from anndata import AnnData from anndata.compat import DaskArray from anndata.experimental import read_lazy +from anndata.experimental.backed._io import ANNDATA_ELEMS from anndata.tests.helpers import ( AccessTrackingStore, as_dense_dask_array, @@ -112,40 +113,53 @@ def adata_remote_with_store_tall_skinny(adata_remote_with_store_tall_skinny_path return remote, store -def test_access_count_obs_var(adata_remote_with_store_tall_skinny): +@pytest.mark.parametrize( + ("elem_key", "sub_key"), + [ + ("raw", "X"), + ("obs", "cat"), + ("obs", "int64"), + *((elem_name, None) for elem_name in ANNDATA_ELEMS), + ], +) +@pytest.mark.parametrize( + ("subset_func"), + [ + pytest.param(lambda x: x, id="full"), + pytest.param(lambda x: x[0:10, :], id="subset"), + ], +) +def test_access_count_elem_access( + adata_remote_with_store_tall_skinny, elem_key, sub_key, subset_func +): remote, store = adata_remote_with_store_tall_skinny - store.initialize_key_trackers( - ["obs/cat/codes", "obs/cat/categories", "obs/int64", "var/int64", "X", "raw"] - ) + full_path = f"{elem_key}/{sub_key}" if sub_key is not None else elem_key + store.initialize_key_trackers({full_path, "X"}) # a series of methods that should __not__ read in any data - remote.X # the initial (non-subset) access to `X` should not read in data - remote.shape - remote.var - remote.obs - remote.raw - remote.raw.var - remote.raw.X - remote.obs["int64"] - remote.obs["int64"] - remote.obs["cat"] - store.assert_access_count("obs/int64", 0) - store.assert_access_count("obs/cat/categories", 0) - subset = remote[remote.obs["cat"] == "a", :] - subset.obs["int64"] - sub_subset = subset[0:10, :] - sub_subset.obs["int64"] - sub_subset.var["int64"] + elem = 
getattr(subset_func(remote), elem_key)
+    if sub_key is not None:
+        getattr(elem, sub_key)
+    store.assert_access_count(full_path, 0)
     store.assert_access_count("X", 0)
-    store.assert_access_count("obs/int64", 0)
-    store.assert_access_count("var/int64", 0)
-    # all codes read in for subset (from 4 chunks)
+
+
+def test_access_count_subset(adata_remote_with_store_tall_skinny):
+    remote, store = adata_remote_with_store_tall_skinny
+    non_obs_elem_names = filter(lambda e: e != "obs", ANNDATA_ELEMS)
+    store.initialize_key_trackers(["obs/cat/codes", *non_obs_elem_names])
+    remote[remote.obs["cat"] == "a", :]
+    # all codes read in for subset (from 1 chunk)
     store.assert_access_count("obs/cat/codes", 1)
-    # only one chunk needed for 0:10 subset
-    remote[0:10, :].obs["int64"].compute()
+    for elem_name in non_obs_elem_names:
+        store.assert_access_count(elem_name, 0)
+
+
+def test_access_count_subset_column_compute(adata_remote_with_store_tall_skinny):
+    remote, store = adata_remote_with_store_tall_skinny
+    store.initialize_key_trackers(["obs/int64"])
+    remote[remote.shape[0] // 2, :].obs["int64"].compute()
+    # only one chunk needed for a single-row subset
     store.assert_access_count("obs/int64", 1)
-    # .zmetadata handles .zarray so simple access does not cause any read
-    store.assert_access_count("var/int64", 0)
-    store.assert_access_count("raw", 0)


 def test_access_count_index(adata_remote_with_store_tall_skinny):
@@ -169,9 +183,13 @@ def test_access_count_dtype(adata_remote_with_store_tall_skinny):
     store.assert_access_count("obs/cat/categories", 1)


+def test_uns_uses_dask(adata_remote_orig):
+    remote, _ = adata_remote_orig
+    assert isinstance(remote.uns["nested"]["nested_further"]["array"], DaskArray)
+
+
 def test_to_memory(adata_remote_orig):
     remote, orig = adata_remote_orig
-    assert isinstance(remote.uns["nested"]["nested_further"]["array"], DaskArray)
     remote_to_memory = remote.to_memory()
     assert_equal(remote_to_memory, orig)

From 509af7f372446411186b7a8b158c37673ccbe449 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 16 Oct 2024 17:49:15 +0200
Subject: [PATCH 312/348] (chore): ids for boolean params

---
 tests/test_read_lazy.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py
index 8e814cb0b..35a803a17 100644
--- a/tests/test_read_lazy.py
+++ b/tests/test_read_lazy.py
@@ -248,7 +248,9 @@ def correct_extension_dtype_differences(remote: pd.DataFrame, memory: pd.DataFra


 @pytest.mark.parametrize("join", ["outer", "inner"])
-@pytest.mark.parametrize("are_vars_different", [True, False])
+@pytest.mark.parametrize(
+    "are_vars_different", [True, False], ids=["vars_different", "vars_same"]
+)
 def test_concat_access_count(
     tmp_path, join: Literal["outer", "inner"], are_vars_different: bool
 ):

From 58122a1de35d3754ad62f87d63490367c7c217e7 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 16 Oct 2024 17:50:38 +0200
Subject: [PATCH 313/348] (chore): contextlib + better assert objects

---
 tests/test_read_lazy.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py
index 35a803a17..28b9fa7d9 100644
--- a/tests/test_read_lazy.py
+++ b/tests/test_read_lazy.py
@@ -1,6 +1,6 @@
 from __future__ import annotations

-from contextlib import contextmanager
+from contextlib import nullcontext
 from importlib.util import find_spec
 from typing import TYPE_CHECKING

@@ -368,14 +368,10 @@ def test_concat_full_and_subsets(adata_remote_orig, join, index, load_annotation
     remote, orig = 
adata_remote_orig - @contextmanager - def empty_context(): - yield - maybe_warning_context = ( pytest.warns(UserWarning, match=r"Concatenating with a pandas numeric") if not load_annotation_index - else empty_context() + else nullcontext() ) with maybe_warning_context: remote_concatenated = ad.concat([remote, remote], join=join) @@ -400,4 +396,7 @@ def empty_context(): ) assert_equal(corrected_remote_obs, corrected_memory_obs) assert_equal(in_memory_remote_concatenated.X, orig_concatenated.X) - assert all(in_memory_remote_concatenated.var_names == orig_concatenated.var_names) + assert ( + in_memory_remote_concatenated.var_names.tolist() + == orig_concatenated.var_names.tolist() + ) From e6fea741a83d507a90367c490de599508f70d718 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 16 Oct 2024 19:06:45 +0200 Subject: [PATCH 314/348] (chore): refactor concatenation for arrange-act-assert --- src/anndata/_types.py | 16 +- src/anndata/experimental/backed/_io.py | 21 +- tests/test_read_lazy.py | 317 ++++++++++++++++++------- 3 files changed, 249 insertions(+), 105 deletions(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index d81f6aaf4..0e2b8f472 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Protocol, TypeVar +from typing import TYPE_CHECKING, Literal, Protocol, TypeVar from .compat import ( H5Array, @@ -188,3 +188,17 @@ def __call__( Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. """ ... + + +ANNDATA_ELEMS = Literal[ + "obs", + "var", + "obsm", + "varm", + "obsp", + "varp", + "layers", + "X", + "raw", + "uns", +] diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index e623f9818..d11e3e9bd 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -1,14 +1,14 @@ from __future__ import annotations +import typing import warnings from pathlib import Path -from typing import ( - TYPE_CHECKING, -) +from typing import TYPE_CHECKING import h5py from anndata._io.specs.registry import read_elem_lazy +from anndata._types import ANNDATA_ELEMS from ..._core.anndata import AnnData from ..._settings import settings @@ -22,19 +22,6 @@ from ...compat import ZarrGroup -ANNDATA_ELEMS = [ - "obs", - "var", - "obsm", - "varm", - "obsp", - "varp", - "layers", - "X", - "raw", - "uns", -] - def read_lazy( store: str | Path | MutableMapping | ZarrGroup | h5py.Dataset, @@ -137,7 +124,7 @@ def callback(func: Read, /, elem_name: str, elem: StorageType, *, iospec: IOSpec iter_object = ( elem.items() if has_keys - else [(k, elem[k]) for k in ANNDATA_ELEMS if k in elem] + else [(k, elem[k]) for k in typing.get_args(ANNDATA_ELEMS) if k in elem] ) return AnnData(**{k: read_dispatched(v, callback) for k, v in iter_object}) elif ( diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index 28b9fa7d9..ef016a48b 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -1,5 +1,6 @@ from __future__ import annotations +import typing from contextlib import nullcontext from importlib.util import find_spec from typing import TYPE_CHECKING @@ -11,9 +12,9 @@ import anndata as ad from anndata import AnnData +from anndata._types import ANNDATA_ELEMS from anndata.compat import DaskArray from anndata.experimental import read_lazy -from anndata.experimental.backed._io import ANNDATA_ELEMS from anndata.tests.helpers import ( AccessTrackingStore, as_dense_dask_array, @@ 
-23,13 +24,17 @@ ) if TYPE_CHECKING: + from collections.abc import Callable, Generator from pathlib import Path from typing import Literal + from numpy.typing import NDArray pytestmark = pytest.mark.skipif( not find_spec("xarray"), reason="Xarray is not installed" ) +ANNDATA_ELEMS_LIST = typing.get_args(ANNDATA_ELEMS) + @pytest.fixture( params=[sparse.csr_matrix, sparse.csc_matrix, np.array, as_dense_dask_array], @@ -40,6 +45,13 @@ def mtx_format(request): return request.param +@pytest.fixture( + params=[True, False], ids=["vars_different", "vars_same"], scope="session" +) +def are_vars_different(request): + return request.param + + @pytest.fixture(params=["zarr", "h5ad"], scope="session") def diskfmt(request): return request.param @@ -55,6 +67,17 @@ def join(request): return request.param +@pytest.fixture( + params=[ + pytest.param(lambda x: x, id="full"), + pytest.param(lambda x: x[0:10, :], id="subset"), + ], + scope="session", +) +def simple_subset_func(request): + return request.param + + # TODO: why does `read_lazy().to_memory()` cause `Dataset2D.to_memory()` to lose index name in # multi-threaded tests when only opened once i.e., without this Callable? @pytest.fixture(scope="session") @@ -62,7 +85,6 @@ def adata_remote_orig_with_path( tmp_path_factory, diskfmt: str, mtx_format, - load_annotation_index: bool, worker_id: str = "serial", ) -> tuple[AnnData, AnnData]: """Create remote fixtures, one without a range index and the other with""" @@ -78,7 +100,9 @@ def adata_remote_orig_with_path( @pytest.fixture -def adata_remote_orig(adata_remote_orig_with_path, load_annotation_index): +def adata_remote_orig( + adata_remote_orig_with_path: tuple[Path, AnnData], load_annotation_index: bool +) -> tuple[AnnData, AnnData]: orig_path, orig = adata_remote_orig_with_path return read_lazy(orig_path, load_annotation_index=load_annotation_index), orig @@ -106,46 +130,107 @@ def adata_remote_with_store_tall_skinny_path( return orig_path +@pytest.fixture(scope="session") +def adatas_paths_var_indices_for_concatenation( + tmp_path_factory, are_vars_different: bool, worker_id: str = "serial" +) -> tuple[list[AnnData], list[Path], list[pd.Index]]: + adatas = [] + var_indices = [] + paths = [] + M = 1000 + N = 50 + n_datasets = 3 + for dataset_index in range(n_datasets): + orig_path = tmp_path_factory.mktemp(f"orig_{worker_id}_{dataset_index}.zarr") + paths.append(orig_path) + obs_names = pd.Index(f"cell_{dataset_index}_{i}" for i in range(M)) + var_names = pd.Index( + f"gene_{i}{f'_{dataset_index}_ds' if are_vars_different and (i % 2) else ''}" + for i in range(N) + ) + var_indices.append(var_names) + obs = gen_typed_df(M, obs_names) + var = gen_typed_df(N, var_names) + orig = AnnData( + obs=obs, + var=var, + X=np.random.binomial(100, 0.005, (M, N)).astype(np.float32), + ) + orig.write_zarr(orig_path) + adatas.append(orig) + return adatas, paths, var_indices + + @pytest.fixture -def adata_remote_with_store_tall_skinny(adata_remote_with_store_tall_skinny_path): +def concatenation_objects( + adatas_paths_var_indices_for_concatenation, +) -> tuple[list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData]]: + adatas, paths, var_indices = adatas_paths_var_indices_for_concatenation + stores = [AccessTrackingStore(path) for path in paths] + lazys = [read_lazy(store) for store in stores] + return adatas, var_indices, stores, lazys + + +@pytest.fixture +def adata_remote_with_store_tall_skinny( + adata_remote_with_store_tall_skinny_path: Path, +) -> tuple[AnnData, AccessTrackingStore]: store = 
AccessTrackingStore(adata_remote_with_store_tall_skinny_path) remote = read_lazy(store) return remote, store +def get_key_trackers_for_columns_on_axis( + adata: AnnData, axis: Literal["obs", "var"] +) -> Generator[str, None, None]: + """Generate keys for tracking, using `codes` from categorical columns instead of the column name + + Parameters + ---------- + adata + Object to get keys from + axis + Axis to get keys from + + Yields + ------ + Keys for tracking + """ + for col in getattr(adata, axis).columns: + yield f"{axis}/{col}" if "cat" not in col else f"{axis}/{col}/codes" + + @pytest.mark.parametrize( ("elem_key", "sub_key"), [ ("raw", "X"), ("obs", "cat"), ("obs", "int64"), - *((elem_name, None) for elem_name in ANNDATA_ELEMS), - ], -) -@pytest.mark.parametrize( - ("subset_func"), - [ - pytest.param(lambda x: x, id="full"), - pytest.param(lambda x: x[0:10, :], id="subset"), + *((elem_name, None) for elem_name in ANNDATA_ELEMS_LIST), ], ) def test_access_count_elem_access( - adata_remote_with_store_tall_skinny, elem_key, sub_key, subset_func + adata_remote_with_store_tall_skinny: tuple[AnnData, AccessTrackingStore], + elem_key: ANNDATA_ELEMS, + sub_key: str, + simple_subset_func: Callable[[AnnData], AnnData], ): remote, store = adata_remote_with_store_tall_skinny full_path = f"{elem_key}/{sub_key}" if sub_key is not None else elem_key store.initialize_key_trackers({full_path, "X"}) # a series of methods that should __not__ read in any data - elem = getattr(subset_func(remote), elem_key) + elem = getattr(simple_subset_func(remote), elem_key) if sub_key is not None: getattr(elem, sub_key) store.assert_access_count(full_path, 0) store.assert_access_count("X", 0) -def test_access_count_subset(adata_remote_with_store_tall_skinny): +def test_access_count_subset( + adata_remote_with_store_tall_skinny: tuple[AnnData, AccessTrackingStore], +): remote, store = adata_remote_with_store_tall_skinny - non_obs_elem_names = filter(lambda e: e != "obs", ANNDATA_ELEMS) + non_obs_elem_names = filter(lambda e: e != "obs", ANNDATA_ELEMS_LIST) store.initialize_key_trackers(["obs/cat/codes", *non_obs_elem_names]) remote[remote.obs["cat"] == "a", :] # all codes read in for subset (from 1 chunk) @@ -154,7 +239,9 @@ def test_access_count_subset(adata_remote_with_store_tall_skinny): store.assert_access_count(elem_name, 0) -def test_access_count_subset_column_compute(adata_remote_with_store_tall_skinny): +def test_access_count_subset_column_compute( + adata_remote_with_store_tall_skinny: tuple[AnnData, AccessTrackingStore], +): remote, store = adata_remote_with_store_tall_skinny store.initialize_key_trackers(["obs/int64"]) remote[remote.shape[0] // 2, :].obs["int64"].compute() @@ -162,7 +249,9 @@ def test_access_count_subset_column_compute(adata_remote_with_store_tall_skinny) store.assert_access_count("obs/int64", 1) -def test_access_count_index(adata_remote_with_store_tall_skinny): +def test_access_count_index( + adata_remote_with_store_tall_skinny: tuple[AnnData, AccessTrackingStore], +): _, store = adata_remote_with_store_tall_skinny store.initialize_key_trackers(["obs/_index"]) read_lazy(store, load_annotation_index=False) @@ -172,7 +261,9 @@ def test_access_count_index(adata_remote_with_store_tall_skinny): store.assert_access_count("obs/_index", 4) -def test_access_count_dtype(adata_remote_with_store_tall_skinny): +def test_access_count_dtype( + adata_remote_with_store_tall_skinny: tuple[AnnData, AccessTrackingStore], +): remote, store = adata_remote_with_store_tall_skinny 
store.initialize_key_trackers(["obs/cat/categories"]) store.assert_access_count("obs/cat/categories", 0) @@ -183,18 +274,18 @@ def test_access_count_dtype(adata_remote_with_store_tall_skinny): store.assert_access_count("obs/cat/categories", 1) -def test_uns_uses_dask(adata_remote_orig): +def test_uns_uses_dask(adata_remote_orig: tuple[AnnData, AnnData]): remote, _ = adata_remote_orig assert isinstance(remote.uns["nested"]["nested_further"]["array"], DaskArray) -def test_to_memory(adata_remote_orig): +def test_to_memory(adata_remote_orig: tuple[AnnData, AnnData]): remote, orig = adata_remote_orig remote_to_memory = remote.to_memory() assert_equal(remote_to_memory, orig) -def test_view_to_memory(adata_remote_orig): +def test_view_to_memory(adata_remote_orig: tuple[AnnData, AnnData]): remote, orig = adata_remote_orig subset_obs = orig.obs["obs_cat"] == "a" assert_equal(orig[subset_obs, :], remote[subset_obs, :].to_memory()) @@ -203,7 +294,7 @@ def test_view_to_memory(adata_remote_orig): assert_equal(orig[:, subset_var], remote[:, subset_var].to_memory()) -def test_view_of_view_to_memory(adata_remote_orig): +def test_view_of_view_to_memory(adata_remote_orig: tuple[AnnData, AnnData]): remote, orig = adata_remote_orig subset_obs = (orig.obs["obs_cat"] == "a") | (orig.obs["obs_cat"] == "b") subsetted_adata = orig[subset_obs, :] @@ -224,7 +315,7 @@ def test_view_of_view_to_memory(adata_remote_orig): ) -def test_unconsolidated(tmp_path, mtx_format): +def test_unconsolidated(tmp_path: Path, mtx_format): adata = gen_adata((1000, 1000), mtx_format) orig_pth = tmp_path / "orig.zarr" adata.write_zarr(orig_pth) @@ -247,69 +338,123 @@ def correct_extension_dtype_differences(remote: pd.DataFrame, memory: pd.DataFra return remote, memory +ANNDATA_ELEMS_LIST = typing.get_args(ANNDATA_ELEMS) + + @pytest.mark.parametrize("join", ["outer", "inner"]) @pytest.mark.parametrize( - "are_vars_different", [True, False], ids=["vars_different", "vars_same"] + ("elem_key", "sub_key"), + [ + ("obs", "cat"), + ("obs", "int64"), + *((elem_name, None) for elem_name in ANNDATA_ELEMS_LIST), + ], ) def test_concat_access_count( - tmp_path, join: Literal["outer", "inner"], are_vars_different: bool + concatenation_objects: tuple[ + list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData] + ], + join: Literal["outer", "inner"], + elem_key: ANNDATA_ELEMS, + sub_key: str, + simple_subset_func: Callable[[AnnData], AnnData], ): - lazy_adatas = [] - adatas = [] - stores: list[AccessTrackingStore] = [] - var_indices = [] - M = 1000 - N = 50 - n_datasets = 3 - for dataset_index in range(n_datasets): - orig_path = tmp_path / f"orig_{dataset_index}.zarr" - orig_path.mkdir() - obs_names = pd.Index(f"cell_{dataset_index}_{i}" for i in range(M)) - var_names = pd.Index( - f"gene_{i}{f'_{dataset_index}_ds' if are_vars_different and (i % 2) else ''}" - for i in range(N) - ) - var_indices.append(var_names) - obs = gen_typed_df(M, obs_names) - var = gen_typed_df(N, var_names) - orig = AnnData( - obs=obs, - var=var, - X=np.random.binomial(100, 0.005, (M, N)).astype(np.float32), - ) - orig.write_zarr(orig_path) - store = AccessTrackingStore(orig_path) - store.initialize_key_trackers(["obs/int64", "X", "var/int64"]) - lazy_adatas += [read_lazy(store)] - adatas += [orig] - stores += [store] + adatas, _, stores, lazy_adatas = concatenation_objects + # track all elems except codes because they must be read in for concatenation + non_categorical_columns = ( + f"{elem}/{col}" if "cat" not in col else f"{elem}/{col}/codes" + for elem in 
["obs", "var"] + for col in adatas[0].obs.columns + ) + non_obs_var_keys = filter(lambda e: e not in {"obs", "var"}, ANNDATA_ELEMS_LIST) + keys_to_track = [*non_categorical_columns, *non_obs_var_keys] + for store in stores: + store.initialize_key_trackers(keys_to_track) concated_remote = ad.concat(lazy_adatas, join=join) - for i in range(n_datasets): - stores[i].assert_access_count("obs/int64", 0) - stores[i].assert_access_count("X", 0) - stores[i].assert_access_count("var/int64", 0) - concatenated_memory = ad.concat(adatas, join=join) - # account for differences - - # name is lost normally, should fix + # a series of methods that should __not__ read in any data + elem = getattr(simple_subset_func(concated_remote), elem_key) + if sub_key is not None: + getattr(elem, sub_key) + for store in stores: + for elem in keys_to_track: + store.assert_access_count(elem, 0) + + +def test_concat_to_memory_obs_access_count( + concatenation_objects: tuple[ + list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData] + ], + join: Literal["outer", "inner"], + simple_subset_func: Callable[[AnnData], AnnData], +): + adatas, _, stores, lazy_adatas = concatenation_objects + concated_remote = simple_subset_func(ad.concat(lazy_adatas, join=join)) + concated_remote_subset = simple_subset_func(concated_remote) + n_datasets = len(adatas) + obs_keys_to_track = get_key_trackers_for_columns_on_axis(adatas[0], "obs") + for store in stores: + store.initialize_key_trackers(obs_keys_to_track) + concated_remote_subset.to_memory() + # check access count for the stores - only the first should be accessed when reading into memory + for col in obs_keys_to_track: + stores[0].assert_access_count(col, 1) + for i in range(1, n_datasets): + # if the shapes are the same, data was read in to bring the object into memory; otherwise, not + stores[i].assert_access_count( + col, concated_remote_subset.shape[0] == concated_remote.shape[0] + ) + + +def test_concat_to_memory_obs( + concatenation_objects: tuple[ + list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData] + ], + join: Literal["outer", "inner"], + simple_subset_func: Callable[[AnnData], AnnData], +): + adatas, _, _, lazy_adatas = concatenation_objects + concatenated_memory = simple_subset_func(ad.concat(adatas, join=join)) + concated_remote = simple_subset_func(ad.concat(lazy_adatas, join=join)) + # TODO: name is lost normally, should fix obs_memory = concatenated_memory.obs obs_memory.index.name = "obs_names" - assert_equal( *correct_extension_dtype_differences( - concated_remote[:M].obs.to_pandas(), concatenated_memory[:M].obs + concated_remote.obs.to_pandas(), concatenated_memory.obs ) ) - # check access count for the stores - only the first should be accessed - stores[0].assert_access_count("obs/int64", 1) - for i in range(1, n_datasets): - stores[i].assert_access_count("obs/int64", 0) - # subsetting should not read data into memory - concated_remote[:M].X - for i in range(n_datasets): - stores[i].assert_access_count("X", 0) +def test_concat_to_memory_obs_dtypes( + concatenation_objects: tuple[ + list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData] + ], + join: Literal["outer", "inner"], +): + _, _, _, lazy_adatas = concatenation_objects + concated_remote = ad.concat(lazy_adatas, join=join) + # check preservation of non-categorical dtypes on the concat axis + assert concated_remote.obs["int64"].dtype == "int64" + assert concated_remote.obs["uint8"].dtype == "uint8" + assert concated_remote.obs["nullable-int"].dtype == "int32" + 
assert concated_remote.obs["float64"].dtype == "float64" + assert concated_remote.obs["bool"].dtype == "bool" + assert concated_remote.obs["nullable-bool"].dtype == "bool" + + +def test_concat_to_memory_var( + concatenation_objects: tuple[ + list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData] + ], + join: Literal["outer", "inner"], + are_vars_different: bool, + simple_subset_func: Callable[[AnnData], AnnData], +): + adatas, var_indices, stores, lazy_adatas = concatenation_objects + concated_remote = simple_subset_func(ad.concat(lazy_adatas, join=join)) + var_keys_to_track = get_key_trackers_for_columns_on_axis(adatas[0], "var") + for store in stores: + store.initialize_key_trackers(var_keys_to_track) # check non-different variables, taken from first annotation. pd_index_overlapping = pd.Index( filter(lambda x: not x.endswith("ds"), var_indices[0]) @@ -331,11 +476,11 @@ def test_concat_access_count( if dtype in [np.float64, np.float32]: var_df[col] = var_df[col].astype(dtype) assert_equal(remote_df_corrected, var_df) - - stores[store_idx].assert_access_count("var/int64", 1) - for store in stores: - if store != stores[store_idx]: - store.assert_access_count("var/int64", 0) + for key in var_keys_to_track: + stores[store_idx].assert_access_count(key, 1) + for store in stores: + if store != stores[store_idx]: + store.assert_access_count(key, 0) stores[store_idx].reset_key_trackers() @@ -363,7 +508,12 @@ def test_concat_access_count( pytest.param(None, id="No index"), ], ) -def test_concat_full_and_subsets(adata_remote_orig, join, index, load_annotation_index): +def test_concat_full_and_subsets( + adata_remote_orig: tuple[AnnData, AnnData], + join, + index: slice | NDArray | Literal["a"] | None, + load_annotation_index: bool, +): from anndata.experimental.backed._compat import Dataset2D remote, orig = adata_remote_orig @@ -379,14 +529,6 @@ def test_concat_full_and_subsets(adata_remote_orig, join, index, load_annotation if np.isscalar(index) and index == "a": index = remote_concatenated.obs["obs_cat"] == "a" remote_concatenated = remote_concatenated[index] - assert isinstance(remote_concatenated.obs, Dataset2D) - # check preservation of non-categorical dtypes on the concat axis - assert remote_concatenated.obs["int64"].dtype == "int64" - assert remote_concatenated.obs["uint8"].dtype == "uint8" - assert remote_concatenated.obs["nullable-int"].dtype == "int32" - assert remote_concatenated.obs["float64"].dtype == "float64" - assert remote_concatenated.obs["bool"].dtype == "bool" - assert remote_concatenated.obs["nullable-bool"].dtype == "bool" orig_concatenated = ad.concat([orig, orig], join=join) if index is not None: orig_concatenated = orig_concatenated[index] @@ -394,6 +536,7 @@ def test_concat_full_and_subsets(adata_remote_orig, join, index, load_annotation corrected_remote_obs, corrected_memory_obs = correct_extension_dtype_differences( in_memory_remote_concatenated.obs, orig_concatenated.obs ) + assert isinstance(remote_concatenated.obs, Dataset2D) assert_equal(corrected_remote_obs, corrected_memory_obs) assert_equal(in_memory_remote_concatenated.X, orig_concatenated.X) assert ( From 62cda13066d6df2e355ac76dd018c35a5e1bbc1d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 16 Oct 2024 19:13:01 +0200 Subject: [PATCH 315/348] (fix): notebook submodule --- docs/tutorials/notebooks | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/notebooks b/docs/tutorials/notebooks index 9e186c5c6..0af6cf336 160000 --- a/docs/tutorials/notebooks +++ 
b/docs/tutorials/notebooks @@ -1 +1 @@ -Subproject commit 9e186c5c694793bb04ea1397721d154d6e0b7069 +Subproject commit 0af6cf3363aed1cafd317516c8393136ee6287ae From 4e1a1f6962676150c1e418539ef8cff428f5ce44 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 17 Oct 2024 10:00:51 +0200 Subject: [PATCH 316/348] (fix): use `find_spec` pattern --- src/anndata/experimental/backed/_compat.py | 26 +++++++++------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/src/anndata/experimental/backed/_compat.py b/src/anndata/experimental/backed/_compat.py index 634aa76c5..6c69cb051 100644 --- a/src/anndata/experimental/backed/_compat.py +++ b/src/anndata/experimental/backed/_compat.py @@ -1,33 +1,27 @@ from __future__ import annotations -try: +from importlib.util import find_spec +from typing import TYPE_CHECKING + +if find_spec("xarray") or TYPE_CHECKING: + import xarray from xarray import DataArray -except ImportError: + from xarray.backends import BackendArray + from xarray.backends.zarr import ZarrArrayWrapper + + +else: class DataArray: def __repr__(self) -> str: return "mock DataArray" - -try: - import xarray -except ImportError: xarray = None - -try: - from xarray.backends.zarr import ZarrArrayWrapper -except ImportError: - class ZarrArrayWrapper: def __repr__(self) -> str: return "mock ZarrArrayWrapper" - -try: - from xarray.backends import BackendArray -except ImportError: - class BackendArray: def __repr__(self) -> str: return "mock BackendArray" From a242dea890e0080772b353e81eb80dce5c850dc9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 17 Oct 2024 10:04:42 +0200 Subject: [PATCH 317/348] (chore): re-insert types for `AccessTrackingStore` --- src/anndata/tests/helpers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index 5a8df6460..bd9fd260c 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -1051,13 +1051,17 @@ def __init__(self, *_args, **_kwargs) -> None: class AccessTrackingStore(DirectoryStore): + _access_count: Counter[str] + _accessed_keys: dict[str, list[str]] + _accessed: dict[str, set] + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._access_count = Counter() self._accessed = defaultdict(set) self._accessed_keys = defaultdict(list) - def __getitem__(self, key: str): + def __getitem__(self, key: str) -> object: for tracked in self._access_count: if tracked in key: self._access_count[tracked] += 1 From 07caf932bad449ca231795092b2da021775d0f10 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 17 Oct 2024 10:06:04 +0200 Subject: [PATCH 318/348] (chore): dedent docstrings --- src/anndata/_types.py | 6 +++--- src/anndata/experimental/backed/_xarray.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 0e2b8f472..b0cde84c1 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -65,7 +65,7 @@ def __call__(self, elem: SCon) -> CovariantRWAble: The element to read from. Returns ------- - The element read from the store. + The element read from the store. """ ... @@ -84,7 +84,7 @@ def __call__( The chunk size to be used. Returns ------- - The lazy element read from the store. + The lazy element read from the store. """ ... @@ -152,7 +152,7 @@ def __call__( Returns ------- - The element read from the store. + The element read from the store. """ ... 
diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py
index e7394894a..3cb3f57cf 100644
--- a/src/anndata/experimental/backed/_xarray.py
+++ b/src/anndata/experimental/backed/_xarray.py
@@ -40,7 +40,7 @@ def index(self) -> pd.Index:

     Returns
     -------
-        The index of the of the dataframe as resolved from :attr:`~xarray.Dataset.coords`.
+    The index of the dataframe as resolved from :attr:`~xarray.Dataset.coords`.
     """
     coord = get_index_dim(self)
     return self.indexes[coord]
@@ -56,7 +56,7 @@ def shape(self) -> tuple[int, int]:

     Returns
     -------
-        The (2D) shape of the dataframe resolved from :attr:`~xarray.Dataset.sizes`.
+    The (2D) shape of the dataframe resolved from :attr:`~xarray.Dataset.sizes`.
     """
     return (self.sizes[get_index_dim(self)], len(self))
@@ -66,7 +66,7 @@ def iloc(self):

     Returns
     -------
-        Handler class for doing the iloc-style indexing using :meth:`~xarray.Dataset.isel`.
+    Handler class for doing the iloc-style indexing using :meth:`~xarray.Dataset.isel`.
     """

     class IlocGetter:
@@ -86,7 +86,7 @@ def columns(self) -> pd.Index:

     Returns
     -------
-        :class:`pandas.Index` that represents the "columns."
+    :class:`pandas.Index` that represents the "columns."
     """
     columns_list = list(self.keys())
     return pd.Index(columns_list)

From 8fd1fa0207061ac0ec0164a382ad758ebdd96fc5 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 17 Oct 2024 10:12:57 +0200
Subject: [PATCH 319/348] (chore): raise error if slots have changed on
 `ZarrOrHDF5Wrapper`

---
 src/anndata/experimental/backed/_lazy_arrays.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py
index ce85f1056..44b18168c 100644
--- a/src/anndata/experimental/backed/_lazy_arrays.py
+++ b/src/anndata/experimental/backed/_lazy_arrays.py
@@ -26,6 +26,10 @@ class ZarrOrHDF5Wrapper(ZarrArrayWrapper, Generic[K]):
     def __init__(self, array: K):
         if isinstance(array, ZarrArray):
             return super().__init__(array)
+        if set(self.__slots__) != {"dtype", "shape", "_array"}:
+            msg = ("Expected attributes of xarray ZarrArrayWrapper have changed - "
+                   "please file an issue with anndata and consider downgrading xarray")
+            raise ValueError(msg)
         self._array = array
         self.shape = self._array.shape
         self.dtype = self._array.dtype

From 94cf8eae85db8874c982a11c7404231fd9c40fa7 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 17 Oct 2024 10:51:23 +0200
Subject: [PATCH 320/348] (fix): add slots to please xarray

---
 src/anndata/experimental/backed/_xarray.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/anndata/experimental/backed/_xarray.py b/src/anndata/experimental/backed/_xarray.py
index 3cb3f57cf..12a1e7b3f 100644
--- a/src/anndata/experimental/backed/_xarray.py
+++ b/src/anndata/experimental/backed/_xarray.py
@@ -34,6 +34,8 @@ def get_index_dim(ds: xr.DataArray) -> Hashable:


 class Dataset2D(Dataset):
+    __slots__ = ()
+
     @property
     def index(self) -> pd.Index:
         """:attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.index` so this ensures usability

From 2c082bf3fd27b552f3ae3f8ae7a75a62ef95028e Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 17 Oct 2024 15:25:32 +0200
Subject: [PATCH 321/348] (chore): remove redefinition

---
 tests/test_concatenate.py | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/tests/test_concatenate.py b/tests/test_concatenate.py
index 10d4b6a42..8c143d2f6 100644
--- a/tests/test_concatenate.py
+++ b/tests/test_concatenate.py
@@ -1533,29 +1533,6 @@ def 
test_concat_different_types_dask(merge_strategy, array_type): assert_equal(result2, target2) -def test_concat_dask_sparse_matches_memory(join_type, merge_strategy): - import dask.array as da - from scipy import sparse - - import anndata as ad - - X = sparse.random(50, 20, density=0.5, format="csr") - X_dask = da.from_array(X, chunks=(5, 20)) - var_names_1 = [f"gene_{i}" for i in range(20)] - var_names_2 = [f"gene_{i}{'_foo' if (i%2) else ''}" for i in range(20, 40)] - - ad1 = ad.AnnData(X=X, var=pd.DataFrame(index=var_names_1)) - ad2 = ad.AnnData(X=X, var=pd.DataFrame(index=var_names_2)) - - ad1_dask = ad.AnnData(X=X_dask, var=pd.DataFrame(index=var_names_1)) - ad2_dask = ad.AnnData(X=X_dask, var=pd.DataFrame(index=var_names_2)) - - res_in_memory = ad.concat([ad1, ad2], join=join_type, merge=merge_strategy) - res_dask = ad.concat([ad1_dask, ad2_dask], join=join_type, merge=merge_strategy) - - assert_equal(res_in_memory, res_dask) - - def test_outer_concat_with_missing_value_for_df(): # https://github.com/scverse/anndata/issues/901 # TODO: Extend this test to cover all cases of missing values From 81c5fb9e2acd67acaebf1a77014ace55d51cc79b Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 18 Oct 2024 12:12:13 +0200 Subject: [PATCH 322/348] (refactor): reuse join type --- src/anndata/_core/merge.py | 3 +-- src/anndata/_types.py | 2 ++ tests/test_read_lazy.py | 13 ++++++++----- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index e4af0cf6f..a8359ab72 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -40,10 +40,9 @@ from pandas.api.extensions import ExtensionDtype + from anndata._types import Join_T from anndata.experimental.backed._compat import DataArray, Dataset2D - Join_T = Literal["inner", "outer"] - T = TypeVar("T") ################### diff --git a/src/anndata/_types.py b/src/anndata/_types.py index b0cde84c1..31e539118 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -202,3 +202,5 @@ def __call__( "raw", "uns", ] + +Join_T = Literal["inner", "outer"] diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index ef016a48b..ffc2f7604 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -29,6 +29,9 @@ from typing import Literal from numpy.typing import NDArray + + from anndata._types import Join_T + pytestmark = pytest.mark.skipif( not find_spec("xarray"), reason="Xarray is not installed" ) @@ -354,7 +357,7 @@ def test_concat_access_count( concatenation_objects: tuple[ list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData] ], - join: Literal["outer", "inner"], + join: Join_T, elem_key: ANNDATA_ELEMS, sub_key: str, simple_subset_func: Callable[[AnnData], AnnData], @@ -384,7 +387,7 @@ def test_concat_to_memory_obs_access_count( concatenation_objects: tuple[ list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData] ], - join: Literal["outer", "inner"], + join: Join_T, simple_subset_func: Callable[[AnnData], AnnData], ): adatas, _, stores, lazy_adatas = concatenation_objects @@ -409,7 +412,7 @@ def test_concat_to_memory_obs( concatenation_objects: tuple[ list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData] ], - join: Literal["outer", "inner"], + join: Join_T, simple_subset_func: Callable[[AnnData], AnnData], ): adatas, _, _, lazy_adatas = concatenation_objects @@ -429,7 +432,7 @@ def test_concat_to_memory_obs_dtypes( concatenation_objects: tuple[ list[AnnData], list[pd.Index], list[AccessTrackingStore], 
list[AnnData] ], - join: Literal["outer", "inner"], + join: Join_T, ): _, _, _, lazy_adatas = concatenation_objects concated_remote = ad.concat(lazy_adatas, join=join) @@ -446,7 +449,7 @@ def test_concat_to_memory_var( concatenation_objects: tuple[ list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData] ], - join: Literal["outer", "inner"], + join: Join_T, are_vars_different: bool, simple_subset_func: Callable[[AnnData], AnnData], ): From bb49dd29a1f4392ae0ba4dfec52428345a91f443 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 18 Oct 2024 13:14:59 +0200 Subject: [PATCH 323/348] (fix): mixed type dataframe merging --- src/anndata/_core/merge.py | 15 +++++++---- tests/test_read_lazy.py | 51 +++++++++++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index a8359ab72..4048c1560 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -18,6 +18,7 @@ from scipy import sparse from scipy.sparse import spmatrix +from anndata._core.file_backing import to_memory from anndata._warnings import ExperimentalFeatureWarning from ..compat import ( @@ -208,6 +209,8 @@ def equal_awkward(a, b) -> bool: def as_sparse(x, use_sparse_array=False): + if isinstance(x, DaskArray): + x = x.compute() if not isinstance(x, sparse.spmatrix | SpArray): if CAN_USE_SPARSE_ARRAY and use_sparse_array: return sparse.csr_array(x) @@ -774,11 +777,13 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): fill_value = default_fill_value(arrays) if any(isinstance(a, Dataset2D) for a in arrays): - if not all(isinstance(a, Dataset2D) for a in arrays): - raise NotImplementedError( - "Cannot concatenate a Dataset2D with other array types." - ) - return concat_dataset2d_on_annot_axis(arrays, join="outer") + if all(isinstance(a, Dataset2D | pd.DataFrame) for a in arrays): + arrays = [to_memory(a) if isinstance(a, Dataset2D) else a for a in arrays] + elif not all(isinstance(a, Dataset2D) for a in arrays): + msg = f"Cannot concatenate a Dataset2D with other array types {[type(a) for a in arrays if not isinstance(a, Dataset2D)]}." + raise ValueError(msg) + else: + return concat_dataset2d_on_annot_axis(arrays, join="outer") if any(isinstance(a, pd.DataFrame) for a in arrays): # TODO: This is hacky, 0 is a sentinel for outer_concat_aligned_mapping if not all( diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index ffc2f7604..0f1744ce6 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -12,6 +12,7 @@ import anndata as ad from anndata import AnnData +from anndata._core.file_backing import to_memory from anndata._types import ANNDATA_ELEMS from anndata.compat import DaskArray from anndata.experimental import read_lazy @@ -81,15 +82,13 @@ def simple_subset_func(request): return request.param -# TODO: why does `read_lazy().to_memory()` cause `Dataset2D.to_memory()` to lose index name in -# multi-threaded tests when only opened once i.e., without this Callable? 
@pytest.fixture(scope="session") def adata_remote_orig_with_path( tmp_path_factory, diskfmt: str, mtx_format, worker_id: str = "serial", -) -> tuple[AnnData, AnnData]: +) -> tuple[Path, AnnData]: """Create remote fixtures, one without a range index and the other with""" file_name = f"orig_{worker_id}.{diskfmt}" if diskfmt == "h5ad": @@ -513,7 +512,7 @@ def test_concat_to_memory_var( ) def test_concat_full_and_subsets( adata_remote_orig: tuple[AnnData, AnnData], - join, + join: Join_T, index: slice | NDArray | Literal["a"] | None, load_annotation_index: bool, ): @@ -546,3 +545,47 @@ def test_concat_full_and_subsets( in_memory_remote_concatenated.var_names.tolist() == orig_concatenated.var_names.tolist() ) + + +@pytest.mark.parametrize( + "elem_key", + map( + lambda x: pytest.param(x, id="-".join(map(str, x))), + [("obs", None), ("var", None), ("obsm", "df"), ("varm", "df")], + ), +) +def test_concat_df_ds_mixed_types( + adata_remote_orig: tuple[AnnData, AnnData], + load_annotation_index: bool, + join: Join_T, + elem_key: tuple[str, str | None], +): + def elem_to_memory(adata: AnnData, elem_key: tuple[str, str | None]): + parent_elem = getattr(adata, elem_key[0]) + if elem_key[1] is not None: + getattr(adata, elem_key[0])[elem_key[1]] = to_memory( + parent_elem[elem_key[1]] + ) + return adata + else: + setattr(adata, elem_key[0], to_memory(parent_elem)) + return adata + + if not load_annotation_index: + pytest.skip( + "Testing for mixed types is independent of the axis since the indices always have to match." + ) + remote, orig = adata_remote_orig + remote = elem_to_memory(remote, elem_key) + in_memory_concatenated = ad.concat([orig, orig], join=join) + mixed_concatenated = ad.concat([remote, orig], join=join) + assert_equal(mixed_concatenated, in_memory_concatenated) + + +def test_concat_bad_mixed_types(tmp_path: str): + orig = gen_adata((100, 200), np.array) + orig.write_zarr(tmp_path) + remote = read_lazy(tmp_path) + orig.obsm["df"] = orig.obsm["array"] + with pytest.raises(ValueError, match=r"Cannot concatenate a Dataset2D*"): + ad.concat([remote, orig], join="outer") From 942661f92f3b212b74ae0401cb7961d538a4cfa9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 21 Oct 2024 11:10:13 +0200 Subject: [PATCH 324/348] (fix): condition for going to memory in mixed typing --- src/anndata/_core/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 4048c1560..9e701184e 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -777,7 +777,7 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): fill_value = default_fill_value(arrays) if any(isinstance(a, Dataset2D) for a in arrays): - if all(isinstance(a, Dataset2D | pd.DataFrame) for a in arrays): + if any(isinstance(a, pd.DataFrame) for a in arrays): arrays = [to_memory(a) if isinstance(a, Dataset2D) else a for a in arrays] elif not all(isinstance(a, Dataset2D) for a in arrays): msg = f"Cannot concatenate a Dataset2D with other array types {[type(a) for a in arrays if not isinstance(a, Dataset2D)]}." 
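Taken together, patches 323 and 324 settle how mixed lazy/eager annotations concatenate: a `Dataset2D` may be combined with an in-memory `pd.DataFrame` (the lazy side is realized via `to_memory`), an all-`Dataset2D` input stays lazy, and any other mixture raises. A minimal usage sketch of that behaviour, based on the tests above (the zarr path is hypothetical; `gen_adata`, `read_lazy`, and `ad.concat` are the helpers and APIs these patches exercise):

    import anndata as ad
    import numpy as np
    from anndata.experimental import read_lazy
    from anndata.tests.helpers import gen_adata

    orig = gen_adata((100, 200), np.array)
    orig.write_zarr("example.zarr")  # hypothetical on-disk store
    remote = read_lazy("example.zarr")  # obs/var come back as lazy Dataset2D

    # Dataset2D mixed with in-memory pd.DataFrame annotations: the lazy side
    # is brought into memory and the concatenation succeeds.
    mixed = ad.concat([remote, orig], join="outer")

    # Dataset2D mixed with a non-dataframe array (an ndarray in obsm here)
    # is unsupported and raises the ValueError added in patch 323.
    orig.obsm["df"] = orig.obsm["array"]
    ad.concat([remote, orig], join="outer")  # raises ValueError

Note the condition from patch 324: a single eager dataframe among the inputs (`any(isinstance(a, pd.DataFrame) for a in arrays)`) routes everything through the in-memory pandas path, while only an all-`Dataset2D` input takes the lazy `concat_dataset2d_on_annot_axis` route.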
From 99219c6b94c9ccbc8a8ff3597156e2e7cc1cf7bd Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 21 Oct 2024 11:12:52 +0200 Subject: [PATCH 325/348] (refactor): mixed type helper function --- tests/test_read_lazy.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index 0f1744ce6..e3513e77e 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -560,23 +560,24 @@ def test_concat_df_ds_mixed_types( join: Join_T, elem_key: tuple[str, str | None], ): - def elem_to_memory(adata: AnnData, elem_key: tuple[str, str | None]): + def with_elem_in_memory( + adata: AnnData, elem_key: tuple[str, str | None] + ) -> AnnData: parent_elem = getattr(adata, elem_key[0]) if elem_key[1] is not None: getattr(adata, elem_key[0])[elem_key[1]] = to_memory( parent_elem[elem_key[1]] ) return adata - else: - setattr(adata, elem_key[0], to_memory(parent_elem)) - return adata + setattr(adata, elem_key[0], to_memory(parent_elem)) + return adata if not load_annotation_index: pytest.skip( "Testing for mixed types is independent of the axis since the indices always have to match." ) remote, orig = adata_remote_orig - remote = elem_to_memory(remote, elem_key) + remote = with_elem_in_memory(remote, elem_key) in_memory_concatenated = ad.concat([orig, orig], join=join) mixed_concatenated = ad.concat([remote, orig], join=join) assert_equal(mixed_concatenated, in_memory_concatenated) From 98197febdd80c3cb9d75c36f7f69bbcd3711fe57 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 21 Oct 2024 11:44:51 +0200 Subject: [PATCH 326/348] (fix): try linking to dask/awkward in docs build --- docs/conf.py | 4 ---- pyproject.toml | 3 ++- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 2da3d9027..5c485457f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -106,10 +106,6 @@ ("py:class", "anndata.compat.CupyArray"), ("py:class", "anndata.compat.CupySparseMatrix"), ("py:class", "numpy.ma.core.MaskedArray"), - ("py:class", "dask.array.core.Array"), - ("py:class", "awkward.highlevel.Array"), - # https://github.com/sphinx-doc/sphinx/issues/10591 - ("py:class", "awkward.Array"), ("py:class", "anndata._core.sparse_dataset.BaseCompressedSparseDataset"), ("py:obj", "numpy._typing._array_like._ScalarType_co"), # https://github.com/sphinx-doc/sphinx/issues/10974 diff --git a/pyproject.toml b/pyproject.toml index e70237591..245e3dc31 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,7 +80,8 @@ doc = [ "sphinx_design>=0.5.0", "readthedocs-sphinx-search", # for unreleased changes - "anndata[dev-doc]", + "anndata[dev-doc,dask]", + "awkward>=2.3" ] dev-doc = ["towncrier>=24.8.0"] # release notes tool test-full = ["anndata[test,lazy]"] From 752e02bea8d75e6126dcf07d99493af330960da2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 21 Oct 2024 12:13:57 +0200 Subject: [PATCH 327/348] (fix): awkward array docs --- docs/conf.py | 2 ++ src/anndata/_core/anndata.py | 4 ++-- src/anndata/experimental/backed/_io.py | 2 +- src/anndata/typing.py | 5 ++--- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 5c485457f..1fd78a28d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -130,6 +130,7 @@ def setup(app: Sphinx): zarr=("https://zarr.readthedocs.io/en/stable", None), xarray=("https://docs.xarray.dev/en/stable", None), dask=("https://docs.dask.org/en/stable", None), + ak=("https://awkward-array.org/doc/stable/", None), ) qualname_overrides = { "h5py._hl.group.Group": "h5py.Group", 
@@ -140,6 +141,7 @@ def setup(app: Sphinx): "anndata._types.WriteCallback": "anndata.experimental.WriteCallback", "anndata._types.Read": "anndata.experimental.Read", "anndata._types.Write": "anndata.experimental.Write", + "awkward.highlevel.Array": "ak.Array", } autodoc_type_aliases = dict( NDArray=":data:`~numpy.typing.NDArray`", diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 46c286f9e..906497f1e 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -51,7 +51,7 @@ from os import PathLike from typing import Any, Literal - from ..typing import ArrayDataStructureType + from ..typing import XDataType from .aligned_mapping import AxisArraysView, LayersView, PairwiseArraysView from .index import Index, Index1D @@ -540,7 +540,7 @@ def shape(self) -> tuple[int, int]: return self.n_obs, self.n_vars @property - def X(self) -> ArrayDataStructureType | None: + def X(self) -> XDataType | None: """Data matrix of shape :attr:`n_obs` × :attr:`n_vars`.""" if self.isbacked: if not self.file.is_open: diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index d11e3e9bd..55eabc9a7 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -29,7 +29,7 @@ def read_lazy( ) -> AnnData: """ Lazily read in on-disk/in-cloud AnnData stores, including `obs` and `var`. - No array data should need to be read into memory with the exception of :class:`awkward.Array`, scalars, and some older-encoding arrays. + No array data should need to be read into memory with the exception of :class:`ak.Array`, scalars, and some older-encoding arrays. Parameters ---------- diff --git a/src/anndata/typing.py b/src/anndata/typing.py index d13927bad..ee6ff74fc 100644 --- a/src/anndata/typing.py +++ b/src/anndata/typing.py @@ -31,14 +31,12 @@ Index = _Index """1D or 2D index an :class:`~anndata.AnnData` object can be sliced with.""" - -ArrayDataStructureType: TypeAlias = ( +XDataType: TypeAlias = ( np.ndarray | ma.MaskedArray | sparse.csr_matrix | sparse.csc_matrix | SpArray - | AwkArray | H5Array | ZarrArray | ZappyArray @@ -48,6 +46,7 @@ | CupyArray | CupySparseMatrix ) +ArrayDataStructureType: TypeAlias = XDataType | AwkArray InMemoryArrayOrScalarType: TypeAlias = ( From 2a389008da0ff0cf8855c7154efa7eef44175a1a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 25 Oct 2024 11:19:50 +0200 Subject: [PATCH 328/348] (chore): `ValueError` -> `AssertionError` --- src/anndata/experimental/backed/_lazy_arrays.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 44b18168c..03cc8566b 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -29,7 +29,7 @@ def __init__(self, array: K): if set(self.__slots__) != {"dtype", "shape", "_array"}: msg = "Expected attributes of xarray ZarrArrayWrapper have changed - " "please file an issue with anndata and consider downgrading xarray" - raise ValueError(msg) + raise AssertionError(msg) self._array = array self.shape = self._array.shape self.dtype = self._array.dtype From cc40369364b89e6edf13f26e1a2a9afb54c0f73c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 25 Oct 2024 11:49:51 +0200 Subject: [PATCH 329/348] (fix): clean up `_lazy_arrays.py` --- .../experimental/backed/_lazy_arrays.py | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git 
a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 03cc8566b..6495c4545 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -16,6 +16,8 @@ if TYPE_CHECKING: from typing import Literal + import numpy as np + from anndata._core.index import Index @@ -34,7 +36,7 @@ def __init__(self, array: K): self.shape = self._array.shape self.dtype = self._array.dtype - def __getitem__(self, key): + def __getitem__(self, key: xr.core.indexing.ExplicitIndexer): if isinstance(self._array, ZarrArray): return super().__getitem__(key) # adapted from https://github.com/pydata/xarray/blob/main/xarray/backends/h5netcdf_.py#L50-L58C13 @@ -66,7 +68,7 @@ def __init__( self.shape = self._codes.shape @cached_property - def categories(self): + def categories(self) -> np.ndarray: if isinstance(self._categories, ZarrArray): return self._categories[...] from ..._io.h5ad import read_dataset @@ -106,19 +108,19 @@ def __init__( self._dtype_str = dtype_str self.shape = self._values.shape - def __getitem__(self, key) -> xr.core.extension_array.PandasExtensionArray: + def __getitem__( + self, key: xr.core.indexing.ExplicitIndexer + ) -> xr.core.extension_array.PandasExtensionArray: values = self._values[key] - if self._mask is not None: - mask = self._mask[key] - if self._dtype_str == "nullable-integer": - # numpy does not support nan ints - extension_array = pd.arrays.IntegerArray(values, mask=mask) - elif self._dtype_str == "nullable-boolean": - extension_array = pd.arrays.BooleanArray(values, mask=mask) - else: - raise ValueError(f"Invalid dtype_str {self._dtype_str}") - return xr.core.extension_array.PandasExtensionArray(extension_array) - return xr.core.extension_array.PandasExtensionArray(pd.array(values)) + mask = self._mask[key] + if self._dtype_str == "nullable-integer": + # numpy does not support nan ints + extension_array = pd.arrays.IntegerArray(values, mask=mask) + elif self._dtype_str == "nullable-boolean": + extension_array = pd.arrays.BooleanArray(values, mask=mask) + else: + raise RuntimeError(f"Invalid dtype_str {self._dtype_str}") + return xr.core.extension_array.PandasExtensionArray(extension_array) @cached_property def dtype(self): @@ -129,8 +131,7 @@ def dtype(self): ).dtype elif self._dtype_str == "nullable-boolean": return pd.BooleanDtype() - else: - raise ValueError(f"Invalid dtype_str {self._dtype_str}") + raise RuntimeError(f"Invalid dtype_str {self._dtype_str}") @_subset.register(DataArray) From a807673e63aa0537edc2d22791f86dc6c6c3e636 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 25 Oct 2024 12:03:45 +0200 Subject: [PATCH 330/348] (fix): `ValueError`->`KeyError` for store --- src/anndata/tests/helpers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index bd9fd260c..b66587612 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -1073,17 +1073,17 @@ def get_access_count(self, key: str) -> int: # access defaultdict when value is not there causes key to be there, # which causes it to be tracked if key not in self._access_count: - raise ValueError(f"{key} not found among access count") + raise KeyError(f"{key} not found among access count") return self._access_count[key] def get_subkeys_accessed(self, key: str) -> set[str]: if key not in self._accessed: - raise ValueError(f"{key} not found among accessed") + raise KeyError(f"{key} not found among accessed") return 
self._accessed[key] def get_accessed_keys(self, key: str) -> list[str]: if key not in self._accessed_keys: - raise ValueError(f"{key} not found among accessed keys") + raise KeyError(f"{key} not found among accessed keys") return self._accessed_keys[key] def initialize_key_trackers(self, keys_to_track: Collection[str]): From 852ab200be0e5ba3a7ac1a48f25e382615c87461 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 25 Oct 2024 12:23:47 +0200 Subject: [PATCH 331/348] (chore): add note about `unify_extension_dtypes` --- tests/test_read_lazy.py | 53 +++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index e3513e77e..792cc7cde 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -331,8 +331,24 @@ def test_unconsolidated(tmp_path: Path, mtx_format): store.assert_access_count("obs/.zgroup", 1) -# remote has object dtype, need to convert back for integers booleans etc. -def correct_extension_dtype_differences(remote: pd.DataFrame, memory: pd.DataFrame): +def unify_extension_dtypes( + remote: pd.DataFrame, memory: pd.DataFrame +) -> tuple[pd.DataFrame, pd.DataFrame]: + """ + For concatenated lazy datasets, we send the extension arrays through dask + But this means we lose the pandas dtype, so this function corrects that. + + Parameters + ---------- + remote + The dataset that comes from the concatenated lazy operation + memory + The in-memory, "correct" version + + Returns + ------- + The two dataframes unified + """ for col in memory.columns: dtype = memory[col].dtype if pd.api.types.is_extension_array_dtype(dtype): @@ -421,7 +437,7 @@ def test_concat_to_memory_obs( obs_memory = concatenated_memory.obs obs_memory.index.name = "obs_names" assert_equal( - *correct_extension_dtype_differences( + *unify_extension_dtypes( concated_remote.obs.to_pandas(), concatenated_memory.obs ) ) @@ -471,7 +487,7 @@ def test_concat_to_memory_var( for pd_index, var_df, store_idx in test_cases: var_df.index.name = "var_names" remote_df = concated_remote[:, pd_index].var.to_pandas() - remote_df_corrected, _ = correct_extension_dtype_differences(remote_df, var_df) + remote_df_corrected, _ = unify_extension_dtypes(remote_df, var_df) # TODO:xr.merge always upcasts to float due to NA and you can't downcast? 
for col in remote_df_corrected.columns: dtype = remote_df_corrected[col].dtype @@ -535,7 +551,7 @@ def test_concat_full_and_subsets( if index is not None: orig_concatenated = orig_concatenated[index] in_memory_remote_concatenated = remote_concatenated.to_memory() - corrected_remote_obs, corrected_memory_obs = correct_extension_dtype_differences( + corrected_remote_obs, corrected_memory_obs = unify_extension_dtypes( in_memory_remote_concatenated.obs, orig_concatenated.obs ) assert isinstance(remote_concatenated.obs, Dataset2D) @@ -548,28 +564,25 @@ def test_concat_full_and_subsets( @pytest.mark.parametrize( - "elem_key", - map( - lambda x: pytest.param(x, id="-".join(map(str, x))), - [("obs", None), ("var", None), ("obsm", "df"), ("varm", "df")], + ("attr", "key"), + ( + pytest.param(param[0], param[1], id="-".join(map(str, param))) + for param in [("obs", None), ("var", None), ("obsm", "df"), ("varm", "df")] ), ) def test_concat_df_ds_mixed_types( adata_remote_orig: tuple[AnnData, AnnData], load_annotation_index: bool, join: Join_T, - elem_key: tuple[str, str | None], + attr: str, + key: str | None, ): - def with_elem_in_memory( - adata: AnnData, elem_key: tuple[str, str | None] - ) -> AnnData: - parent_elem = getattr(adata, elem_key[0]) - if elem_key[1] is not None: - getattr(adata, elem_key[0])[elem_key[1]] = to_memory( - parent_elem[elem_key[1]] - ) + def with_elem_in_memory(adata: AnnData, attr: str, key: str | None) -> AnnData: + parent_elem = getattr(adata, attr) + if key is not None: + getattr(adata, attr)[key] = to_memory(parent_elem[key]) return adata - setattr(adata, elem_key[0], to_memory(parent_elem)) + setattr(adata, attr, to_memory(parent_elem)) return adata if not load_annotation_index: @@ -577,7 +590,7 @@ def with_elem_in_memory( "Testing for mixed types is independent of the axis since the indices always have to match." ) remote, orig = adata_remote_orig - remote = with_elem_in_memory(remote, elem_key) + remote = with_elem_in_memory(remote, attr, key) in_memory_concatenated = ad.concat([orig, orig], join=join) mixed_concatenated = ad.concat([remote, orig], join=join) assert_equal(mixed_concatenated, in_memory_concatenated) From 310191c07a1e727171146cc3446d87ef3c299d94 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 25 Oct 2024 12:37:22 +0200 Subject: [PATCH 332/348] (chore): add ids --- tests/test_read_lazy.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index 792cc7cde..47604739e 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -61,7 +61,11 @@ def diskfmt(request): return request.param -@pytest.fixture(params=[True, False], scope="session") +@pytest.fixture( + params=[True, False], + scope="session", + ids=["load-annotation-index", "dont-load-annotation-index"], +) def load_annotation_index(request): return request.param From 1c15b70de343993b937a92cfbdd5a6ae78b1814a Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Thu, 31 Oct 2024 12:46:23 +0100 Subject: [PATCH 333/348] Apply suggestions from code review Co-authored-by: Philipp A. 
---
 src/anndata/_io/specs/lazy_methods.py | 2 +-
 src/anndata/_types.py | 2 +-
 src/anndata/tests/helpers.py | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index e64881274..47454e6fd 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -194,7 +194,7 @@ def _gen_xarray_dict_iterator_from_elems(
     index_label: str,
     index_key: str,
     index: np.NDArray,
-) -> Generator[tuple[str, DataArray]]:
+) -> Generator[tuple[str, DataArray], None, None]:
     from anndata.experimental.backed._compat import DataArray
     from anndata.experimental.backed._compat import xarray as xr
     from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray
diff --git a/src/anndata/_types.py b/src/anndata/_types.py
index 31e539118..92a758dad 100644
--- a/src/anndata/_types.py
+++ b/src/anndata/_types.py
@@ -190,7 +190,7 @@ def __call__(
         ...


-ANNDATA_ELEMS = Literal[
+AnnDataElem = Literal[
     "obs",
     "var",
     "obsm",
diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py
index b66587612..d06887ede 100644
--- a/src/anndata/tests/helpers.py
+++ b/src/anndata/tests/helpers.py
@@ -1052,8 +1052,8 @@ class AccessTrackingStore(DirectoryStore):
     _access_count: Counter[str]
-    _accessed_keys: dict[str, list[str]]
-    _accessed: dict[str, set]
+    _accessed: defaultdict[str, set]
+    _accessed_keys: defaultdict[str, list[str]]

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

From b96bd5571545f68ee646b18c3814598c04deb262 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 31 Oct 2024 12:47:52 +0100
Subject: [PATCH 334/348] (fix): move all changes from anndata_elem

---
 src/anndata/experimental/backed/_io.py | 4 ++--
 tests/test_read_lazy.py | 18 +++++++++---------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py
index 55eabc9a7..343248fca 100644
--- a/src/anndata/experimental/backed/_io.py
+++ b/src/anndata/experimental/backed/_io.py
@@ -8,7 +8,7 @@
 import h5py

 from anndata._io.specs.registry import read_elem_lazy
-from anndata._types import ANNDATA_ELEMS
+from anndata._types import AnnDataElem

 from ..._core.anndata import AnnData
 from ..._settings import settings
@@ -124,7 +124,7 @@ def callback(func: Read, /, elem_name: str, elem: StorageType, *, iospec: IOSpec
             iter_object = (
                 elem.items()
                 if has_keys
                else [(k, elem[k]) for k in typing.get_args(ANNDATA_ELEMS) if k in elem]
+                else [(k, elem[k]) for k in typing.get_args(AnnDataElem) if k in elem]
             )
             return AnnData(**{k: read_dispatched(v, callback) for k, v in iter_object})
         elif (
diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py
index 47604739e..f15f598ae 100644
--- a/tests/test_read_lazy.py
+++ b/tests/test_read_lazy.py
@@ -13,7 +13,7 @@
 import anndata as ad
 from anndata import AnnData
 from anndata._core.file_backing import to_memory
-from anndata._types import ANNDATA_ELEMS
+from anndata._types import AnnDataElem
 from anndata.compat import DaskArray
 from anndata.experimental import read_lazy
 from anndata.tests.helpers import (
@@ -37,7 +37,7 @@
     not find_spec("xarray"), reason="Xarray is not installed"
 )

-ANNDATA_ELEMS_LIST = typing.get_args(ANNDATA_ELEMS)
+ANNDATA_ELEMS = typing.get_args(AnnDataElem)


 @pytest.fixture(
@@ -212,12 +212,12 @@ def get_key_trackers_for_columns_on_axis(
         ("raw", "X"),
         ("obs", "cat"),
         ("obs", "int64"),
-        *((elem_name, None) for 
elem_name in ANNDATA_ELEMS_LIST), + *((elem_name, None) for elem_name in ANNDATA_ELEMS), ], ) def test_access_count_elem_access( adata_remote_with_store_tall_skinny: tuple[AnnData, AccessTrackingStore], - elem_key: ANNDATA_ELEMS, + elem_key: AnnDataElem, sub_key: str, simple_subset_func: Callable[[AnnData], AnnData], ): @@ -236,7 +236,7 @@ def test_access_count_subset( adata_remote_with_store_tall_skinny: tuple[AnnData, AccessTrackingStore], ): remote, store = adata_remote_with_store_tall_skinny - non_obs_elem_names = filter(lambda e: e != "obs", ANNDATA_ELEMS_LIST) + non_obs_elem_names = filter(lambda e: e != "obs", ANNDATA_ELEMS) store.initialize_key_trackers(["obs/cat/codes", *non_obs_elem_names]) remote[remote.obs["cat"] == "a", :] # all codes read in for subset (from 1 chunk) @@ -360,7 +360,7 @@ def unify_extension_dtypes( return remote, memory -ANNDATA_ELEMS_LIST = typing.get_args(ANNDATA_ELEMS) +ANNDATA_ELEMS = typing.get_args(AnnDataElem) @pytest.mark.parametrize("join", ["outer", "inner"]) @@ -369,7 +369,7 @@ def unify_extension_dtypes( [ ("obs", "cat"), ("obs", "int64"), - *((elem_name, None) for elem_name in ANNDATA_ELEMS_LIST), + *((elem_name, None) for elem_name in ANNDATA_ELEMS), ], ) def test_concat_access_count( @@ -377,7 +377,7 @@ def test_concat_access_count( list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData] ], join: Join_T, - elem_key: ANNDATA_ELEMS, + elem_key: AnnDataElem, sub_key: str, simple_subset_func: Callable[[AnnData], AnnData], ): @@ -388,7 +388,7 @@ def test_concat_access_count( for elem in ["obs", "var"] for col in adatas[0].obs.columns ) - non_obs_var_keys = filter(lambda e: e not in {"obs", "var"}, ANNDATA_ELEMS_LIST) + non_obs_var_keys = filter(lambda e: e not in {"obs", "var"}, ANNDATA_ELEMS) keys_to_track = [*non_categorical_columns, *non_obs_var_keys] for store in stores: store.initialize_key_trackers(keys_to_track) From d1fce7e54ac69afd67a549b169e59a72e53a1de6 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 31 Oct 2024 13:41:52 +0100 Subject: [PATCH 335/348] (fix): `read_elem_as_dask`->`read_elem_lazy` --- src/anndata/_io/specs/registry.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index b567a834c..db463449c 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -438,18 +438,16 @@ def read_elem_lazy( Reading a sparse matrix from a zarr store lazily, with custom chunk size and default: >>> g = zarr.open(zarr_path) - >>> adata.X = ad.experimental.read_elem_as_dask(g["X"]) + >>> adata.X = ad.experimental.read_elem_lazy(g["X"]) >>> adata.X dask.array - >>> adata.X = ad.experimental.read_elem_as_dask( - ... g["X"], chunks=(500, adata.shape[1]) - ... ) + >>> adata.X = ad.experimental.read_elem_lazy(g["X"], chunks=(500, adata.shape[1])) >>> adata.X dask.array Reading a dense matrix from a zarr store lazily: - >>> adata.layers["dense"] = ad.experimental.read_elem_as_dask(g["layers/dense"]) + >>> adata.layers["dense"] = ad.experimental.read_elem_lazy(g["layers/dense"]) >>> adata.layers["dense"] dask.array @@ -462,10 +460,8 @@ def read_elem_lazy( ... obsm=ad.io.read_elem(g["obsm"]), ... varm=ad.io.read_elem(g["varm"]), ... ) - >>> adata.X = ad.experimental.read_elem_as_dask( - ... g["X"], chunks=(500, adata.shape[1]) - ... 
) - >>> adata.layers["dense"] = ad.experimental.read_elem_as_dask(g["layers/dense"]) + >>> adata.X = ad.experimental.read_elem_lazy(g["X"], chunks=(500, adata.shape[1])) + >>> adata.layers["dense"] = ad.experimental.read_elem_lazy(g["layers/dense"]) """ return LazyReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks, **kwargs) From 52b6a01bea50520481e27edd8c1a93ad6156a500 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 31 Oct 2024 13:46:16 +0100 Subject: [PATCH 336/348] (chore): refactor `test_read_lazy` fixtures --- tests/test_read_lazy.py | 273 +++++++++++++++++++++++----------------- 1 file changed, 156 insertions(+), 117 deletions(-) diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index f15f598ae..d52891d40 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -106,11 +106,17 @@ def adata_remote_orig_with_path( @pytest.fixture -def adata_remote_orig( +def adata_remote( adata_remote_orig_with_path: tuple[Path, AnnData], load_annotation_index: bool -) -> tuple[AnnData, AnnData]: - orig_path, orig = adata_remote_orig_with_path - return read_lazy(orig_path, load_annotation_index=load_annotation_index), orig +) -> AnnData: + orig_path, _ = adata_remote_orig_with_path + return read_lazy(orig_path, load_annotation_index=load_annotation_index) + + +@pytest.fixture +def adata_orig(adata_remote_orig_with_path: tuple[Path, AnnData]) -> AnnData: + _, orig = adata_remote_orig_with_path + return orig @pytest.fixture(scope="session") @@ -168,13 +174,34 @@ def adatas_paths_var_indices_for_concatenation( @pytest.fixture -def concatenation_objects( +def var_indices_for_concat( + adatas_paths_var_indices_for_concatenation, +) -> list[pd.Index]: + _, _, var_indices = adatas_paths_var_indices_for_concatenation + return var_indices + + +@pytest.fixture +def adatas_for_concat( + adatas_paths_var_indices_for_concatenation, +) -> list[AnnData]: + adatas, _, _ = adatas_paths_var_indices_for_concatenation + return adatas + + +@pytest.fixture +def stores_for_concat( adatas_paths_var_indices_for_concatenation, -) -> tuple[list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData]]: - adatas, paths, var_indices = adatas_paths_var_indices_for_concatenation - stores = [AccessTrackingStore(path) for path in paths] - lazys = [read_lazy(store) for store in stores] - return adatas, var_indices, stores, lazys +) -> list[AccessTrackingStore]: + _, paths, _ = adatas_paths_var_indices_for_concatenation + return [AccessTrackingStore(path) for path in paths] + + +@pytest.fixture +def lazy_adatas_for_concat( + stores_for_concat, +) -> list[AnnData]: + return [read_lazy(store) for store in stores_for_concat] @pytest.fixture @@ -186,6 +213,21 @@ def adata_remote_with_store_tall_skinny( return remote, store +@pytest.fixture +def remote_store_tall_skinny( + adata_remote_with_store_tall_skinny_path: Path, +) -> AccessTrackingStore: + return AccessTrackingStore(adata_remote_with_store_tall_skinny_path) + + +@pytest.fixture +def adata_remote_tall_skinny( + remote_store_tall_skinny: AccessTrackingStore, +) -> AnnData: + remote = read_lazy(remote_store_tall_skinny) + return remote + + def get_key_trackers_for_columns_on_axis( adata: AnnData, axis: Literal["obs", "var"] ) -> Generator[str, None, None]: @@ -216,108 +258,107 @@ def get_key_trackers_for_columns_on_axis( ], ) def test_access_count_elem_access( - adata_remote_with_store_tall_skinny: tuple[AnnData, AccessTrackingStore], + remote_store_tall_skinny: AccessTrackingStore, + adata_remote_tall_skinny: AnnData, elem_key: AnnDataElem, 
sub_key: str, simple_subset_func: Callable[[AnnData], AnnData], ): - remote, store = adata_remote_with_store_tall_skinny full_path = f"{elem_key}/{sub_key}" if sub_key is not None else elem_key - store.initialize_key_trackers({full_path, "X"}) + remote_store_tall_skinny.initialize_key_trackers({full_path, "X"}) # a series of methods that should __not__ read in any data - elem = getattr(simple_subset_func(remote), elem_key) + elem = getattr(simple_subset_func(adata_remote_tall_skinny), elem_key) if sub_key is not None: getattr(elem, sub_key) - store.assert_access_count(full_path, 0) - store.assert_access_count("X", 0) + remote_store_tall_skinny.assert_access_count(full_path, 0) + remote_store_tall_skinny.assert_access_count("X", 0) def test_access_count_subset( - adata_remote_with_store_tall_skinny: tuple[AnnData, AccessTrackingStore], + remote_store_tall_skinny: AccessTrackingStore, + adata_remote_tall_skinny: AnnData, ): - remote, store = adata_remote_with_store_tall_skinny non_obs_elem_names = filter(lambda e: e != "obs", ANNDATA_ELEMS) - store.initialize_key_trackers(["obs/cat/codes", *non_obs_elem_names]) - remote[remote.obs["cat"] == "a", :] + remote_store_tall_skinny.initialize_key_trackers( + ["obs/cat/codes", *non_obs_elem_names] + ) + adata_remote_tall_skinny[adata_remote_tall_skinny.obs["cat"] == "a", :] # all codes read in for subset (from 1 chunk) - store.assert_access_count("obs/cat/codes", 1) + remote_store_tall_skinny.assert_access_count("obs/cat/codes", 1) for elem_name in non_obs_elem_names: - store.assert_access_count(elem_name, 0) + remote_store_tall_skinny.assert_access_count(elem_name, 0) def test_access_count_subset_column_compute( - adata_remote_with_store_tall_skinny: tuple[AnnData, AccessTrackingStore], + remote_store_tall_skinny: AccessTrackingStore, + adata_remote_tall_skinny: AnnData, ): - remote, store = adata_remote_with_store_tall_skinny - store.initialize_key_trackers(["obs/int64"]) - remote[remote.shape[0] // 2, :].obs["int64"].compute() + remote_store_tall_skinny.initialize_key_trackers(["obs/int64"]) + adata_remote_tall_skinny[adata_remote_tall_skinny.shape[0] // 2, :].obs[ + "int64" + ].compute() # two chunks needed for 0:10 subset - store.assert_access_count("obs/int64", 1) + remote_store_tall_skinny.assert_access_count("obs/int64", 1) def test_access_count_index( - adata_remote_with_store_tall_skinny: tuple[AnnData, AccessTrackingStore], + remote_store_tall_skinny: AccessTrackingStore, ): - _, store = adata_remote_with_store_tall_skinny - store.initialize_key_trackers(["obs/_index"]) - read_lazy(store, load_annotation_index=False) - store.assert_access_count("obs/_index", 0) - read_lazy(store) + remote_store_tall_skinny.initialize_key_trackers(["obs/_index"]) + read_lazy(remote_store_tall_skinny, load_annotation_index=False) + remote_store_tall_skinny.assert_access_count("obs/_index", 0) + read_lazy(remote_store_tall_skinny) # 4 is number of chunks - store.assert_access_count("obs/_index", 4) + remote_store_tall_skinny.assert_access_count("obs/_index", 4) def test_access_count_dtype( - adata_remote_with_store_tall_skinny: tuple[AnnData, AccessTrackingStore], + remote_store_tall_skinny: AccessTrackingStore, + adata_remote_tall_skinny: AnnData, ): - remote, store = adata_remote_with_store_tall_skinny - store.initialize_key_trackers(["obs/cat/categories"]) - store.assert_access_count("obs/cat/categories", 0) + remote_store_tall_skinny.initialize_key_trackers(["obs/cat/categories"]) + remote_store_tall_skinny.assert_access_count("obs/cat/categories", 0) # 
This should only cause categories to be read in once - remote.obs["cat"].dtype - remote.obs["cat"].dtype - remote.obs["cat"].dtype - store.assert_access_count("obs/cat/categories", 1) + adata_remote_tall_skinny.obs["cat"].dtype + adata_remote_tall_skinny.obs["cat"].dtype + adata_remote_tall_skinny.obs["cat"].dtype + remote_store_tall_skinny.assert_access_count("obs/cat/categories", 1) -def test_uns_uses_dask(adata_remote_orig: tuple[AnnData, AnnData]): - remote, _ = adata_remote_orig - assert isinstance(remote.uns["nested"]["nested_further"]["array"], DaskArray) +def test_uns_uses_dask(adata_remote: AnnData): + assert isinstance(adata_remote.uns["nested"]["nested_further"]["array"], DaskArray) -def test_to_memory(adata_remote_orig: tuple[AnnData, AnnData]): - remote, orig = adata_remote_orig - remote_to_memory = remote.to_memory() - assert_equal(remote_to_memory, orig) +def test_to_memory(adata_remote: AnnData, adata_orig: AnnData): + remote_to_memory = adata_remote.to_memory() + assert_equal(remote_to_memory, adata_orig) -def test_view_to_memory(adata_remote_orig: tuple[AnnData, AnnData]): - remote, orig = adata_remote_orig - subset_obs = orig.obs["obs_cat"] == "a" - assert_equal(orig[subset_obs, :], remote[subset_obs, :].to_memory()) +def test_view_to_memory(adata_remote: AnnData, adata_orig: AnnData): + subset_obs = adata_orig.obs["obs_cat"] == "a" + assert_equal(adata_orig[subset_obs, :], adata_remote[subset_obs, :].to_memory()) - subset_var = orig.var["var_cat"] == "a" - assert_equal(orig[:, subset_var], remote[:, subset_var].to_memory()) + subset_var = adata_orig.var["var_cat"] == "a" + assert_equal(adata_orig[:, subset_var], adata_remote[:, subset_var].to_memory()) -def test_view_of_view_to_memory(adata_remote_orig: tuple[AnnData, AnnData]): - remote, orig = adata_remote_orig - subset_obs = (orig.obs["obs_cat"] == "a") | (orig.obs["obs_cat"] == "b") - subsetted_adata = orig[subset_obs, :] +def test_view_of_view_to_memory(adata_remote: AnnData, adata_orig: AnnData): + subset_obs = (adata_orig.obs["obs_cat"] == "a") | (adata_orig.obs["obs_cat"] == "b") + subsetted_adata = adata_orig[subset_obs, :] subset_subset_obs = subsetted_adata.obs["obs_cat"] == "b" subsetted_subsetted_adata = subsetted_adata[subset_subset_obs, :] assert_equal( subsetted_subsetted_adata, - remote[subset_obs, :][subset_subset_obs, :].to_memory(), + adata_remote[subset_obs, :][subset_subset_obs, :].to_memory(), ) - subset_var = (orig.var["var_cat"] == "a") | (orig.var["var_cat"] == "b") - subsetted_adata = orig[:, subset_var] + subset_var = (adata_orig.var["var_cat"] == "a") | (adata_orig.var["var_cat"] == "b") + subsetted_adata = adata_orig[:, subset_var] subset_subset_var = subsetted_adata.var["var_cat"] == "b" subsetted_subsetted_adata = subsetted_adata[:, subset_subset_var] assert_equal( subsetted_subsetted_adata, - remote[:, subset_var][:, subset_subset_var].to_memory(), + adata_remote[:, subset_var][:, subset_subset_var].to_memory(), ) @@ -373,70 +414,68 @@ def unify_extension_dtypes( ], ) def test_concat_access_count( - concatenation_objects: tuple[ - list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData] - ], + adatas_for_concat: list[AnnData], + stores_for_concat: list[AccessTrackingStore], + lazy_adatas_for_concat: list[AnnData], join: Join_T, elem_key: AnnDataElem, sub_key: str, simple_subset_func: Callable[[AnnData], AnnData], ): - adatas, _, stores, lazy_adatas = concatenation_objects # track all elems except codes because they must be read in for concatenation non_categorical_columns 
= ( f"{elem}/{col}" if "cat" not in col else f"{elem}/{col}/codes" for elem in ["obs", "var"] - for col in adatas[0].obs.columns + for col in adatas_for_concat[0].obs.columns ) non_obs_var_keys = filter(lambda e: e not in {"obs", "var"}, ANNDATA_ELEMS) keys_to_track = [*non_categorical_columns, *non_obs_var_keys] - for store in stores: + for store in stores_for_concat: store.initialize_key_trackers(keys_to_track) - concated_remote = ad.concat(lazy_adatas, join=join) + concated_remote = ad.concat(lazy_adatas_for_concat, join=join) # a series of methods that should __not__ read in any data elem = getattr(simple_subset_func(concated_remote), elem_key) if sub_key is not None: getattr(elem, sub_key) - for store in stores: + for store in stores_for_concat: for elem in keys_to_track: store.assert_access_count(elem, 0) def test_concat_to_memory_obs_access_count( - concatenation_objects: tuple[ - list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData] - ], + adatas_for_concat: list[AnnData], + stores_for_concat: list[AccessTrackingStore], + lazy_adatas_for_concat: list[AnnData], join: Join_T, simple_subset_func: Callable[[AnnData], AnnData], ): - adatas, _, stores, lazy_adatas = concatenation_objects - concated_remote = simple_subset_func(ad.concat(lazy_adatas, join=join)) + concated_remote = simple_subset_func(ad.concat(lazy_adatas_for_concat, join=join)) concated_remote_subset = simple_subset_func(concated_remote) - n_datasets = len(adatas) - obs_keys_to_track = get_key_trackers_for_columns_on_axis(adatas[0], "obs") - for store in stores: + n_datasets = len(adatas_for_concat) + obs_keys_to_track = get_key_trackers_for_columns_on_axis( + adatas_for_concat[0], "obs" + ) + for store in stores_for_concat: store.initialize_key_trackers(obs_keys_to_track) concated_remote_subset.to_memory() # check access count for the stores - only the first should be accessed when reading into memory for col in obs_keys_to_track: - stores[0].assert_access_count(col, 1) + stores_for_concat[0].assert_access_count(col, 1) for i in range(1, n_datasets): # if the shapes are the same, data was read in to bring the object into memory; otherwise, not - stores[i].assert_access_count( + stores_for_concat[i].assert_access_count( col, concated_remote_subset.shape[0] == concated_remote.shape[0] ) def test_concat_to_memory_obs( - concatenation_objects: tuple[ - list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData] - ], + adatas_for_concat: list[AnnData], + lazy_adatas_for_concat: list[AnnData], join: Join_T, simple_subset_func: Callable[[AnnData], AnnData], ): - adatas, _, _, lazy_adatas = concatenation_objects - concatenated_memory = simple_subset_func(ad.concat(adatas, join=join)) - concated_remote = simple_subset_func(ad.concat(lazy_adatas, join=join)) + concatenated_memory = simple_subset_func(ad.concat(adatas_for_concat, join=join)) + concated_remote = simple_subset_func(ad.concat(lazy_adatas_for_concat, join=join)) # TODO: name is lost normally, should fix obs_memory = concatenated_memory.obs obs_memory.index.name = "obs_names" @@ -448,13 +487,10 @@ def test_concat_to_memory_obs( def test_concat_to_memory_obs_dtypes( - concatenation_objects: tuple[ - list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData] - ], + lazy_adatas_for_concat: list[AnnData], join: Join_T, ): - _, _, _, lazy_adatas = concatenation_objects - concated_remote = ad.concat(lazy_adatas, join=join) + concated_remote = ad.concat(lazy_adatas_for_concat, join=join) # check preservation of non-categorical dtypes 
on the concat axis assert concated_remote.obs["int64"].dtype == "int64" assert concated_remote.obs["uint8"].dtype == "uint8" @@ -465,28 +501,32 @@ def test_concat_to_memory_obs_dtypes( def test_concat_to_memory_var( - concatenation_objects: tuple[ - list[AnnData], list[pd.Index], list[AccessTrackingStore], list[AnnData] - ], + var_indices_for_concat: list[pd.Index], + adatas_for_concat: list[AnnData], + stores_for_concat: list[AccessTrackingStore], + lazy_adatas_for_concat: list[AnnData], join: Join_T, are_vars_different: bool, simple_subset_func: Callable[[AnnData], AnnData], ): - adatas, var_indices, stores, lazy_adatas = concatenation_objects - concated_remote = simple_subset_func(ad.concat(lazy_adatas, join=join)) - var_keys_to_track = get_key_trackers_for_columns_on_axis(adatas[0], "var") - for store in stores: + concated_remote = simple_subset_func(ad.concat(lazy_adatas_for_concat, join=join)) + var_keys_to_track = get_key_trackers_for_columns_on_axis( + adatas_for_concat[0], "var" + ) + for store in stores_for_concat: store.initialize_key_trackers(var_keys_to_track) # check non-different variables, taken from first annotation. pd_index_overlapping = pd.Index( - filter(lambda x: not x.endswith("ds"), var_indices[0]) + filter(lambda x: not x.endswith("ds"), var_indices_for_concat[0]) ) - var_df_overlapping = adatas[0][:, pd_index_overlapping].var.copy() + var_df_overlapping = adatas_for_concat[0][:, pd_index_overlapping].var.copy() test_cases = [(pd_index_overlapping, var_df_overlapping, 0)] if are_vars_different and join == "outer": # check a set of unique variables from the first object since we only take from there if different - pd_index_only_ds_0 = pd.Index(filter(lambda x: "0_ds" in x, var_indices[1])) - var_df_only_ds_0 = adatas[0][:, pd_index_only_ds_0].var.copy() + pd_index_only_ds_0 = pd.Index( + filter(lambda x: "0_ds" in x, var_indices_for_concat[1]) + ) + var_df_only_ds_0 = adatas_for_concat[0][:, pd_index_only_ds_0].var.copy() test_cases.append((pd_index_only_ds_0, var_df_only_ds_0, 0)) for pd_index, var_df, store_idx in test_cases: var_df.index.name = "var_names" @@ -499,11 +539,11 @@ def test_concat_to_memory_var( var_df[col] = var_df[col].astype(dtype) assert_equal(remote_df_corrected, var_df) for key in var_keys_to_track: - stores[store_idx].assert_access_count(key, 1) - for store in stores: - if store != stores[store_idx]: + stores_for_concat[store_idx].assert_access_count(key, 1) + for store in stores_for_concat: + if store != stores_for_concat[store_idx]: store.assert_access_count(key, 0) - stores[store_idx].reset_key_trackers() + stores_for_concat[store_idx].reset_key_trackers() @pytest.mark.parametrize( @@ -531,27 +571,26 @@ def test_concat_to_memory_var( ], ) def test_concat_full_and_subsets( - adata_remote_orig: tuple[AnnData, AnnData], + adata_remote: AnnData, + adata_orig: AnnData, join: Join_T, index: slice | NDArray | Literal["a"] | None, load_annotation_index: bool, ): from anndata.experimental.backed._compat import Dataset2D - remote, orig = adata_remote_orig - maybe_warning_context = ( pytest.warns(UserWarning, match=r"Concatenating with a pandas numeric") if not load_annotation_index else nullcontext() ) with maybe_warning_context: - remote_concatenated = ad.concat([remote, remote], join=join) + remote_concatenated = ad.concat([adata_remote, adata_remote], join=join) if index is not None: if np.isscalar(index) and index == "a": index = remote_concatenated.obs["obs_cat"] == "a" remote_concatenated = remote_concatenated[index] - orig_concatenated = 
ad.concat([orig, orig], join=join) + orig_concatenated = ad.concat([adata_orig, adata_orig], join=join) if index is not None: orig_concatenated = orig_concatenated[index] in_memory_remote_concatenated = remote_concatenated.to_memory() @@ -575,7 +614,8 @@ def test_concat_full_and_subsets( ), ) def test_concat_df_ds_mixed_types( - adata_remote_orig: tuple[AnnData, AnnData], + adata_remote: AnnData, + adata_orig: AnnData, load_annotation_index: bool, join: Join_T, attr: str, @@ -593,10 +633,9 @@ def with_elem_in_memory(adata: AnnData, attr: str, key: str | None) -> AnnData: pytest.skip( "Testing for mixed types is independent of the axis since the indices always have to match." ) - remote, orig = adata_remote_orig - remote = with_elem_in_memory(remote, attr, key) - in_memory_concatenated = ad.concat([orig, orig], join=join) - mixed_concatenated = ad.concat([remote, orig], join=join) + remote = with_elem_in_memory(adata_remote, attr, key) + in_memory_concatenated = ad.concat([adata_orig, adata_orig], join=join) + mixed_concatenated = ad.concat([remote, adata_orig], join=join) assert_equal(mixed_concatenated, in_memory_concatenated) From a796d9b90b220c1a418798fea18144eaffdffa33 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Thu, 31 Oct 2024 13:53:25 +0100 Subject: [PATCH 337/348] Update tests/test_read_lazy.py Co-authored-by: Philipp A. --- tests/test_read_lazy.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index f15f598ae..d3b43b055 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -540,12 +540,11 @@ def test_concat_full_and_subsets( remote, orig = adata_remote_orig - maybe_warning_context = ( + with ( pytest.warns(UserWarning, match=r"Concatenating with a pandas numeric") if not load_annotation_index else nullcontext() - ) - with maybe_warning_context: + ): remote_concatenated = ad.concat([remote, remote], join=join) if index is not None: if np.isscalar(index) and index == "a": From e48377a435a19bea20f775479bfe7ec12d52084c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 31 Oct 2024 13:58:01 +0100 Subject: [PATCH 338/348] (chore): restore types --- src/anndata/tests/helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index d06887ede..1e69e5b70 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -1086,13 +1086,13 @@ def get_accessed_keys(self, key: str) -> list[str]: raise KeyError(f"{key} not found among accessed keys") return self._accessed_keys[key] - def initialize_key_trackers(self, keys_to_track: Collection[str]): + def initialize_key_trackers(self, keys_to_track: Collection[str]) -> None: for k in keys_to_track: self._access_count[k] = 0 self._accessed_keys[k] = [] self._accessed[k] = set() - def reset_key_trackers(self): + def reset_key_trackers(self) -> None: self.initialize_key_trackers(self._access_count.keys()) def assert_access_count(self, key: str, count: int): From 90f6d7724652dd70d8954cdb42962c70f33b252b Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 31 Oct 2024 14:30:57 +0100 Subject: [PATCH 339/348] (fix): do `randint` --- docs/tutorials/notebooks | 2 +- tests/test_read_lazy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/notebooks b/docs/tutorials/notebooks index 0af6cf336..9e186c5c6 160000 --- a/docs/tutorials/notebooks +++ b/docs/tutorials/notebooks @@ -1 +1 @@ -Subproject commit 
0af6cf3363aed1cafd317516c8393136ee6287ae
+Subproject commit 9e186c5c694793bb04ea1397721d154d6e0b7069
diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py
index f4ed92038..9fa78931d 100644
--- a/tests/test_read_lazy.py
+++ b/tests/test_read_lazy.py
@@ -558,7 +558,7 @@ def test_concat_to_memory_var(
             id="consecutive integer array",
         ),
         pytest.param(
-            np.random.choice(np.arange(800, 1100), 500),
+            np.random.randint(800, 1100, 500),
             id="random integer array",
         ),
         pytest.param(

From 8e29713caba1806d0d4434d906c09aa209d3b13e Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 31 Oct 2024 14:34:22 +0100
Subject: [PATCH 340/348] (chore): remove slots check

---
 src/anndata/experimental/backed/_lazy_arrays.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py
index 6495c4545..db783c094 100644
--- a/src/anndata/experimental/backed/_lazy_arrays.py
+++ b/src/anndata/experimental/backed/_lazy_arrays.py
@@ -28,10 +28,6 @@ class ZarrOrHDF5Wrapper(ZarrArrayWrapper, Generic[K]):
     def __init__(self, array: K):
         if isinstance(array, ZarrArray):
             return super().__init__(array)
-        if set(self.__slots__) != {"dtype", "shape", "_array"}:
-            msg = "Expected attributes of xarray ZarrArrayWrapper have changed - "
-            "please file an issue with anndata and consider downgrading xarray"
-            raise AssertionError(msg)
         self._array = array
         self.shape = self._array.shape
         self.dtype = self._array.dtype

From f13bfb494d64d68c157ded5550d8484a2e9325a1 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 31 Oct 2024 14:39:48 +0100
Subject: [PATCH 341/348] (fix): return read_lazy

---
 docs/tutorials/notebooks | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/tutorials/notebooks b/docs/tutorials/notebooks
index 9e186c5c6..0af6cf336 160000
--- a/docs/tutorials/notebooks
+++ b/docs/tutorials/notebooks
@@ -1 +1 @@
-Subproject commit 9e186c5c694793bb04ea1397721d154d6e0b7069
+Subproject commit 0af6cf3363aed1cafd317516c8393136ee6287ae

From 1643da63c06995100671b3b93a56a39423ac4535 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 6 Nov 2024 17:23:43 +0100
Subject: [PATCH 342/348] (fix): concatenating with hdf5 and cluster obviates need for locks; works with normal pytest

---
 src/anndata/_core/merge.py | 32 ++++++++++++++++---
 src/anndata/_io/specs/lazy_methods.py | 18 ++++++++++-
 src/anndata/experimental/backed/_io.py | 19 ++++++-----
 .../experimental/backed/_lazy_arrays.py | 20 ++++++++++--
 tests/test_read_lazy.py | 15 ++++++++-
 5 files changed, 86 insertions(+), 18 deletions(-)

diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py
index 9e701184e..086880bed 100644
--- a/src/anndata/_core/merge.py
+++ b/src/anndata/_core/merge.py
@@ -1094,13 +1094,35 @@ def make_dask_col_from_extension_dtype(
     """
     import dask.array as da

-    from anndata._io.specs.lazy_methods import compute_chunk_layout_for_axis_size
+    from anndata._io.specs.lazy_methods import (
+        compute_chunk_layout_for_axis_size,
+        maybe_open_h5,
+    )
+    from anndata.experimental import read_lazy
+    from anndata.experimental.backed._compat import DataArray
+    from anndata.experimental.backed._compat import xarray as xr
+
+    base_path_or_zarr_group = col.attrs.get("base_path_or_zarr_group")
+    elem_name = col.attrs.get("elem_name")
+    dims = col.dims
+    coords = col.coords.copy()

     def get_chunk(block_info=None):
-        idx = tuple(
-            slice(start, stop) for start, stop in block_info[None]["array-location"]
-        )
-        return np.array(col.data[idx].array)
+        with 
maybe_open_h5(base_path_or_zarr_group, elem_name) as f: + v = read_lazy(f) + variable = xr.Variable( + data=xr.core.indexing.LazilyIndexedArray(v), dims=dims + ) + data_array = DataArray( + variable, + coords=coords, + dims=dims, + ) + idx = tuple( + slice(start, stop) for start, stop in block_info[None]["array-location"] + ) + chunk = np.array(data_array.data[idx].array) + return chunk if col.dtype == "category" or use_only_object_dtype: dtype = "object" diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 47454e6fd..4c0583577 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -212,6 +212,10 @@ def _gen_xarray_dict_iterator_from_elems( coords=[index], dims=[index_label], name=k, + attrs={ + "base_path_or_zarr_group": v.base_path_or_zarr_group, + "elem_name": v.elem_name, + }, ) elif k == index_key: data_array = DataArray( @@ -278,25 +282,37 @@ def read_categorical( ) -> CategoricalArray: from anndata.experimental.backed._lazy_arrays import CategoricalArray + base_path_or_zarr_group = ( + Path(filename(elem)) if isinstance(elem, H5Group) else elem + ) + elem_name = get_elem_name(elem) return CategoricalArray( codes=elem["codes"], categories=elem["categories"], ordered=elem.attrs["ordered"], + base_path_or_zarr_group=base_path_or_zarr_group, + elem_name=elem_name, ) def read_nullable( elem: H5Group | ZarrGroup, *, - encoding_type: str, + encoding_type: Literal["nullable-integer", "nullable-boolean"], _reader: LazyReader, ) -> MaskedArray: from anndata.experimental.backed._lazy_arrays import MaskedArray + base_path_or_zarr_group = ( + Path(filename(elem)) if isinstance(elem, H5Group) else elem + ) + elem_name = get_elem_name(elem) return MaskedArray( values=elem["values"], mask=elem["mask"] if "mask" in elem else None, dtype_str=encoding_type, + base_path_or_zarr_group=base_path_or_zarr_group, + elem_name=elem_name, ) diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index 343248fca..4ef86d7bd 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -97,7 +97,7 @@ def read_lazy( raise ImportError( "xarray is required to use the `read_lazy` function. Please install xarray." ) - is_h5_store = isinstance(store, h5py.Dataset | h5py.File) + is_h5_store = isinstance(store, h5py.Dataset | h5py.File | h5py.Group) is_h5 = ( isinstance(store, Path | str) and Path(store).suffix == ".h5ad" ) or is_h5_store @@ -106,13 +106,16 @@ def read_lazy( if not is_h5: import zarr - try: - f = zarr.open_consolidated(store, mode="r") - except KeyError: - msg = "Did not read zarr as consolidated. Consider consolidating your metadata." - warnings.warn(msg) - has_keys = False - f = zarr.open(store, mode="r") + if not isinstance(store, zarr.hierarchy.Group): + try: + f = zarr.open_consolidated(store, mode="r") + except KeyError: + msg = "Did not read zarr as consolidated. Consider consolidating your metadata." 
+ warnings.warn(msg) + has_keys = False + f = zarr.open(store, mode="r") + else: + f = store else: if is_h5_store: f = store diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index db783c094..6ee4eb404 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -14,11 +14,13 @@ from ._compat import xarray as xr if TYPE_CHECKING: + from pathlib import Path from typing import Literal import numpy as np from anndata._core.index import Index + from anndata.compat import ZarrGroup K = TypeVar("K", H5Array, ZarrArray) @@ -35,8 +37,6 @@ def __init__(self, array: K): def __getitem__(self, key: xr.core.indexing.ExplicitIndexer): if isinstance(self._array, ZarrArray): return super().__getitem__(key) - # adapted from https://github.com/pydata/xarray/blob/main/xarray/backends/h5netcdf_.py#L50-L58C13 - # TODO: locks? return xr.core.indexing.explicit_indexing_adapter( key, self.shape, @@ -49,12 +49,16 @@ class CategoricalArray(BackendArray, Generic[K]): _codes: ZarrOrHDF5Wrapper[K] _categories: ZarrArray | H5Array shape: tuple[int, ...] + base_path_or_zarr_group: Path | ZarrGroup + elem_name: str def __init__( self, codes: K, categories: ZarrArray | H5Array, ordered: bool, + base_path_or_zarr_group: Path | ZarrGroup, + elem_name: str, *args, **kwargs, ): @@ -62,6 +66,9 @@ def __init__( self._ordered = ordered self._codes = ZarrOrHDF5Wrapper(codes) self.shape = self._codes.shape + self.base_path_or_zarr_group = base_path_or_zarr_group + self.file_format = "zarr" if isinstance(codes, ZarrArray) else "h5" + self.elem_name = elem_name @cached_property def categories(self) -> np.ndarray: @@ -92,17 +99,24 @@ class MaskedArray(BackendArray, Generic[K]): _values: ZarrOrHDF5Wrapper[K] _dtype_str: Literal["nullable-integer", "nullable-boolean"] shape: tuple[int, ...] 
+ base_path_or_zarr_group: Path | ZarrGroup + elem_name: str def __init__( self, values: ZarrArray | H5Array, dtype_str: Literal["nullable-integer", "nullable-boolean"], - mask: ZarrArray | H5Array | None = None, + mask: ZarrArray | H5Array, + base_path_or_zarr_group: Path | ZarrGroup, + elem_name: str, ): self._mask = ZarrOrHDF5Wrapper(mask) self._values = ZarrOrHDF5Wrapper(values) self._dtype_str = dtype_str self.shape = self._values.shape + self.base_path_or_zarr_group = base_path_or_zarr_group + self.file_format = "zarr" if isinstance(mask, ZarrArray) else "h5" + self.elem_name = elem_name def __getitem__( self, key: xr.core.indexing.ExplicitIndexer diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index 9fa78931d..6c2f14076 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -546,6 +546,19 @@ def test_concat_to_memory_var( stores_for_concat[store_idx].reset_key_trackers() +def test_concat_data_with_cluster_to_memory( + adata_remote: AnnData, + join: Join_T, +): + import dask.distributed as dd + + with ( + dd.LocalCluster(n_workers=1, threads_per_worker=1) as cluster, + dd.Client(cluster), + ): + ad.concat([adata_remote, adata_remote], join=join).to_memory() + + @pytest.mark.parametrize( "index", [ @@ -570,7 +583,7 @@ def test_concat_to_memory_var( pytest.param(None, id="No index"), ], ) -def test_concat_full_and_subsets( +def test_concat_data( adata_remote: AnnData, adata_orig: AnnData, join: Join_T, From 501382334712fd87fe7288c5fc4b68f377e6e8af Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 7 Nov 2024 14:19:07 +0100 Subject: [PATCH 343/348] (fix): add warning catch --- tests/test_read_lazy.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_read_lazy.py b/tests/test_read_lazy.py index 6c2f14076..9603d17c4 100644 --- a/tests/test_read_lazy.py +++ b/tests/test_read_lazy.py @@ -547,8 +547,7 @@ def test_concat_to_memory_var( def test_concat_data_with_cluster_to_memory( - adata_remote: AnnData, - join: Join_T, + adata_remote: AnnData, join: Join_T, load_annotation_index: bool ): import dask.distributed as dd @@ -556,7 +555,12 @@ def test_concat_data_with_cluster_to_memory( dd.LocalCluster(n_workers=1, threads_per_worker=1) as cluster, dd.Client(cluster), ): - ad.concat([adata_remote, adata_remote], join=join).to_memory() + with ( + pytest.warns(UserWarning, match=r"Concatenating with a pandas numeric") + if not load_annotation_index + else nullcontext() + ): + ad.concat([adata_remote, adata_remote], join=join).to_memory() @pytest.mark.parametrize( From dee82a2b33fd7cfb894625cecd9c492adcac5fe0 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 28 Nov 2024 16:11:35 +0100 Subject: [PATCH 344/348] (chore): good chunk size --- src/anndata/_core/merge.py | 17 ++++++++++++----- src/anndata/_io/specs/lazy_methods.py | 9 ++++++++- src/anndata/experimental/backed/_lazy_arrays.py | 12 ++++++++++++ 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 086880bed..44a5357f6 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -1096,9 +1096,10 @@ def make_dask_col_from_extension_dtype( from anndata._io.specs.lazy_methods import ( compute_chunk_layout_for_axis_size, + get_chunksize, maybe_open_h5, ) - from anndata.experimental import read_lazy + from anndata.experimental import read_elem_lazy from anndata.experimental.backed._compat import DataArray from anndata.experimental.backed._compat import xarray as xr @@ -1106,10 +1107,17 @@ def 
make_dask_col_from_extension_dtype(
     elem_name = col.attrs.get("elem_name")
     dims = col.dims
     coords = col.coords.copy()
+    with maybe_open_h5(base_path_or_zarr_group, elem_name) as f:
+        maybe_chunk_size = get_chunksize(read_elem_lazy(f))
+    chunk_size = (
+        compute_chunk_layout_for_axis_size(
+            1000 if maybe_chunk_size is None else maybe_chunk_size[0], col.shape[0]
+        ),
+    )

     def get_chunk(block_info=None):
         with maybe_open_h5(base_path_or_zarr_group, elem_name) as f:
-            v = read_lazy(f)
+            v = read_elem_lazy(f)
             variable = xr.Variable(
                 data=xr.core.indexing.LazilyIndexedArray(v), dims=dims
             )
@@ -1128,10 +1136,9 @@ def get_chunk(block_info=None):
         dtype = "object"
     else:
         dtype = col.dtype.numpy_dtype
-    # TODO: get good chunk size?
     return da.map_blocks(
         get_chunk,
-        chunks=(compute_chunk_layout_for_axis_size(1000, col.shape[0]),),
+        chunks=chunk_size,
         meta=np.array([], dtype=dtype),
         dtype=dtype,
     )
@@ -1185,7 +1192,7 @@ def get_attrs(annotations: Iterable[Dataset2D]) -> dict:
     """
     index_names = np.unique([a.index.name for a in annotations])
     assert len(index_names) == 1, "All annotations must have the same index name."
-    if any(a.index.dtype == "int64" for a in annotations):
+    if any(np.issubdtype(a.index.dtype, np.integer) for a in annotations):
         msg = "Concatenating with a pandas numeric index among the indices. The resulting index is likely not unique."
         warn(msg, UserWarning)
     index_keys = [
diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 27dc4c992..c247d304d 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -2,7 +2,7 @@

 import re
 from contextlib import contextmanager
-from functools import partial
+from functools import partial, singledispatch
 from pathlib import Path
 from typing import TYPE_CHECKING, overload

@@ -92,6 +92,13 @@ def make_dask_chunk(
     return chunk


+@singledispatch
+def get_chunksize(obj) -> tuple[int, ...]:
+    if hasattr(obj, "chunks"):
+        return obj.chunks
+    raise ValueError(f"object of type {type(obj)} has no recognized chunks")
+
+
 @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0"))
 @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0"))
 @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0"))
diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py
index 6ee4eb404..e928847c1 100644
--- a/src/anndata/experimental/backed/_lazy_arrays.py
+++ b/src/anndata/experimental/backed/_lazy_arrays.py
@@ -7,6 +7,7 @@

 from anndata._core.index import _subset
 from anndata._core.views import as_view
+from anndata._io.specs.lazy_methods import get_chunksize
 from anndata.compat import H5Array, ZarrArray

 from ..._settings import settings
@@ -28,6 +29,7 @@ class ZarrOrHDF5Wrapper(ZarrArrayWrapper, Generic[K]):
     def __init__(self, array: K):
+        self.chunks = array.chunks
         if isinstance(array, ZarrArray):
             return super().__init__(array)
         self._array = array
@@ -152,3 +154,13 @@ def _subset_masked(a: DataArray, subset_idx: Index):
 @as_view.register(DataArray)
 def _view_pd_boolean_array(a: DataArray, view_args):
     return a
+
+
+@get_chunksize.register(MaskedArray)
+def _(a: MaskedArray):
+    return get_chunksize(a._values)
+
+
+@get_chunksize.register(CategoricalArray)
+def _(a: CategoricalArray):
+    return get_chunksize(a._codes)
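The `get_chunksize` helper above is a `functools.singledispatch` entry point: wrapper types opt in by delegating to the array they wrap, and the discovered chunk size then drives the per-axis layout passed to `da.map_blocks`. The following is a minimal, self-contained sketch of that pattern. `Wrapper` and `chunk_layout` are illustrative names rather than anndata internals, and `chunk_layout` only mimics what `compute_chunk_layout_for_axis_size` plausibly computes.

```python
# Illustrative sketch only; `Wrapper` and `chunk_layout` are invented names,
# not anndata internals.
from functools import singledispatch

import dask.array as da


@singledispatch
def chunksize(obj) -> tuple:
    # Fallback: anything exposing `.chunks` (zarr, chunked h5py, dask) works.
    if hasattr(obj, "chunks"):
        return obj.chunks
    raise ValueError(f"object of type {type(obj)} has no recognized chunks")


class Wrapper:
    """Stand-in for MaskedArray/CategoricalArray wrapping an on-disk array."""

    def __init__(self, values):
        self._values = values


@chunksize.register(Wrapper)
def _(a: Wrapper) -> tuple:
    # Delegate to the wrapped array, as the registrations in the commit do.
    return chunksize(a._values)


def chunk_layout(chunk_size: int, total: int) -> tuple[int, ...]:
    # Split an axis of length `total` into `chunk_size`-sized blocks, plus a
    # shorter trailing block when the size does not divide evenly.
    n_full, remainder = divmod(total, chunk_size)
    return (chunk_size,) * n_full + ((remainder,) if remainder else ())


arr = da.ones(10, chunks=4)
print(chunksize(Wrapper(arr)))  # ((4, 4, 2),) -- dask reports a tuple per axis
print(chunk_layout(4, 10))      # (4, 4, 2)
```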
From 843cae8f6f9cecb47a25f08551a98980f0cdc3fb Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 28 Nov 2024 16:13:58 +0100
Subject: [PATCH 345/348] (chore): remove duplicated line

---
 src/anndata/_core/merge.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py
index 44a5357f6..f817c06ae 100644
--- a/src/anndata/_core/merge.py
+++ b/src/anndata/_core/merge.py
@@ -1226,7 +1226,6 @@ def concat_dataset2d_on_annot_axis(
     annotations_with_only_dask = list(make_xarray_extension_dtypes_dask(annotations))
     attrs = get_attrs(annotations_with_only_dask)
-    index_name = np.unique([a.index.name for a in annotations])[0]
     [index_name] = {a.index.name for a in annotations}
     return Dataset2D(
         xr.concat(annotations_with_only_dask, join=join, dim=index_name), attrs=attrs

From fa41e354e66268f294794ac6a9e75198b8eee1f8 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 28 Nov 2024 16:25:41 +0100
Subject: [PATCH 346/348] (chore): use a simpler read mechanism for remote io

---
 src/anndata/experimental/backed/_io.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py
index 4ef86d7bd..ba639ec41 100644
--- a/src/anndata/experimental/backed/_io.py
+++ b/src/anndata/experimental/backed/_io.py
@@ -50,7 +50,7 @@ def read_lazy(
     Preparing example objects

     >>> import anndata as ad
-    >>> import httpx
+    >>> from urllib.request import urlretrieve
     >>> import scanpy as sc
     >>> base_url = "https://datasets.cellxgene.cziscience.com"
     >>> def get_cellxgene_data(id_: str):
     ...     out_path = sc.settings.datasetdir / f"{id_}.h5ad"
     ...     if out_path.exists():
     ...         return out_path
     ...     file_url = f"{base_url}/{id_}.h5ad"
     ...     sc.settings.datasetdir.mkdir(parents=True, exist_ok=True)
-    ...     with httpx.stream("GET", file_url) as r, out_path.open("wb") as f:
-    ...         r.raise_for_status()
-    ...         for data in r.iter_bytes():
-    ...             f.write(data)
+    ...     urlretrieve(file_url, out_path)
     ...     return out_path
     >>> path_b_cells = get_cellxgene_data("a93eab58-3d82-4b61-8a2f-d7666dcdb7c4")
     >>> path_fetal = get_cellxgene_data("d170ff04-6da0-4156-a719-f8e1bbefbf53")
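For reference, the docstring's caching helper also works without the `scanpy` dependency. The sketch below is a standalone variant in which `CACHE_DIR` stands in for `sc.settings.datasetdir`; the base URL and dataset id are taken from the docstring itself.

```python
# Standalone variant of the docstring's caching helper; `CACHE_DIR` is a
# placeholder for scanpy's `settings.datasetdir`.
from pathlib import Path
from urllib.request import urlretrieve

BASE_URL = "https://datasets.cellxgene.cziscience.com"
CACHE_DIR = Path("data")


def get_cellxgene_data(id_: str) -> Path:
    out_path = CACHE_DIR / f"{id_}.h5ad"
    if out_path.exists():
        # Reuse the cached download rather than fetching again.
        return out_path
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    # urlretrieve streams the response straight to disk, replacing the
    # manual httpx chunk loop removed by this commit.
    urlretrieve(f"{BASE_URL}/{id_}.h5ad", out_path)
    return out_path


path_b_cells = get_cellxgene_data("a93eab58-3d82-4b61-8a2f-d7666dcdb7c4")
```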
From 1ab575cc1ac9070220e5cfd5727d4603fc6afea6 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 28 Nov 2024 16:33:24 +0100
Subject: [PATCH 347/348] (fix): docs

---
 docs/release-notes/0.11.0.md | 2 +-
 docs/release-notes/0.11.1.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/release-notes/0.11.0.md b/docs/release-notes/0.11.0.md
index 317175f50..d79b3aebc 100644
--- a/docs/release-notes/0.11.0.md
+++ b/docs/release-notes/0.11.0.md
@@ -40,7 +40,7 @@ Release candidates:
 - {guilabel}`rc1` Allow `axis` parameter of e.g. {func}`anndata.concat` to accept `'obs'` and `'var'` {user}`flying-sheep` ({pr}`1244`)
 - {guilabel}`rc1` Add `settings` object with methods for altering internally-used options, like checking for uniqueness on `obs`' index {user}`ilan-gold` ({pr}`1270`)
 - {guilabel}`rc1` Add {attr}`~anndata.settings.remove_unused_categories` option to {attr}`anndata.settings` to override current behavior {user}`ilan-gold` ({pr}`1340`)
-- {guilabel}`rc1` Add {func}`~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {user}`ilan-gold` ({pr}`1469`)
+- {guilabel}`rc1` Add `~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {user}`ilan-gold` ({pr}`1469`)
 - {guilabel}`rc1` Add ability to convert strings to categoricals on write in {meth}`~anndata.AnnData.write_h5ad` and {meth}`~anndata.AnnData.write_zarr` via `convert_strings_to_categoricals` parameter {user}`falexwolf` ({pr}`1474`)
 - {guilabel}`rc1` Add {attr}`~anndata.settings.check_uniqueness` option to {attr}`anndata.settings` to override current behavior {user}`ilan-gold` ({pr}`1507`)
 - {guilabel}`rc1` Add functionality to write from GPU {class}`dask.array.Array` to disk {user}`ilan-gold` ({pr}`1550`)
diff --git a/docs/release-notes/0.11.1.md b/docs/release-notes/0.11.1.md
index 8725ecf23..820b9a729 100644
--- a/docs/release-notes/0.11.1.md
+++ b/docs/release-notes/0.11.1.md
@@ -4,5 +4,5 @@
 ### Bug fixes

 - Remove upper pin on `dask` and exclude versions broken with sparse indexing {user}`ilan-gold` ({pr}`1725`)
-- Fix chunking with -1 in `chunks` argument of {func}`~anndata.experimental.read_elem_as_dask` {user}`ilan-gold` ({pr}`1743`)
+- Fix chunking with -1 in `chunks` argument of `~anndata.experimental.read_elem_as_dask` {user}`ilan-gold` ({pr}`1743`)
 - Fix `cupy<0.13` imports in non-gpu environments {user}`ilan-gold` ({pr}`1754`)

From 0c777efd66b75e20aa9633131b339293d72dec8f Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 28 Nov 2024 16:33:39 +0100
Subject: [PATCH 348/348] (chore): docs note

---
 src/anndata/experimental/backed/_lazy_arrays.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py
index e928847c1..89374c851 100644
--- a/src/anndata/experimental/backed/_lazy_arrays.py
+++ b/src/anndata/experimental/backed/_lazy_arrays.py
@@ -48,6 +48,12 @@ def __getitem__(self, key: xr.core.indexing.ExplicitIndexer):


 class CategoricalArray(BackendArray, Generic[K]):
+    """
+    A wrapper class meant to enable working with lazy categorical data.
+    We do not guarantee the stability of this API beyond that guaranteed
+    by :class:`xarray.backends.BackendArray`.
+    """
+
     _codes: ZarrOrHDF5Wrapper[K]
     _categories: ZarrArray | H5Array
     shape: tuple[int, ...]
@@ -97,6 +103,12 @@ def dtype(self):


 class MaskedArray(BackendArray, Generic[K]):
+    """
+    A wrapper class meant to enable working with lazy masked data.
+    We do not guarantee the stability of this API beyond that guaranteed
+    by :class:`xarray.backends.BackendArray`.
+    """
+
     _mask: ZarrOrHDF5Wrapper[K]
     _values: ZarrOrHDF5Wrapper[K]
     _dtype_str: Literal["nullable-integer", "nullable-boolean"]
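To see where the wrappers documented above surface in practice, here is a hedged end-to-end sketch. It assumes this branch's `read_lazy` entry point, and the store name `demo.zarr` is a throwaway placeholder.

```python
# Hedged end-to-end sketch; assumes this branch's `read_lazy` entry point.
import anndata as ad
import numpy as np
import pandas as pd

adata = ad.AnnData(
    X=np.random.rand(10, 5),
    obs=pd.DataFrame(
        {"group": pd.Categorical(["a", "b"] * 5)},
        index=[f"cell_{i}" for i in range(10)],
    ),
)
adata.write_zarr("demo.zarr")

lazy = ad.experimental.read_lazy("demo.zarr")
# Categorical and nullable obs columns are served through the CategoricalArray
# and MaskedArray wrappers documented above, not materialized pandas columns.
print(type(lazy.obs))
# Data is only pulled into memory on request:
mem = lazy.to_memory()
```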