From c93b31a9175eed9e506eb1950bf843d7de715bb9 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 30 Nov 2023 22:58:54 -0500 Subject: [PATCH] Add mode='a-': Do not overwrite coordinates when appending to Zarr with `append_dim` (#8428) Co-authored-by: Deepak Cherian Co-authored-by: Anderson Banihirwe <13301940+andersy005@users.noreply.github.com> --- doc/whats-new.rst | 4 ++-- xarray/backends/api.py | 22 ++++++++++++---------- xarray/backends/zarr.py | 21 ++++++++++++++++++--- xarray/core/dataarray.py | 18 ++++++++++++------ xarray/core/dataset.py | 12 +++++++----- xarray/core/types.py | 3 +++ xarray/tests/test_backends.py | 25 ++++++++++++++++++++++++- 7 files changed, 78 insertions(+), 27 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index abc3760df6e..817ea2c8235 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,8 +25,8 @@ New Features - Use a concise format when plotting datetime arrays. (:pull:`8449`). By `Jimmy Westling `_. - - +- Avoid overwriting unchanged existing coordinate variables when appending by setting ``mode='a-'``. + By `Ryan Abernathey `_ and `Deepak Cherian `_. - :py:meth:`~xarray.DataArray.rank` now operates on dask-backed arrays, assuming the core dim has exactly one chunk. (:pull:`8475`). By `Maximilian Roos `_. diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 3e6d00a8059..c59f2f8d81b 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -39,6 +39,7 @@ from xarray.core.dataset import Dataset, _get_chunk, _maybe_chunk from xarray.core.indexes import Index from xarray.core.parallelcompat import guess_chunkmanager +from xarray.core.types import ZarrWriteModes from xarray.core.utils import is_remote_uri if TYPE_CHECKING: @@ -69,7 +70,6 @@ "NETCDF4", "NETCDF4_CLASSIC", "NETCDF3_64BIT", "NETCDF3_CLASSIC" ] - DATAARRAY_NAME = "__xarray_dataarray_name__" DATAARRAY_VARIABLE = "__xarray_dataarray_variable__" @@ -1577,7 +1577,7 @@ def to_zarr( dataset: Dataset, store: MutableMapping | str | os.PathLike[str] | None = None, chunk_store: MutableMapping | str | os.PathLike | None = None, - mode: Literal["w", "w-", "a", "r+", None] = None, + mode: ZarrWriteModes | None = None, synchronizer=None, group: str | None = None, encoding: Mapping | None = None, @@ -1601,7 +1601,7 @@ def to_zarr( dataset: Dataset, store: MutableMapping | str | os.PathLike[str] | None = None, chunk_store: MutableMapping | str | os.PathLike | None = None, - mode: Literal["w", "w-", "a", "r+", None] = None, + mode: ZarrWriteModes | None = None, synchronizer=None, group: str | None = None, encoding: Mapping | None = None, @@ -1623,7 +1623,7 @@ def to_zarr( dataset: Dataset, store: MutableMapping | str | os.PathLike[str] | None = None, chunk_store: MutableMapping | str | os.PathLike | None = None, - mode: Literal["w", "w-", "a", "r+", None] = None, + mode: ZarrWriteModes | None = None, synchronizer=None, group: str | None = None, encoding: Mapping | None = None, @@ -1680,16 +1680,18 @@ def to_zarr( else: mode = "w-" - if mode != "a" and append_dim is not None: + if mode not in ["a", "a-"] and append_dim is not None: raise ValueError("cannot set append_dim unless mode='a' or mode=None") - if mode not in ["a", "r+"] and region is not None: - raise ValueError("cannot set region unless mode='a', mode='r+' or mode=None") + if mode not in ["a", "a-", "r+"] and region is not None: + raise ValueError( + "cannot set region unless mode='a', mode='a-', mode='r+' or mode=None" + ) - if mode not in ["w", "w-", "a", "r+"]: + if mode not in ["w", "w-", "a", "a-", "r+"]: raise ValueError( "The only supported options for mode are 'w', " - f"'w-', 'a' and 'r+', but mode={mode!r}" + f"'w-', 'a', 'a-', and 'r+', but mode={mode!r}" ) # validate Dataset keys, DataArray names @@ -1745,7 +1747,7 @@ def to_zarr( write_empty=write_empty_chunks, ) - if mode in ["a", "r+"]: + if mode in ["a", "a-", "r+"]: _validate_datatypes_for_zarr_append(zstore, dataset) if append_dim is not None: existing_dims = zstore.get_dimensions() diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index c437c42183a..469bbf4c339 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -21,6 +21,7 @@ from xarray.core import indexing from xarray.core.parallelcompat import guess_chunkmanager from xarray.core.pycompat import integer_types +from xarray.core.types import ZarrWriteModes from xarray.core.utils import ( FrozenDict, HiddenKeyDict, @@ -385,7 +386,7 @@ class ZarrStore(AbstractWritableDataStore): def open_group( cls, store, - mode="r", + mode: ZarrWriteModes = "r", synchronizer=None, group=None, consolidated=False, @@ -410,7 +411,8 @@ def open_group( zarr_version = getattr(store, "_store_version", 2) open_kwargs = dict( - mode=mode, + # mode='a-' is a handcrafted xarray specialty + mode="a" if mode == "a-" else mode, synchronizer=synchronizer, path=group, ) @@ -639,8 +641,21 @@ def store( self.set_attributes(attributes) self.set_dimensions(variables_encoded, unlimited_dims=unlimited_dims) + # if we are appending to an append_dim, only write either + # - new variables not already present, OR + # - variables with the append_dim in their dimensions + # We do NOT overwrite other variables. + if self._mode == "a-" and self._append_dim is not None: + variables_to_set = { + k: v + for k, v in variables_encoded.items() + if (k not in existing_variable_names) or (self._append_dim in v.dims) + } + else: + variables_to_set = variables_encoded + self.set_variables( - variables_encoded, check_encoding_set, writer, unlimited_dims=unlimited_dims + variables_to_set, check_encoding_set, writer, unlimited_dims=unlimited_dims ) if self._consolidate_on_close: zarr.consolidate_metadata(self.zarr_group.store) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index bac4ad36adb..935eff9fb18 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -49,7 +49,12 @@ from xarray.core.indexing import is_fancy_indexer, map_index_queries from xarray.core.merge import PANDAS_TYPES, MergeError from xarray.core.options import OPTIONS, _get_keep_attrs -from xarray.core.types import DaCompatible, T_DataArray, T_DataArrayOrSet +from xarray.core.types import ( + DaCompatible, + T_DataArray, + T_DataArrayOrSet, + ZarrWriteModes, +) from xarray.core.utils import ( Default, HybridMappingProxy, @@ -4074,7 +4079,7 @@ def to_zarr( self, store: MutableMapping | str | PathLike[str] | None = None, chunk_store: MutableMapping | str | PathLike | None = None, - mode: Literal["w", "w-", "a", "r+", None] = None, + mode: ZarrWriteModes | None = None, synchronizer=None, group: str | None = None, *, @@ -4095,7 +4100,7 @@ def to_zarr( self, store: MutableMapping | str | PathLike[str] | None = None, chunk_store: MutableMapping | str | PathLike | None = None, - mode: Literal["w", "w-", "a", "r+", None] = None, + mode: ZarrWriteModes | None = None, synchronizer=None, group: str | None = None, encoding: Mapping | None = None, @@ -4114,7 +4119,7 @@ def to_zarr( self, store: MutableMapping | str | PathLike[str] | None = None, chunk_store: MutableMapping | str | PathLike | None = None, - mode: Literal["w", "w-", "a", "r+", None] = None, + mode: ZarrWriteModes | None = None, synchronizer=None, group: str | None = None, encoding: Mapping | None = None, @@ -4150,10 +4155,11 @@ def to_zarr( chunk_store : MutableMapping, str or path-like, optional Store or path to directory in local or remote file system only for Zarr array chunks. Requires zarr-python v2.4.0 or later. - mode : {"w", "w-", "a", "r+", None}, optional + mode : {"w", "w-", "a", "a-", r+", None}, optional Persistence mode: "w" means create (overwrite if exists); "w-" means create (fail if exists); - "a" means override existing variables (create if does not exist); + "a" means override all existing variables including dimension coordinates (create if does not exist); + "a-" means only append those variables that have ``append_dim``. "r+" means modify existing array *values* only (raise an error if any metadata or shapes would change). The default mode is "a" if ``append_dim`` is set. Otherwise, it is diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 66c83e95b77..c65bbd6b849 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -100,6 +100,7 @@ T_Chunks, T_DataArrayOrSet, T_Dataset, + ZarrWriteModes, ) from xarray.core.utils import ( Default, @@ -2305,7 +2306,7 @@ def to_zarr( self, store: MutableMapping | str | PathLike[str] | None = None, chunk_store: MutableMapping | str | PathLike | None = None, - mode: Literal["w", "w-", "a", "r+", None] = None, + mode: ZarrWriteModes | None = None, synchronizer=None, group: str | None = None, encoding: Mapping | None = None, @@ -2328,7 +2329,7 @@ def to_zarr( self, store: MutableMapping | str | PathLike[str] | None = None, chunk_store: MutableMapping | str | PathLike | None = None, - mode: Literal["w", "w-", "a", "r+", None] = None, + mode: ZarrWriteModes | None = None, synchronizer=None, group: str | None = None, encoding: Mapping | None = None, @@ -2349,7 +2350,7 @@ def to_zarr( self, store: MutableMapping | str | PathLike[str] | None = None, chunk_store: MutableMapping | str | PathLike | None = None, - mode: Literal["w", "w-", "a", "r+", None] = None, + mode: ZarrWriteModes | None = None, synchronizer=None, group: str | None = None, encoding: Mapping | None = None, @@ -2387,10 +2388,11 @@ def to_zarr( chunk_store : MutableMapping, str or path-like, optional Store or path to directory in local or remote file system only for Zarr array chunks. Requires zarr-python v2.4.0 or later. - mode : {"w", "w-", "a", "r+", None}, optional + mode : {"w", "w-", "a", "a-", r+", None}, optional Persistence mode: "w" means create (overwrite if exists); "w-" means create (fail if exists); - "a" means override existing variables (create if does not exist); + "a" means override all existing variables including dimension coordinates (create if does not exist); + "a-" means only append those variables that have ``append_dim``. "r+" means modify existing array *values* only (raise an error if any metadata or shapes would change). The default mode is "a" if ``append_dim`` is set. Otherwise, it is diff --git a/xarray/core/types.py b/xarray/core/types.py index 1be5b00c43f..90f0f94e679 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -282,3 +282,6 @@ def copy( "midpoint", "nearest", ] + + +ZarrWriteModes = Literal["w", "w-", "a", "a-", "r+", "r"] diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index d60daefc728..062f5de7d20 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2390,6 +2390,29 @@ def test_append_with_new_variable(self) -> None: xr.open_dataset(store_target, engine="zarr", **self.version_kwargs), ) + def test_append_with_append_dim_no_overwrite(self) -> None: + ds, ds_to_append, _ = create_append_test_data() + with self.create_zarr_target() as store_target: + ds.to_zarr(store_target, mode="w", **self.version_kwargs) + original = xr.concat([ds, ds_to_append], dim="time") + original2 = xr.concat([original, ds_to_append], dim="time") + + # overwrite a coordinate; + # for mode='a-', this will not get written to the store + # because it does not have the append_dim as a dim + ds_to_append.lon.data[:] = -999 + ds_to_append.to_zarr( + store_target, mode="a-", append_dim="time", **self.version_kwargs + ) + actual = xr.open_dataset(store_target, engine="zarr", **self.version_kwargs) + assert_identical(original, actual) + + # by default, mode="a" will overwrite all coordinates. + ds_to_append.to_zarr(store_target, append_dim="time", **self.version_kwargs) + actual = xr.open_dataset(store_target, engine="zarr", **self.version_kwargs) + original2.lon.data[:] = -999 + assert_identical(original2, actual) + @requires_dask def test_to_zarr_compute_false_roundtrip(self) -> None: from dask.delayed import Delayed @@ -2586,7 +2609,7 @@ def setup_and_verify_store(expected=data): with pytest.raises( ValueError, match=re.escape( - "cannot set region unless mode='a', mode='r+' or mode=None" + "cannot set region unless mode='a', mode='a-', mode='r+' or mode=None" ), ): data.to_zarr(