From e63d15c4c9569abee38417cd60caca27cd334053 Mon Sep 17 00:00:00 2001 From: Mathijs Verhaegh Date: Mon, 8 Jul 2024 15:24:57 +0200 Subject: [PATCH 01/14] allow per-variable choice of mask_and_scale in open_dataset --- xarray/backends/api.py | 8 +++++--- xarray/conventions.py | 11 +++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 521bdf65e6a..a008fae9394 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -398,7 +398,7 @@ def open_dataset( chunks: T_Chunks = None, cache: bool | None = None, decode_cf: bool | None = None, - mask_and_scale: bool | None = None, + mask_and_scale: bool | dict[str, bool] | None = None, decode_times: bool | None = None, decode_timedelta: bool | None = None, use_cftime: bool | None = None, @@ -451,14 +451,16 @@ def open_dataset( decode_cf : bool, optional Whether to decode these variables, assuming they were saved according to CF conventions. - mask_and_scale : bool, optional + mask_and_scale : bool or mapping from variable name to bool, optional If True, replace array values equal to `_FillValue` with NA and scale values according to the formula `original_values * scale_factor + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are taken from variable attributes (if they exist). If the `_FillValue` or `missing_value` attribute contains multiple values a warning will be issued and all array values matching one of the multiple values will - be replaced by NA. This keyword may not be supported by all the backends. + be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. decode_times : bool, optional If True, decode times encoded in the standard NetCDF datetime format into datetime objects. Otherwise, leave them encoded as numbers. 
diff --git a/xarray/conventions.py b/xarray/conventions.py index 6eff45c5b2d..dd46710bf60 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -384,11 +384,18 @@ def _update_bounds_encoding(variables: T_Variables) -> None: bounds_encoding.setdefault("calendar", encoding["calendar"]) +def _item_or_default(obj: Mapping | Any, key: Hashable, default: Any = None): + """ + Return item by key if obj is mapping and key is present, else return default value. + """ + return (obj.get(key, default) if isinstance(obj, Mapping) else obj) + + def decode_cf_variables( variables: T_Variables, attributes: T_Attrs, concat_characters: bool = True, - mask_and_scale: bool = True, + mask_and_scale: bool | dict[str, bool] = True, decode_times: bool = True, decode_coords: bool | Literal["coordinates", "all"] = True, drop_variables: T_DropVariables = None, @@ -441,7 +448,7 @@ def stackable(dim: Hashable) -> bool: k, v, concat_characters=concat_characters, - mask_and_scale=mask_and_scale, + mask_and_scale=_item_or_default(mask_and_scale, k, True), decode_times=decode_times, stack_char_dim=stack_char_dim, use_cftime=use_cftime, From 4bbb924b69292b43502349648e15b869d105ecdf Mon Sep 17 00:00:00 2001 From: Mathijs Verhaegh Date: Mon, 8 Jul 2024 15:37:48 +0200 Subject: [PATCH 02/14] simplify docstring datatype --- xarray/backends/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index a008fae9394..5e3b636e8e3 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -451,7 +451,7 @@ def open_dataset( decode_cf : bool, optional Whether to decode these variables, assuming they were saved according to CF conventions. 
- mask_and_scale : bool or mapping from variable name to bool, optional + mask_and_scale : bool or dict, optional If True, replace array values equal to `_FillValue` with NA and scale values according to the formula `original_values * scale_factor + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are From fa90e7fb1ca2db2f4a3ff156bb40303f559c153c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 8 Jul 2024 13:43:02 +0000 Subject: [PATCH 03/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/conventions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index dd46710bf60..45b2284bb97 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -388,7 +388,7 @@ def _item_or_default(obj: Mapping | Any, key: Hashable, default: Any = None): """ Return item by key if obj is mapping and key is present, else return default value. 
""" - return (obj.get(key, default) if isinstance(obj, Mapping) else obj) + return obj.get(key, default) if isinstance(obj, Mapping) else obj def decode_cf_variables( From b218b3c70a050dc3c01636c628a50ca20bb453f8 Mon Sep 17 00:00:00 2001 From: Mathijs Verhaegh Date: Sat, 13 Jul 2024 00:39:54 +0200 Subject: [PATCH 04/14] dict -> Mapping in type annotation Co-authored-by: Michael Niklas --- xarray/backends/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 5e3b636e8e3..3ef7f3c06e6 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -398,7 +398,7 @@ def open_dataset( chunks: T_Chunks = None, cache: bool | None = None, decode_cf: bool | None = None, - mask_and_scale: bool | dict[str, bool] | None = None, + mask_and_scale: bool | Mapping[str, bool] | None = None, decode_times: bool | None = None, decode_timedelta: bool | None = None, use_cftime: bool | None = None, From 72483cbca856806e47133712c56527a2f4050803 Mon Sep 17 00:00:00 2001 From: Mathijs Verhaegh Date: Sat, 13 Jul 2024 00:46:33 +0200 Subject: [PATCH 05/14] use typevar for _item_or_default annotation Otherwise you lose all typing when you use that because it returns Any. 
--- xarray/conventions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 45b2284bb97..acbba4b839d 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -2,7 +2,7 @@ from collections import defaultdict from collections.abc import Hashable, Iterable, Mapping, MutableMapping -from typing import TYPE_CHECKING, Any, Literal, Union +from typing import TYPE_CHECKING, Any, Literal, Union, TypeVar import numpy as np import pandas as pd @@ -383,8 +383,8 @@ def _update_bounds_encoding(variables: T_Variables) -> None: if "calendar" in encoding: bounds_encoding.setdefault("calendar", encoding["calendar"]) - -def _item_or_default(obj: Mapping | Any, key: Hashable, default: Any = None): +T = TypeVar("T") +def _item_or_default(obj: Mapping[Any, T] | T, key: Hashable, default: T = None) -> T: """ Return item by key if obj is mapping and key is present, else return default value. """ From a47363cf94e3f2d74fe2e2d2b960f0f3064e1e24 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jul 2024 22:47:07 +0000 Subject: [PATCH 06/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/conventions.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index acbba4b839d..f92c2691943 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -2,7 +2,7 @@ from collections import defaultdict from collections.abc import Hashable, Iterable, Mapping, MutableMapping -from typing import TYPE_CHECKING, Any, Literal, Union, TypeVar +from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union import numpy as np import pandas as pd @@ -383,7 +383,10 @@ def _update_bounds_encoding(variables: T_Variables) -> None: if "calendar" in encoding: bounds_encoding.setdefault("calendar", encoding["calendar"]) + T = TypeVar("T") 
+ + def _item_or_default(obj: Mapping[Any, T] | T, key: Hashable, default: T = None) -> T: """ Return item by key if obj is mapping and key is present, else return default value. From 46de50f52d0f9f79d752898a86e8fefe825177f8 Mon Sep 17 00:00:00 2001 From: Mathijs Verhaegh Date: Sat, 13 Jul 2024 01:06:42 +0200 Subject: [PATCH 07/14] implement feature for 4 additional parameters --- xarray/backends/api.py | 18 +++++++++++++----- xarray/conventions.py | 20 ++++++++++---------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 3ef7f3c06e6..4727dcefb77 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -399,10 +399,10 @@ def open_dataset( cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | Mapping[str, bool] | None = None, - decode_times: bool | None = None, - decode_timedelta: bool | None = None, - use_cftime: bool | None = None, - concat_characters: bool | None = None, + decode_times: bool | Mapping[str, bool] | None = None, + decode_timedelta: bool | Mapping[str, bool] | None = None, + use_cftime: bool | Mapping[str, bool] | None = None, + concat_characters: bool | Mapping[str, bool] | None = None, decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, inline_array: bool = False, @@ -464,12 +464,16 @@ def open_dataset( decode_times : bool, optional If True, decode times encoded in the standard NetCDF datetime format into datetime objects. Otherwise, leave them encoded as numbers. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. This keyword may not be supported by all the backends. decode_timedelta : bool, optional If True, decode variables and coordinates with time units in {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} into timedelta objects. If False, leave them encoded as numbers. 
If None (default), assume the same value of decode_time. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. This keyword may not be supported by all the backends. use_cftime: bool, optional Only relevant if encoded dates come from a standard calendar @@ -480,12 +484,16 @@ def open_dataset( ``cftime.datetime`` objects, regardless of whether or not they can be represented using ``np.datetime64[ns]`` objects. If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible - raise an error. This keyword may not be supported by all the backends. + raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. concat_characters : bool, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and removed) if they have no corresponding variable and if they are only used as the last dimension of character arrays. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. This keyword may not be supported by all the backends. 
decode_coords : bool or {"coordinates", "all"}, optional Controls which variables are set as coordinate variables: diff --git a/xarray/conventions.py b/xarray/conventions.py index f92c2691943..2f20cc4c2a4 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -397,13 +397,13 @@ def _item_or_default(obj: Mapping[Any, T] | T, key: Hashable, default: T = None) def decode_cf_variables( variables: T_Variables, attributes: T_Attrs, - concat_characters: bool = True, - mask_and_scale: bool | dict[str, bool] = True, - decode_times: bool = True, + concat_characters: bool | Mapping[str, bool] = True, + mask_and_scale: bool | Mapping[str, bool] = True, + decode_times: bool | Mapping[str, bool] = True, decode_coords: bool | Literal["coordinates", "all"] = True, drop_variables: T_DropVariables = None, - use_cftime: bool | None = None, - decode_timedelta: bool | None = None, + use_cftime: bool | Mapping[str, bool] | None = None, + decode_timedelta: bool | Mapping[str, bool] | None = None, ) -> tuple[T_Variables, T_Attrs, set[Hashable]]: """ Decode several CF encoded variables. 
@@ -441,7 +441,7 @@ def stackable(dim: Hashable) -> bool: if k in drop_variables: continue stack_char_dim = ( - concat_characters + _item_or_default(concat_characters, k, True) and v.dtype == "S1" and v.ndim > 0 and stackable(v.dims[-1]) @@ -450,12 +450,12 @@ def stackable(dim: Hashable) -> bool: new_vars[k] = decode_cf_variable( k, v, - concat_characters=concat_characters, + concat_characters=_item_or_default(concat_characters, k, True), mask_and_scale=_item_or_default(mask_and_scale, k, True), - decode_times=decode_times, + decode_times=_item_or_default(decode_times, k, True), stack_char_dim=stack_char_dim, - use_cftime=use_cftime, - decode_timedelta=decode_timedelta, + use_cftime=_item_or_default(use_cftime, k, True), + decode_timedelta=_item_or_default(decode_timedelta, k, True), ) except Exception as e: raise type(e)(f"Failed to decode variable {k!r}: {e}") from e From 28e01ab5972ae9dafd68f15691db4dd15808b0c2 Mon Sep 17 00:00:00 2001 From: Mathijs Verhaegh Date: Sat, 13 Jul 2024 01:13:42 +0200 Subject: [PATCH 08/14] fix default value inconsistency --- xarray/conventions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 2f20cc4c2a4..d2016c6cf67 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -454,8 +454,8 @@ def stackable(dim: Hashable) -> bool: mask_and_scale=_item_or_default(mask_and_scale, k, True), decode_times=_item_or_default(decode_times, k, True), stack_char_dim=stack_char_dim, - use_cftime=_item_or_default(use_cftime, k, True), - decode_timedelta=_item_or_default(decode_timedelta, k, True), + use_cftime=_item_or_default(use_cftime, k), + decode_timedelta=_item_or_default(decode_timedelta, k), ) except Exception as e: raise type(e)(f"Failed to decode variable {k!r}: {e}") from e From a194d0d1a2ed5cf074600648e437aa645351c416 Mon Sep 17 00:00:00 2001 From: Mathijs Verhaegh Date: Sat, 13 Jul 2024 12:05:27 +0200 Subject: [PATCH 09/14] add what's new + None 
annotation --- doc/whats-new.rst | 3 +++ xarray/conventions.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6a8e898c93c..4d4291e050e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,9 @@ v2024.06.1 (unreleased) New Features ~~~~~~~~~~~~ +- Allow per-variable specification of ``mask_and_scale``, ``decode_times``, ``decode_timedelta``, + ``use_cftime`` and ``concat_characters`` params in :py:func:`~xarray.open_dataset` (:pull:`9218`). + By `Mathijs Verhaegh `_. - Allow chunking for arrays with duplicated dimension names (:issue:`8759`, :pull:`9099`). By `Martin Raspaud `_. - Extract the source url from fsspec objects (:issue:`9142`, :pull:`8923`). diff --git a/xarray/conventions.py b/xarray/conventions.py index d2016c6cf67..50d066f1f7c 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -387,7 +387,7 @@ def _update_bounds_encoding(variables: T_Variables) -> None: T = TypeVar("T") -def _item_or_default(obj: Mapping[Any, T] | T, key: Hashable, default: T | None = None) -> T: +def _item_or_default(obj: Mapping[Any, T] | T, key: Hashable, default: T | None = None) -> T: """ Return item by key if obj is mapping and key is present, else return default value. 
""" From 0f1e4328346d8a0e33e112bbf23de9cd8d358f67 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 13 Jul 2024 10:06:17 +0000 Subject: [PATCH 10/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/conventions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 50d066f1f7c..76171c060dd 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -387,7 +387,9 @@ def _update_bounds_encoding(variables: T_Variables) -> None: T = TypeVar("T") -def _item_or_default(obj: Mapping[Any, T] | T, key: Hashable, default: T | None = None) -> T: +def _item_or_default( + obj: Mapping[Any, T] | T, key: Hashable, default: T | None = None +) -> T: """ Return item by key if obj is mapping and key is present, else return default value. """ From 5eecad1df28ca56ba9db11dfd3ef6745ca0f2932 Mon Sep 17 00:00:00 2001 From: Mathijs Verhaegh Date: Sat, 13 Jul 2024 12:47:20 +0200 Subject: [PATCH 11/14] _item_or_default return type T | None --- xarray/conventions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 50d066f1f7c..89ea3017ef5 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -387,7 +387,7 @@ def _update_bounds_encoding(variables: T_Variables) -> None: T = TypeVar("T") -def _item_or_default(obj: Mapping[Any, T] | T, key: Hashable, default: T | None = None) -> T: +def _item_or_default(obj: Mapping[Any, T] | T, key: Hashable, default: T | None = None) -> T | None: """ Return item by key if obj is mapping and key is present, else return default value. 
""" From bc664f6ef90618eb6d5a965125aec3c2097a0604 Mon Sep 17 00:00:00 2001 From: Mathijs Verhaegh Date: Sat, 13 Jul 2024 13:11:55 +0200 Subject: [PATCH 12/14] remove default value of _item_or_default's default argument --- xarray/conventions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index a09e9d278f6..ba495637d73 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -388,8 +388,8 @@ def _update_bounds_encoding(variables: T_Variables) -> None: def _item_or_default( - obj: Mapping[Any, T] | T, key: Hashable, default: T | None = None -) -> T | None: + obj: Mapping[Any, T] | T, key: Hashable, default: T +) -> T: """ Return item by key if obj is mapping and key is present, else return default value. """ @@ -456,8 +456,8 @@ def stackable(dim: Hashable) -> bool: mask_and_scale=_item_or_default(mask_and_scale, k, True), decode_times=_item_or_default(decode_times, k, True), stack_char_dim=stack_char_dim, - use_cftime=_item_or_default(use_cftime, k), - decode_timedelta=_item_or_default(decode_timedelta, k), + use_cftime=_item_or_default(use_cftime, k, None), + decode_timedelta=_item_or_default(decode_timedelta, k, None), ) except Exception as e: raise type(e)(f"Failed to decode variable {k!r}: {e}") from e From 9e027288a95dc031b8eb3f08aaef4434899cfa52 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 13 Jul 2024 11:12:33 +0000 Subject: [PATCH 13/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/conventions.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index ba495637d73..ff1256883ba 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -387,9 +387,7 @@ def _update_bounds_encoding(variables: T_Variables) -> None: T = TypeVar("T") -def _item_or_default( - obj: Mapping[Any, T] | T, key: 
Hashable, default: T -) -> T: +def _item_or_default(obj: Mapping[Any, T] | T, key: Hashable, default: T) -> T: """ Return item by key if obj is mapping and key is present, else return default value. """ From 5762046bd838e10819fbd4cee37834df8a6a3b7b Mon Sep 17 00:00:00 2001 From: Mathijs Verhaegh Date: Tue, 16 Jul 2024 06:25:04 +0200 Subject: [PATCH 14/14] docstring dtype naming --- xarray/backends/api.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 4727dcefb77..ece60a2b161 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -451,7 +451,7 @@ def open_dataset( decode_cf : bool, optional Whether to decode these variables, assuming they were saved according to CF conventions. - mask_and_scale : bool or dict, optional + mask_and_scale : bool or dict-like, optional If True, replace array values equal to `_FillValue` with NA and scale values according to the formula `original_values * scale_factor + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are @@ -461,13 +461,13 @@ def open_dataset( be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - decode_times : bool, optional + decode_times : bool or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format into datetime objects. Otherwise, leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - decode_timedelta : bool, optional + decode_timedelta : bool or dict-like, optional If True, decode variables and coordinates with time units in {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} into timedelta objects. If False, leave them encoded as numbers. @@ -475,7 +475,7 @@ def open_dataset( Pass a mapping, e.g. 
``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - use_cftime: bool, optional + use_cftime: bool or dict-like, optional Only relevant if encoded dates come from a standard calendar (e.g. "gregorian", "proleptic_gregorian", "standard", or not specified). If None (default), attempt to decode times to @@ -487,7 +487,7 @@ def open_dataset( raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - concat_characters : bool, optional + concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and removed) if they have no corresponding variable and if they are only