Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

correctly encode/decode _FillValues/missing_values/dtypes for packed data #8713

Merged
merged 19 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/user-guide/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,7 @@ __ https://numpy.org/doc/stable/user/basics.indexing.html#assigning-values-to-in
You can also assign values to all variables of a :py:class:`Dataset` at once:

.. ipython:: python
:okwarning:

ds_org = xr.tutorial.open_dataset("eraint_uvz").isel(
latitude=slice(56, 59), longitude=slice(255, 258), level=0
Expand Down
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ Bug fixes
when used in :py:meth:`DataArray.expand_dims` and
::py:meth:`Dataset.expand_dims` (:pull:`8781`). By `Spencer
Clark <https://github.com/spencerkclark>`_.
- CF conform handling of `_FillValue`/`missing_value` and `dtype` in
`CFMaskCoder`/`CFScaleOffsetCoder` (:issue:`2304`, :issue:`5597`,
:issue:`7691`, :pull:`8713`, see also discussion in :pull:`7654`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Documentation
~~~~~~~~~~~~~
Expand Down
183 changes: 126 additions & 57 deletions xarray/coding/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,44 @@ def _is_time_like(units):
return any(tstr == units for tstr in time_strings)


def _check_fill_values(attrs, name, dtype):
""" "Check _FillValue and missing_value if available.

Return dictionary with raw fill values and set with encoded fill values.

Issue SerializationWarning if appropriate.
"""
raw_fill_dict = {}
[
pop_to(attrs, raw_fill_dict, attr, name=name)
for attr in ("missing_value", "_FillValue")
]
Comment on lines +272 to +276
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This pops missing_values and/or _FillValue into temporary raw_fill_dict.

encoded_fill_values = set()
for k in list(raw_fill_dict):
v = raw_fill_dict[k]
kfill = {fv for fv in np.ravel(v) if not pd.isnull(fv)}
if not kfill and np.issubdtype(dtype, np.integer):
warnings.warn(
f"variable {name!r} has non-conforming {k!r} "
f"{v!r} defined, dropping {k!r} entirely.",
SerializationWarning,
stacklevel=3,
)
del raw_fill_dict[k]
else:
encoded_fill_values |= kfill
Comment on lines +278 to +290
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Iterate over the (two) possible keys and extract the provided fill values. If the extracted fill values are empty (due to filtering with pd.isnull) a warning is issued for integer type data and the according key is deleted from the dict. This prevents from moving the nonconforming fill value into encoding.


if len(encoded_fill_values) > 1:
warnings.warn(
f"variable {name!r} has multiple fill values "
f"{encoded_fill_values} defined, decoding all values to NaN.",
SerializationWarning,
stacklevel=3,
)
Comment on lines +292 to +298
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we have multiple fill values after the procedure a warning is issued.


return raw_fill_dict, encoded_fill_values


class CFMaskCoder(VariableCoder):
"""Mask or unmask fill values according to CF conventions."""

Expand All @@ -283,67 +321,56 @@ def encode(self, variable: Variable, name: T_Name = None):
f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
)

# special case DateTime to properly handle NaT
is_time_like = _is_time_like(attrs.get("units"))

if fv_exists:
# Ensure _FillValue is cast to same dtype as data's
encoding["_FillValue"] = dtype.type(fv)
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
if not pd.isnull(fill_value):
if is_time_like and data.dtype.kind in "iu":
data = duck_array_ops.where(
data != np.iinfo(np.int64).min, data, fill_value
)
else:
data = duck_array_ops.fillna(data, fill_value)

if mv_exists:
# Ensure missing_value is cast to same dtype as data's
encoding["missing_value"] = dtype.type(mv)
# try to use _FillValue, if it exists to align both values
# or use missing_value and ensure it's cast to same dtype as data's
encoding["missing_value"] = attrs.get("_FillValue", dtype.type(mv))
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
if not pd.isnull(fill_value) and not fv_exists:
if is_time_like and data.dtype.kind in "iu":
data = duck_array_ops.where(
data != np.iinfo(np.int64).min, data, fill_value
)
else:
data = duck_array_ops.fillna(data, fill_value)

# apply fillna
if not pd.isnull(fill_value):
# special case DateTime to properly handle NaT
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
data = duck_array_ops.where(
data != np.iinfo(np.int64).min, data, fill_value
)
else:
data = duck_array_ops.fillna(data, fill_value)

return Variable(dims, data, attrs, encoding, fastpath=True)

def decode(self, variable: Variable, name: T_Name = None):
dims, data, attrs, encoding = unpack_for_decoding(variable)

raw_fill_values = [
pop_to(attrs, encoding, attr, name=name)
for attr in ("missing_value", "_FillValue")
]
if raw_fill_values:
encoded_fill_values = {
fv
for option in raw_fill_values
for fv in np.ravel(option)
if not pd.isnull(fv)
}

if len(encoded_fill_values) > 1:
warnings.warn(
"variable {!r} has multiple fill values {}, "
"decoding all values to NaN.".format(name, encoded_fill_values),
SerializationWarning,
stacklevel=3,
)
raw_fill_dict, encoded_fill_values = _check_fill_values(
variable.attrs, name, variable.dtype
)

# special case DateTime to properly handle NaT
dtype: np.typing.DTypeLike
decoded_fill_value: Any
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
else:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
if raw_fill_dict:
dims, data, attrs, encoding = unpack_for_decoding(variable)
[
safe_setitem(encoding, attr, value, name=name)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move remaining key, value pairs into encoding.

for attr, value in raw_fill_dict.items()
]

if encoded_fill_values:
# special case DateTime to properly handle NaT
dtype: np.typing.DTypeLike
decoded_fill_value: Any
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
else:
if "scale_factor" not in attrs and "add_offset" not in attrs:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
else:
dtype, decoded_fill_value = (
_choose_float_dtype(data.dtype, attrs),
np.nan,
)

transform = partial(
_apply_mask,
encoded_fill_values=encoded_fill_values,
Expand All @@ -366,20 +393,51 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp
return data


def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]:
def _choose_float_dtype(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function checks for the most appropriate dtype to use when encoding/decoding in CFScaleOffsetCoder.

dtype: np.dtype, mapping: MutableMapping
) -> type[np.floating[Any]]:
"""Return a float dtype that can losslessly represent `dtype` values."""
# Keep float32 as-is. Upcast half-precision to single-precision,
# check scale/offset first to derive wanted float dtype
# see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954
scale_factor = mapping.get("scale_factor")
add_offset = mapping.get("add_offset")
if scale_factor is not None or add_offset is not None:
# get the type from scale_factor/add_offset to determine
# the needed floating point type
if scale_factor is not None:
scale_type = np.dtype(type(scale_factor))
if add_offset is not None:
offset_type = np.dtype(type(add_offset))
# CF conforming, both scale_factor and add-offset are given and
# of same floating point type (float32/64)
if (
add_offset is not None
and scale_factor is not None
and offset_type == scale_type
and scale_type in [np.float32, np.float64]
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
):
# in case of int32 -> we need upcast to float64
# due to precision issues
if dtype.itemsize == 4 and np.issubdtype(dtype, np.integer):
return np.float64
return scale_type.type
# Not CF conforming and add_offset given:
# A scale factor is entirely safe (vanishing into the mantissa),
# but a large integer offset could lead to loss of precision.
# Sensitivity analysis can be tricky, so we just use a float64
# if there's any offset at all - better unoptimised than wrong!
if add_offset is not None:
return np.float64
# return dtype depending on given scale_factor
return scale_type.type
# If no scale_factor or add_offset is given, use some general rules.
# Keep float32 as-is. Upcast half-precision to single-precision,
# because float16 is "intended for storage but not computation"
if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
return np.float32
# float32 can exactly represent all integers up to 24 bits
if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer):
# A scale factor is entirely safe (vanishing into the mantissa),
# but a large integer offset could lead to loss of precision.
# Sensitivity analysis can be tricky, so we just use a float64
# if there's any offset at all - better unoptimised than wrong!
if not has_offset:
return np.float32
return np.float32
# For all other types and circumstances, we just use float64.
# (safe because eg. complex numbers are not supported in NetCDF)
return np.float64
Expand All @@ -396,7 +454,12 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
dims, data, attrs, encoding = unpack_for_encoding(variable)

if "scale_factor" in encoding or "add_offset" in encoding:
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
# if we have a _FillValue/masked_value we do not want to cast now
# but leave that to CFMaskCoder
dtype = data.dtype
if "_FillValue" not in encoding and "missing_value" not in encoding:
dtype = _choose_float_dtype(data.dtype, encoding)
# but still we need a copy prevent changing original data
data = duck_array_ops.astype(data, dtype=dtype, copy=True)
if "add_offset" in encoding:
data -= pop_to(encoding, attrs, "add_offset", name=name)
Expand All @@ -412,11 +475,17 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:

scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
add_offset = pop_to(attrs, encoding, "add_offset", name=name)
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
if np.ndim(scale_factor) > 0:
scale_factor = np.asarray(scale_factor).item()
if np.ndim(add_offset) > 0:
add_offset = np.asarray(add_offset).item()
# if we have a _FillValue/masked_value we already have the wanted
# floating point dtype here (via CFMaskCoder), so no check is necessary
# only check in other cases
dtype = data.dtype
if "_FillValue" not in encoding and "missing_value" not in encoding:
dtype = _choose_float_dtype(dtype, encoding)

transform = partial(
_scale_offset_decoding,
scale_factor=scale_factor,
Expand Down
Loading
Loading