diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 01a2b4afc55..c0a3e6bbf4e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,7 +13,7 @@ repos: - id: mixed-line-ending - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: 'v0.4.7' + rev: 'v0.5.0' hooks: - id: ruff args: ["--fix", "--show-fixes"] @@ -30,7 +30,7 @@ repos: additional_dependencies: ["black==24.4.2"] - id: blackdoc-autoupdate-black - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.10.0 + rev: v1.10.1 hooks: - id: mypy # Copied from setup.cfg diff --git a/doc/conf.py b/doc/conf.py index d0a26e19a84..630563f81e2 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -97,7 +97,7 @@ } # sphinx-copybutton configurations -copybutton_prompt_text = r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: " +copybutton_prompt_text = r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.{3,}: | {5,8}: " copybutton_prompt_is_regexp = True # nbsphinx configurations diff --git a/doc/internals/how-to-add-new-backend.rst b/doc/internals/how-to-add-new-backend.rst index 4352dd3df5b..a979abe34e2 100644 --- a/doc/internals/how-to-add-new-backend.rst +++ b/doc/internals/how-to-add-new-backend.rst @@ -4,7 +4,7 @@ How to add a new backend ------------------------ Adding a new backend for read support to Xarray does not require -to integrate any code in Xarray; all you need to do is: +one to integrate any code in Xarray; all you need to do is: - Create a class that inherits from Xarray :py:class:`~xarray.backends.BackendEntrypoint` and implements the method ``open_dataset`` see :ref:`RST backend_entrypoint` diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index da414bc383e..85c47334858 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -19,6 +19,81 @@ format (recommended). np.random.seed(123456) +You can `read different types of files <https://docs.xarray.dev/en/stable/user-guide/io.html>`_ +in `xr.open_dataset` by specifying the engine to be used: + +.. 
ipython:: python + :okexcept: + :suppress: + + import xarray as xr + + xr.open_dataset("my_file.grib", engine="cfgrib") + +The "engine" provides a set of instructions that tells xarray how +to read the data and pack them into a `dataset` (or `dataarray`). +These instructions are stored in an underlying "backend". + +Xarray comes with several backends that cover many common data formats. +Many more backends are available via external libraries, or you can `write your own <https://docs.xarray.dev/en/stable/internals/how-to-add-new-backend.html>`_. +This diagram aims to help you determine - based on the format of the file you'd like to read - +which type of backend you're using and how to use it. + +Text and boxes are clickable for more information. +Following the diagram is detailed information on many popular backends. +You can learn more about using and developing backends in the +`Xarray tutorial JupyterBook <https://tutorial.xarray.dev/advanced/backends/backends.html>`_. + +.. mermaid:: + :alt: Flowchart illustrating how to choose the right backend engine to read your data + + flowchart LR + built-in-eng["""Is your data stored in one of these formats? + - netCDF4 (netcdf4) + - netCDF3 (scipy) + - Zarr (zarr) + - DODS/OPeNDAP (pydap) + - HDF5 (h5netcdf) + """] + + built-in("""You're in luck! Xarray bundles a backend for this format. + Open data using xr.open_dataset(). We recommend + always setting the engine you want to use.""") + + installed-eng["""One of these formats? + - GRIB (cfgrib) + - TileDB (tiledb) + - GeoTIFF, JPEG-2000, ESRI-hdf (rioxarray, via GDAL) + - Sentinel-1 SAFE (xarray-sentinel) + """] + + installed("""Install the package indicated in parentheses to your + Python environment. Restart the kernel and use + xr.open_dataset(files, engine='rioxarray').""") + + other("""Ask around to see if someone in your data community + has created an Xarray backend for your data type. 
+ If not, you may need to create your own or consider + exporting your data to a more common format.""") + + built-in-eng -->|Yes| built-in + built-in-eng -->|No| installed-eng + + installed-eng -->|Yes| installed + installed-eng -->|No| other + + click built-in-eng "https://docs.xarray.dev/en/stable/getting-started-guide/faq.html#how-do-i-open-format-x-file-as-an-xarray-dataset" + click other "https://docs.xarray.dev/en/stable/internals/how-to-add-new-backend.html" + + classDef quesNodefmt fill:#9DEEF4,stroke:#206C89,text-align:left + class built-in-eng,installed-eng quesNodefmt + + classDef ansNodefmt fill:#FFAA05,stroke:#E37F17,text-align:left,white-space:nowrap + class built-in,installed,other ansNodefmt + + linkStyle default font-size:20pt,color:#206C89 + + .. _io.netcdf: netCDF diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e14b064aeda..fe678e7b7ee 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -84,11 +84,14 @@ Bug fixes Documentation ~~~~~~~~~~~~~ -- Adds a flow-chart diagram to help users navigate help resources (`Discussion #8990 <https://github.com/pydata/xarray/discussions/8990>`_). +- Adds intro to backend section of docs, including a flow-chart to navigate types of backends (:pull:`9175`). + By `Jessica Scheick <https://github.com/jessicas11>`_. +- Adds a flow-chart diagram to help users navigate help resources (`Discussion #8990 <https://github.com/pydata/xarray/discussions/8990>`_, :pull:`9147`). By `Jessica Scheick <https://github.com/jessicas11>`_. - Improvements to Zarr & chunking docs (:pull:`9139`, :pull:`9140`, :pull:`9132`) By `Maximilian Roos <https://github.com/max-sixty>`_. - +- Fix copybutton for multi line examples and double digit ipython cell numbers (:pull:`9264`). + By `Moritz Schreiber <https://github.com/mosc9575>`_. 
Internal Changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 85a1a6e214c..8c526ddb58d 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -806,7 +806,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No for k2, v2 in attrs.items(): encoded_attrs[k2] = self.encode_attribute(v2) - if coding.strings.check_vlen_dtype(dtype) == str: + if coding.strings.check_vlen_dtype(dtype) is str: dtype = str if self._write_empty is not None: diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index cd902257902..ef01f4cc79a 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -507,7 +507,7 @@ def __contains__(self, key: Any) -> bool: result = self.get_loc(key) return ( is_scalar(result) - or type(result) == slice + or isinstance(result, slice) or (isinstance(result, np.ndarray) and result.size > 0) ) except (KeyError, TypeError, ValueError): diff --git a/xarray/coding/strings.py b/xarray/coding/strings.py index db95286f6aa..d16ec52d645 100644 --- a/xarray/coding/strings.py +++ b/xarray/coding/strings.py @@ -39,11 +39,11 @@ def check_vlen_dtype(dtype): def is_unicode_dtype(dtype): - return dtype.kind == "U" or check_vlen_dtype(dtype) == str + return dtype.kind == "U" or check_vlen_dtype(dtype) is str def is_bytes_dtype(dtype): - return dtype.kind == "S" or check_vlen_dtype(dtype) == bytes + return dtype.kind == "S" or check_vlen_dtype(dtype) is bytes class EncodedStringCoder(VariableCoder): @@ -104,7 +104,7 @@ def encode_string_array(string_array, encoding="utf-8"): def ensure_fixed_length_bytes(var: Variable) -> Variable: """Ensure that a variable with vlen bytes is converted to fixed width.""" - if check_vlen_dtype(var.dtype) == bytes: + if check_vlen_dtype(var.dtype) is bytes: dims, data, attrs, encoding = unpack_for_encoding(var) # TODO: figure out how to handle this with dask data = np.asarray(data, dtype=np.bytes_) diff --git 
a/xarray/coding/variables.py b/xarray/coding/variables.py index d19f285d2b9..8a3afe650f2 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -520,9 +520,19 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: # trying to get it from encoding, resort to an int with the same precision as data.dtype if not available signed_dtype = np.dtype(encoding.get("dtype", f"i{data.dtype.itemsize}")) if "_FillValue" in attrs: - new_fill = np.array(attrs["_FillValue"]) - # use view here to prevent OverflowError - attrs["_FillValue"] = new_fill.view(signed_dtype).item() + try: + # user provided the on-disk signed fill + new_fill = signed_dtype.type(attrs["_FillValue"]) + except OverflowError: + # user provided the in-memory unsigned fill, convert to signed type + unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}") + # use view here to prevent OverflowError + new_fill = ( + np.array(attrs["_FillValue"], dtype=unsigned_dtype) + .view(signed_dtype) + .item() + ) + attrs["_FillValue"] = new_fill data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype) return Variable(dims, data, attrs, encoding, fastpath=True) @@ -667,7 +677,7 @@ def encode(self): raise NotImplementedError def decode(self, variable: Variable, name: T_Name = None) -> Variable: - if variable.dtype == object and variable.encoding.get("dtype", False) == str: + if variable.dtype.kind == "O" and variable.encoding.get("dtype", False) is str: variable = variable.astype(variable.encoding["dtype"]) return variable else: diff --git a/xarray/conventions.py b/xarray/conventions.py index ff1256883ba..d572b215d2d 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -273,7 +273,7 @@ def decode_cf_variable( var = strings.CharacterArrayCoder().decode(var, name=name) var = strings.EncodedStringCoder().decode(var) - if original_dtype == object: + if original_dtype.kind == "O": var = variables.ObjectVLenStringCoder().decode(var) original_dtype = var.dtype diff 
--git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 152a9ec40e9..4fa8736427d 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -166,7 +166,7 @@ def create_encoded_masked_and_scaled_data(dtype: np.dtype) -> Dataset: def create_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset: encoding = { - "_FillValue": np.int8(-1), + "_FillValue": 255, "_Unsigned": "true", "dtype": "i1", "add_offset": dtype.type(10), @@ -863,14 +863,14 @@ def test_roundtrip_empty_vlen_string_array(self) -> None: # checks preserving vlen dtype for empty arrays GH7862 dtype = create_vlen_dtype(str) original = Dataset({"a": np.array([], dtype=dtype)}) - assert check_vlen_dtype(original["a"].dtype) == str + assert check_vlen_dtype(original["a"].dtype) is str with self.roundtrip(original) as actual: assert_identical(original, actual) if np.issubdtype(actual["a"].dtype, object): # only check metadata for capable backends # eg. NETCDF3 based backends do not roundtrip metadata if actual["a"].dtype.metadata is not None: - assert check_vlen_dtype(actual["a"].dtype) == str + assert check_vlen_dtype(actual["a"].dtype) is str else: assert actual["a"].dtype == np.dtype(" None: assert decoded.variables[k].dtype == actual.variables[k].dtype assert_allclose(decoded, actual, decode_bytes=False) - @pytest.mark.parametrize("fillvalue", [np.int8(-1), np.uint8(255)]) + @pytest.mark.parametrize("fillvalue", [np.int8(-1), np.uint8(255), -1, 255]) def test_roundtrip_unsigned(self, fillvalue): # regression/numpy2 test for encoding = { diff --git a/xarray/tests/test_coding_strings.py b/xarray/tests/test_coding_strings.py index 51f63ea72dd..17179a44a8a 100644 --- a/xarray/tests/test_coding_strings.py +++ b/xarray/tests/test_coding_strings.py @@ -21,13 +21,13 @@ def test_vlen_dtype() -> None: dtype = strings.create_vlen_dtype(str) - assert dtype.metadata["element_type"] == str + assert dtype.metadata["element_type"] is str assert strings.is_unicode_dtype(dtype) 
assert not strings.is_bytes_dtype(dtype) assert strings.check_vlen_dtype(dtype) is str dtype = strings.create_vlen_dtype(bytes) - assert dtype.metadata["element_type"] == bytes + assert dtype.metadata["element_type"] is bytes assert not strings.is_unicode_dtype(dtype) assert strings.is_bytes_dtype(dtype) assert strings.check_vlen_dtype(dtype) is bytes diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index dc0b270dc51..ea518b6d677 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -564,10 +564,10 @@ def test_encode_cf_variable_with_vlen_dtype() -> None: ) encoded_v = conventions.encode_cf_variable(v) assert encoded_v.data.dtype.kind == "O" - assert coding.strings.check_vlen_dtype(encoded_v.data.dtype) == str + assert coding.strings.check_vlen_dtype(encoded_v.data.dtype) is str # empty array v = Variable(["x"], np.array([], dtype=coding.strings.create_vlen_dtype(str))) encoded_v = conventions.encode_cf_variable(v) assert encoded_v.data.dtype.kind == "O" - assert coding.strings.check_vlen_dtype(encoded_v.data.dtype) == str + assert coding.strings.check_vlen_dtype(encoded_v.data.dtype) is str diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index f7cff17bab5..31d77ca17e7 100644 --- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -593,7 +593,7 @@ def test_methods(self): ds = create_test_data() dt: DataTree = DataTree(data=ds) assert ds.mean().identical(dt.ds.mean()) - assert type(dt.ds.mean()) == xr.Dataset + assert isinstance(dt.ds.mean(), xr.Dataset) def test_arithmetic(self, create_test_datatree): dt = create_test_datatree() diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index afcf10ec125..3bbae55b105 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -347,7 +347,7 @@ def construct_dataarray(dim_num, dtype, contains_nan, dask): array = rng.randint(0, 10, 
size=shapes).astype(dtype) elif np.issubdtype(dtype, np.bool_): array = rng.randint(0, 1, size=shapes).astype(dtype) - elif dtype == str: + elif dtype is str: array = rng.choice(["a", "b", "c", "d"], size=shapes) else: raise ValueError diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 60c173a9e52..3f3f1756e45 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -178,8 +178,8 @@ def _assertIndexedLikeNDArray(self, variable, expected_value0, expected_dtype=No # check type or dtype is consistent for both ndarray and Variable if expected_dtype is None: # check output type instead of array dtype - assert type(variable.values[0]) == type(expected_value0) - assert type(variable[0].values) == type(expected_value0) + assert type(variable.values[0]) is type(expected_value0) + assert type(variable[0].values) is type(expected_value0) elif expected_dtype is not False: assert variable.values[0].dtype == expected_dtype assert variable[0].values.dtype == expected_dtype