diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c6ce0c6..ec21ddb6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,9 @@ and this project uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html) ### Added +- Add support for opening data files with virtualizarr and NASA dmrpp with `open_virtual_dataset` + ([#605](https://github.com/nsidc/earthaccess/issues/605)) + ([@ayushnag](https://github.com/ayushnag)) - Add support for `NETRC` environment variable to override default `.netrc` file location ([#480](https://github.com/nsidc/earthaccess/issues/480)) ([@chuckwondo](https://github.com/chuckwondo)) diff --git a/docs/user-reference/api/api.md b/docs/user-reference/api/api.md index 5972be8f..ee0ec581 100644 --- a/docs/user-reference/api/api.md +++ b/docs/user-reference/api/api.md @@ -14,3 +14,9 @@ This library handles authentication with NASA’s OAuth2 API (EDL) and provides inherited_members: true show_root_heading: true show_source: false + +::: earthaccess.dmrpp_zarr + options: + inherited_members: true + show_root_heading: true + show_source: false diff --git a/earthaccess/__init__.py b/earthaccess/__init__.py index 73f7ed2d..35316c65 100644 --- a/earthaccess/__init__.py +++ b/earthaccess/__init__.py @@ -21,6 +21,7 @@ search_services, ) from .auth import Auth +from .dmrpp_zarr import open_virtual_dataset, open_virtual_mfdataset from .kerchunk import consolidate_metadata from .search import DataCollection, DataCollections, DataGranule, DataGranules from .services import DataServices @@ -58,6 +59,9 @@ "Store", # kerchunk "consolidate_metadata", + # virtualizarr + "open_virtual_dataset", + "open_virtual_mfdataset", "PROD", "UAT", ] diff --git a/earthaccess/dmrpp_zarr.py b/earthaccess/dmrpp_zarr.py new file mode 100644 index 00000000..79902868 --- /dev/null +++ b/earthaccess/dmrpp_zarr.py @@ -0,0 +1,199 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import earthaccess + +if TYPE_CHECKING: + import xarray as xr + + +def open_virtual_mfdataset( + granules: list[earthaccess.DataGranule], + group: str | None = None, + access: str = "indirect", + load: bool = False, + preprocess: callable | None = None, # type: ignore + parallel: bool = True, + **xr_combine_nested_kwargs: Any, +) -> xr.Dataset: + """Open multiple granules as a single virtual xarray Dataset. + + Uses NASA DMR++ metadata files to create a virtual xarray dataset with ManifestArrays. This virtual dataset can be used to create zarr reference files. See [https://virtualizarr.readthedocs.io](https://virtualizarr.readthedocs.io) for more information on virtual xarray datasets. + + > WARNING: This feature is current experimental and may change in the future. This feature relies on DMR++ metadata files which may not always be present for your dataset and you may get a `FileNotFoundError`. + + Parameters: + granules: + The granules to open + group: + Path to the netCDF4 group in the given file to open. If None, the root group will be opened. If the DMR++ file does not have groups, this parameter is ignored. + access: + The access method to use. One of "direct" or "indirect". Use direct when running on AWS, use indirect when running on a local machine. + load: + Create an xarray dataset with indexes and lazy loaded data. + + When true, creates a lazy loaded, numpy/dask backed xarray dataset with indexes. Note that when `load=True` all the data is now available to access but not loaded into memory. When `load=False` a virtual xarray dataset is created with ManifestArrays. This virtual dataset is a view over the underlying metadata and chunks and allows creation and concatenation of zarr reference files. This virtual dataset cannot load data on it's own and see https://virtualizarr.readthedocs.io/en/latest/ for more information on virtual xarray datasets. + preprocess: + A function to apply to each virtual dataset before combining + parallel: + Open the virtual datasets in parallel (using dask.delayed) + xr_combine_nested_kwargs: + Xarray arguments describing how to concatenate the datasets. Keyword arguments for xarray.combine_nested. + See [https://docs.xarray.dev/en/stable/generated/xarray.combine_nested.html](https://docs.xarray.dev/en/stable/generated/xarray.combine_nested.html) + + Returns: + Concatenated xarray.Dataset + + Examples: + ```python + >>> results = earthaccess.search_data(count=5, temporal=("2024"), short_name="MUR-JPL-L4-GLOB-v4.1") + >>> vds = earthaccess.open_virtual_mfdataset(results, access="indirect", load=False, concat_dim="time", coords='minimal', compat='override', combine_attrs="drop_conflicts") + >>> vds + Size: 29GB + Dimensions: (time: 5, lat: 17999, lon: 36000) + Coordinates: + time (time) int32 20B ManifestArray>> vds.virtualize.to_kerchunk("mur_combined.json", format="json") + >>> vds = open_virtual_mfdataset(results, access="indirect", load=True, concat_dim="time", coords='minimal', compat='override', combine_attrs="drop_conflicts") + >>> vds + Size: 143GB + Dimensions: (time: 5, lat: 17999, lon: 36000) + Coordinates: + * lat (lat) float32 72kB -89.99 -89.98 -89.97 ... 89.98 89.99 + * lon (lon) float32 144kB -180.0 -180.0 -180.0 ... 180.0 180.0 + * time (time) datetime64[ns] 40B 2024-01-01T09:00:00 ... 2024-... + Data variables: + analysed_sst (time, lat, lon) float64 26GB dask.array + analysis_error (time, lat, lon) float64 26GB dask.array + dt_1km_data (time, lat, lon) timedelta64[ns] 26GB dask.array + mask (time, lat, lon) float32 13GB dask.array + sea_ice_fraction (time, lat, lon) float64 26GB dask.array + sst_anomaly (time, lat, lon) float64 26GB dask.array + Attributes: (12/42) + Conventions: CF-1.7 + title: Daily MUR SST, Final product + ``` + """ + import virtualizarr as vz + import xarray as xr + + if access == "direct": + fs = earthaccess.get_s3_filesystem(results=granules[0]) + fs.storage_options["anon"] = False # type: ignore + else: + fs = earthaccess.get_fsspec_https_session() + if parallel: + import dask + + # wrap _open_virtual_dataset and preprocess with delayed + open_ = dask.delayed(vz.open_virtual_dataset) # type: ignore + if preprocess is not None: + preprocess = dask.delayed(preprocess) # type: ignore + else: + open_ = vz.open_virtual_dataset # type: ignore + vdatasets = [] + # Get list of virtual datasets (or dask delayed objects) + for g in granules: + vdatasets.append( + open_( + filepath=g.data_links(access=access)[0] + ".dmrpp", + filetype="dmrpp", # type: ignore + group=group, + indexes={}, + reader_options={"storage_options": fs.storage_options}, # type: ignore + ) + ) + if preprocess is not None: + vdatasets = [preprocess(ds) for ds in vdatasets] + if parallel: + vdatasets = dask.compute(vdatasets)[0] # type: ignore + if len(vdatasets) == 1: + vds = vdatasets[0] + else: + vds = xr.combine_nested(vdatasets, **xr_combine_nested_kwargs) + if load: + refs = vds.virtualize.to_kerchunk(filepath=None, format="dict") + return xr.open_dataset( + "reference://", + engine="zarr", + chunks={}, + backend_kwargs={ + "consolidated": False, + "storage_options": { + "fo": refs, # codespell:ignore + "remote_protocol": fs.protocol, + "remote_options": fs.storage_options, # type: ignore + }, + }, + ) + return vds + + +def open_virtual_dataset( + granule: earthaccess.DataGranule, + group: str | None = None, + access: str = "indirect", + load: bool = False, +) -> xr.Dataset: + """Open a granule as a single virtual xarray Dataset. + + Uses NASA DMR++ metadata files to create a virtual xarray dataset with ManifestArrays. This virtual dataset can be used to create zarr reference files. See [https://virtualizarr.readthedocs.io](https://virtualizarr.readthedocs.io) for more information on virtual xarray datasets. + + > WARNING: This feature is current experimental and may change in the future. This feature relies on DMR++ metadata files which may not always be present for your dataset and you may get a `FileNotFoundError`. + + Parameters: + granule: + The granule to open + group: + Path to the netCDF4 group in the given file to open. If None, the root group will be opened. If the DMR++ file does not have groups, this parameter is ignored. + access: + The access method to use. One of "direct" or "indirect". Use direct when running on AWS, use indirect when running on a local machine. + load: + Create an xarray dataset with indexes and lazy loaded data. + + When true, creates a lazy loaded, numpy/dask backed xarray dataset with indexes. Note that when `load=True` all the data is now available to access but not loaded into memory. When `load=False` a virtual xarray dataset is created with ManifestArrays. This virtual dataset is a view over the underlying metadata and chunks and allows creation and concatenation of zarr reference files. This virtual dataset cannot load data on it's own and see https://virtualizarr.readthedocs.io/en/latest/ for more information on virtual xarray datasets. + + Returns: + xarray.Dataset + + Examples: + ```python + >>> results = earthaccess.search_data(count=2, temporal=("2023"), short_name="SWOT_L2_LR_SSH_Expert_2.0") + >>> vds = earthaccess.open_virtual_dataset(results[0], access="indirect", load=False) + >>> vds + Size: 149MB + Dimensions: (num_lines: 9866, num_pixels: 69, + num_sides: 2) + Coordinates: + longitude (num_lines, num_pixels) int32 3MB ... + latitude (num_lines, num_pixels) int32 3MB ... + latitude_nadir (num_lines) int32 39kB ManifestArr... + longitude_nadir (num_lines) int32 39kB ManifestArr... + Dimensions without coordinates: num_lines, num_pixels, num_sides + Data variables: (12/98) + height_cor_xover_qual (num_lines, num_pixels) uint8 681kB ManifestArray>> vds.virtualize.to_kerchunk("swot_2023_ref.json", format="json") + ``` + """ + return open_virtual_mfdataset( + granules=[granule], + group=group, + access=access, + load=load, + parallel=False, + preprocess=None, + ) diff --git a/pyproject.toml b/pyproject.toml index 5403ccba..6e227789 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,9 @@ kerchunk = [ "h5netcdf", "xarray", ] +virtualizarr = [ + "virtualizarr >=1.2.0" +] dev = [ "bump-my-version >=0.10.0", "nox", @@ -82,6 +85,7 @@ test = [ "types-setuptools >=0.1", "vcrpy >=6.0.1", "earthaccess[kerchunk]", + "earthaccess[virtualizarr]", ] docs = [ "jupyterlab >=3", diff --git a/tests/integration/test_virtualizarr.py b/tests/integration/test_virtualizarr.py new file mode 100644 index 00000000..95128a76 --- /dev/null +++ b/tests/integration/test_virtualizarr.py @@ -0,0 +1,55 @@ +import logging +import os +import unittest + +import earthaccess +import pytest + +logger = logging.getLogger(__name__) +assertions = unittest.TestCase("__init__") + +assertions.assertTrue("EARTHDATA_USERNAME" in os.environ) +assertions.assertTrue("EARTHDATA_PASSWORD" in os.environ) + +logger.info(f"Current username: {os.environ['EARTHDATA_USERNAME']}") +logger.info(f"earthaccess version: {earthaccess.__version__}") + + +@pytest.fixture(scope="module", params=["MUR25-JPL-L4-GLOB-v04.2"]) +def granule(request): + granules = earthaccess.search_data( + count=1, temporal=("2024"), short_name=request.param + ) + return granules[0] + + +def test_dmrpp(granule): + from virtualizarr import open_virtual_dataset # type: ignore + + fs = earthaccess.get_fsspec_https_session() + data_path = granule.data_links(access="indirect")[0] + dmrpp_path = data_path + ".dmrpp" + + result = open_virtual_dataset( + dmrpp_path, + filetype="dmrpp", # type: ignore + indexes={}, + reader_options={"storage_options": fs.storage_options}, # type: ignore + ) + + expected = open_virtual_dataset( + data_path, + indexes={}, + reader_options={"storage_options": fs.storage_options}, # type: ignore + ) + + # TODO: replace with xr.testing when virtualizarr fill_val is fixed (https://github.com/zarr-developers/VirtualiZarr/issues/287) + # and dmrpp deflateLevel (zlib compression level) is always present (https://github.com/OPENDAP/bes/issues/954) + for var in result.variables: + assert var in expected.variables + assert result[var].dims == expected[var].dims + assert result[var].shape == expected[var].shape + assert result[var].dtype == expected[var].dtype + assert result[var].data.manifest == expected[var].data.manifest + assert set(result.coords) == set(expected.coords) + assert result.attrs == expected.attrs diff --git a/uv.lock b/uv.lock index 20fbcd9c..6e6acce3 100644 --- a/uv.lock +++ b/uv.lock @@ -801,7 +801,7 @@ sdist = { url = "https://files.pythonhosted.org/packages/a2/55/8f8cab2afd404cf57 [[package]] name = "earthaccess" -version = "0.11.0" +version = "0.12.0" source = { editable = "." } dependencies = [ { name = "fsspec" }, @@ -868,8 +868,12 @@ test = [ { name = "types-requests", version = "2.32.0.20241016", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12' and platform_python_implementation != 'PyPy'" }, { name = "types-setuptools" }, { name = "vcrpy" }, + { name = "virtualizarr" }, { name = "xarray" }, ] +virtualizarr = [ + { name = "virtualizarr" }, +] [package.metadata] requires-dist = [ @@ -878,6 +882,7 @@ requires-dist = [ { name = "dask", marker = "extra == 'docs'", specifier = ">=2024.8.0" }, { name = "dask", marker = "extra == 'kerchunk'" }, { name = "earthaccess", extras = ["kerchunk"], marker = "extra == 'test'" }, + { name = "earthaccess", extras = ["virtualizarr"], marker = "extra == 'test'" }, { name = "fsspec", specifier = ">=2022.11" }, { name = "h5netcdf", marker = "extra == 'docs'", specifier = ">=0.11" }, { name = "h5netcdf", marker = "extra == 'kerchunk'" }, @@ -920,6 +925,7 @@ requires-dist = [ { name = "typing-extensions", specifier = ">=4.10.0" }, { name = "uv", marker = "extra == 'dev'", specifier = ">=0.4.7" }, { name = "vcrpy", marker = "extra == 'test'", specifier = ">=6.0.1" }, + { name = "virtualizarr", marker = "extra == 'virtualizarr'", specifier = ">=1.2.0" }, { name = "widgetsnbextension", marker = "extra == 'docs'", specifier = ">=3.6.0" }, { name = "xarray", marker = "extra == 'docs'", specifier = ">=2023.1" }, { name = "xarray", marker = "extra == 'kerchunk'" }, @@ -3840,6 +3846,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/70/bf/ecd14d3cf6127f8a990b01f0ad20e257f5619a555f47d707c57d39934894/ujson-5.10.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:baed37ea46d756aca2955e99525cc02d9181de67f25515c468856c38d52b5f3b", size = 42224 }, ] +[[package]] +name = "universal-pathlib" +version = "0.2.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fsspec" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/24/67/6c31ba464eafda05c677628dd7859ed4904597a78694d9cc81b593c6bad2/universal_pathlib-0.2.5.tar.gz", hash = "sha256:ea5d4fb8178c2ab469cf4fa46d0ceb16ccb378da46dbbc28a8b9c1eebdccc655", size = 174755 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/d9/289d308f889aac33639703a60906e3a0f3ec97419b7ca5bedaddc77648fd/universal_pathlib-0.2.5-py3-none-any.whl", hash = "sha256:a634f700eca827b4ad03bfa0267e51161560dd1de83b051cf0fccf39b3e56b32", size = 49892 }, +] + [[package]] name = "uri-template" version = "1.3.0" @@ -3934,6 +3952,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ae/92/78324ff89391e00c8f4cf6b8526c41c6ef36b4ea2d2c132250b1a6fc2b8d/virtualenv-20.27.1-py3-none-any.whl", hash = "sha256:f11f1b8a29525562925f745563bfd48b189450f61fb34c4f9cc79dd5aa32a1f4", size = 3117838 }, ] +[[package]] +name = "virtualizarr" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numcodecs" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "ujson" }, + { name = "universal-pathlib" }, + { name = "xarray" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8c/c1/dcc401a62173622a2fdad88c90563282449acc0b7a460c3e33c5edc9aac8/virtualizarr-1.2.0.tar.gz", hash = "sha256:0eb9aecf09fd347d1691b407ed3798f55f038a3044bd9ccd23bd9c49f90e036d", size = 129907 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cd/ef/316b14edf592da78580ba4bd596174965d05f268401f1dec07c0e7d6ac0b/virtualizarr-1.2.0-py3-none-any.whl", hash = "sha256:b9a71766adf407655431e0cf6e5a1909407aa6e6e6bab2cbe1a7355d789d0629", size = 128921 }, +] + [[package]] name = "watchdog" version = "6.0.0"