From d3d6e7c9abd280638971aac6b69408dc57312663 Mon Sep 17 00:00:00 2001
From: ayushnag <35325113+ayushnag@users.noreply.github.com>
Date: Thu, 12 Dec 2024 14:32:45 -0500
Subject: [PATCH] add dmrpp docs to top level docs

---
 docs/user-reference/api/api.md |   6 +
 earthaccess/__init__.py        |   2 +-
 earthaccess/dmrpp_zarr.py      | 199 +++++++++++++++++++++++++++++++++
 earthaccess/dmrppzarr.py       | 197 --------------------------------
 4 files changed, 206 insertions(+), 198 deletions(-)
 create mode 100644 earthaccess/dmrpp_zarr.py
 delete mode 100644 earthaccess/dmrppzarr.py

diff --git a/docs/user-reference/api/api.md b/docs/user-reference/api/api.md
index 5972be8f..be0ddfb6 100644
--- a/docs/user-reference/api/api.md
+++ b/docs/user-reference/api/api.md
@@ -14,3 +14,9 @@ This library handles authentication with NASA’s OAuth2 API (EDL) and provides
         inherited_members: true
         show_root_heading: true
         show_source: false
+
+::: earthaccess.dmrpp_zarr
+    options:
+        inherited_members: true
+        show_root_heading: true
+        show_source: false
\ No newline at end of file
diff --git a/earthaccess/__init__.py b/earthaccess/__init__.py
index 60978c1e..35316c65 100644
--- a/earthaccess/__init__.py
+++ b/earthaccess/__init__.py
@@ -21,7 +21,7 @@
     search_services,
 )
 from .auth import Auth
-from .dmrppzarr import open_virtual_dataset, open_virtual_mfdataset
+from .dmrpp_zarr import open_virtual_dataset, open_virtual_mfdataset
 from .kerchunk import consolidate_metadata
 from .search import DataCollection, DataCollections, DataGranule, DataGranules
 from .services import DataServices
diff --git a/earthaccess/dmrpp_zarr.py b/earthaccess/dmrpp_zarr.py
new file mode 100644
index 00000000..9445e5d9
--- /dev/null
+++ b/earthaccess/dmrpp_zarr.py
@@ -0,0 +1,199 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import earthaccess
+
+if TYPE_CHECKING:
+    import xarray as xr
+
+
+def open_virtual_mfdataset(
+    granules: list[earthaccess.DataGranule],
+    group: str | None = None,
+    access: str = "indirect",
+    load: bool = False,
+    preprocess: callable | None = None,  # type: ignore
+    parallel: bool = True,
+    **xr_combine_nested_kwargs: Any,
+) -> xr.Dataset:
+    """Open multiple granules as a single virtual xarray Dataset.
+
+    Uses NASA DMR++ metadata files to create a virtual xarray dataset with ManifestArrays. This virtual dataset can be used to create zarr reference files. See [https://virtualizarr.readthedocs.io](https://virtualizarr.readthedocs.io) for more information on virtual xarray datasets.
+
+    > WARNING: This feature is currently experimental and may change in the future. It relies on DMR++ metadata files, which are not always present for a dataset; if they are missing you will get a `FileNotFoundError`.
+
+    Parameters:
+        granules:
+            The granules to open
+        group:
+            Path to the netCDF4 group in the given file to open. If None, the root group will be opened. If the DMR++ file does not have groups, this parameter is ignored.
+        access:
+            The access method to use. One of "direct" or "indirect". Use "direct" when running on AWS and "indirect" when running on a local machine.
+        load:
+            Create an xarray dataset with indexes and lazily loaded data.
+
+            When `load=True`, creates a lazy, numpy/dask-backed xarray dataset with indexes; all the data is then accessible, but it is not loaded into memory. When `load=False`, a virtual xarray dataset is created with ManifestArrays.
+            This virtual dataset is a view over the underlying metadata and chunks; it allows creating and concatenating zarr reference files but cannot load data on its own. See https://virtualizarr.readthedocs.io/en/latest/ for more information on virtual xarray datasets.
+        preprocess:
+            A function to apply to each virtual dataset before combining
+        parallel:
+            Open the virtual datasets in parallel (using dask.delayed)
+        xr_combine_nested_kwargs:
+            Keyword arguments for xarray.combine_nested, describing how to concatenate the datasets.
+            See [https://docs.xarray.dev/en/stable/generated/xarray.combine_nested.html](https://docs.xarray.dev/en/stable/generated/xarray.combine_nested.html)
+
+    Returns:
+        Concatenated xarray.Dataset
+
+    Examples:
+        ```python
+        >>> results = earthaccess.search_data(count=5, temporal=("2024"), short_name="MUR-JPL-L4-GLOB-v4.1")
+        >>> vds = earthaccess.open_virtual_mfdataset(results, access="indirect", load=False, concat_dim="time", coords='minimal', compat='override', combine_attrs="drop_conflicts")
+        >>> vds
+        <xarray.Dataset> Size: 29GB
+        Dimensions:  (time: 5, lat: 17999, lon: 36000)
+        Coordinates:
+            time     (time) int32 20B ManifestArray<...>
+            ...
+        >>> vds.virtualize.to_kerchunk("mur_combined.json", format="json")
+        >>> vds = earthaccess.open_virtual_mfdataset(results, access="indirect", load=True, concat_dim="time", coords='minimal', compat='override', combine_attrs="drop_conflicts")
+        >>> vds
+        <xarray.Dataset> Size: 143GB
+        Dimensions:           (time: 5, lat: 17999, lon: 36000)
+        Coordinates:
+          * lat               (lat) float32 72kB -89.99 -89.98 -89.97 ... 89.98 89.99
+          * lon               (lon) float32 144kB -180.0 -180.0 -180.0 ... 180.0 180.0
+          * time              (time) datetime64[ns] 40B 2024-01-01T09:00:00 ... 2024-...
+        Data variables:
+            analysed_sst      (time, lat, lon) float64 26GB dask.array<...>
+            analysis_error    (time, lat, lon) float64 26GB dask.array<...>
+            dt_1km_data       (time, lat, lon) timedelta64[ns] 26GB dask.array<...>
+            mask              (time, lat, lon) float32 13GB dask.array<...>
+            sea_ice_fraction  (time, lat, lon) float64 26GB dask.array<...>
+            sst_anomaly       (time, lat, lon) float64 26GB dask.array<...>
+        Attributes: (12/42)
+            Conventions:      CF-1.7
+            title:            Daily MUR SST, Final product
+            ...
+        ```
+    """
+    import virtualizarr as vz
+    import xarray as xr
+
+    if access == "direct":
+        fs = earthaccess.get_s3_filesystem(results=granules[0])
+        fs.storage_options["anon"] = False  # type: ignore
+    else:
+        fs = earthaccess.get_fsspec_https_session()
+    if parallel:
+        import dask
+
+        # wrap vz.open_virtual_dataset and preprocess with dask.delayed
+        open_ = dask.delayed(vz.open_virtual_dataset)  # type: ignore
+        if preprocess is not None:
+            preprocess = dask.delayed(preprocess)  # type: ignore
+    else:
+        open_ = vz.open_virtual_dataset  # type: ignore
+    vdatasets = []
+    # Get list of virtual datasets (or dask delayed objects)
+    for g in granules:
+        vdatasets.append(
+            open_(
+                filepath=g.data_links(access=access)[0] + ".dmrpp",
+                filetype="dmrpp",  # type: ignore
+                group=group,
+                indexes={},
+                reader_options={"storage_options": fs.storage_options},  # type: ignore
+            )
+        )
+    if preprocess is not None:
+        vdatasets = [preprocess(ds) for ds in vdatasets]
+    if parallel:
+        vdatasets = dask.compute(vdatasets)[0]  # type: ignore
+    if len(vdatasets) == 1:
+        vds = vdatasets[0]
+    else:
+        vds = xr.combine_nested(vdatasets, **xr_combine_nested_kwargs)
+    if load:
+        refs = vds.virtualize.to_kerchunk(filepath=None, format="dict")
+        return xr.open_dataset(
+            "reference://",
+            engine="zarr",
+            chunks={},
+            backend_kwargs={
+                "consolidated": False,
+                "storage_options": {
+                    "fo": refs,  # codespell:ignore
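+                    # "fo" (file object) hands the in-memory kerchunk
+                    # references to fsspec's reference filesystem; the
+                    # remote_* options below tell it how to reach the
+                    # actual granule bytes over S3 or HTTPS.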
"remote_protocol": fs.protocol, + "remote_options": fs.storage_options, # type: ignore + }, + }, + ) + return vds + + +def open_virtual_dataset( + granule: earthaccess.DataGranule, + group: str | None = None, + access: str = "indirect", + load: bool = False, +) -> xr.Dataset: + """Open a granule as a single virtual xarray Dataset. + + Uses NASA DMR++ metadata files to create a virtual xarray dataset with ManifestArrays. This virtual dataset can be used to create zarr reference files. See [https://virtualizarr.readthedocs.io](https://virtualizarr.readthedocs.io) for more information on virtual xarray datasets. + + > WARNING: This feature is current experimental and may change in the future. This feature relies on DMR++ metadata files which may not always be present for your dataset and you may get a `FileNotFoundError`. + + Parameters: + granule: + The granule to open + group: + Path to the netCDF4 group in the given file to open. If None, the root group will be opened. If the DMR++ file does not have groups, this parameter is ignored. + access: + The access method to use. One of "direct" or "indirect". Use direct when running on AWS, use indirect when running on a local machine. + load: + Create an xarray dataset with indexes and lazy loaded data. + + When true, creates a lazy loaded, numpy/dask backed xarray dataset with indexes. Note that when `load=True` all the data is now available to access but not loaded into memory. When `load=False` a virtual xarray dataset is created with ManifestArrays. This virtual dataset is a view over the underlying metadata and chunks and allows creation and concatenation of zarr reference files. This virtual dataset cannot load data on it's own and see https://virtualizarr.readthedocs.io/en/latest/ for more information on virtual xarray datasets. + + Returns: + ---------- + xr.Dataset + + Examples: + ---------- + >>> results = earthaccess.search_data(count=2, temporal=("2023"), short_name="SWOT_L2_LR_SSH_Expert_2.0") + >>> vds = earthaccess.open_virtual_dataset(results[0], access="indirect", load=False) + >>> vds + Size: 149MB + Dimensions: (num_lines: 9866, num_pixels: 69, + num_sides: 2) + Coordinates: + longitude (num_lines, num_pixels) int32 3MB ... + latitude (num_lines, num_pixels) int32 3MB ... + latitude_nadir (num_lines) int32 39kB ManifestArr... + longitude_nadir (num_lines) int32 39kB ManifestArr... + Dimensions without coordinates: num_lines, num_pixels, num_sides + Data variables: (12/98) + height_cor_xover_qual (num_lines, num_pixels) uint8 681kB ManifestArray>> vds.virtualize.to_kerchunk("swot_2023_ref.json", format="json") + """ + return open_virtual_mfdataset( + granules=[granule], + group=group, + access=access, + load=load, + parallel=False, + preprocess=None, + ) diff --git a/earthaccess/dmrppzarr.py b/earthaccess/dmrppzarr.py deleted file mode 100644 index aafb0709..00000000 --- a/earthaccess/dmrppzarr.py +++ /dev/null @@ -1,197 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -import earthaccess - -if TYPE_CHECKING: - import xarray as xr - - -def open_virtual_mfdataset( - granules: list[earthaccess.DataGranule], - group: str | None = None, - access: str = "indirect", - load: bool = False, - preprocess: callable | None = None, # type: ignore - parallel: bool = True, - **xr_combine_nested_kwargs: Any, -) -> xr.Dataset: - """Open multiple granules as a single virtual xarray Dataset. WARNING: This feature is current experimental and may change in the future. 
-    This feature relies on dmr++ metadata files which may not always be present for your dataset.
-
-    Uses DMR++ metadata files to create a virtual xarray dataset with ManifestArrays. This virtual dataset can be used to create zarr reference files. See https://virtualizarr.readthedocs.io/en/latest/ for more information on virtual xarray datasets.
-
-    Parameters
-    ----------
-    granules : list[earthaccess.DataGranule]
-        The granules to open
-    group : str or None (default=None)
-        The group to open in the DMR++. If groups are present in the DMR++ files, this will open the specified group. If None, the root group will be opened.
-        If the DMR++ file does not have groups, this parameter is ignored.
-    access : str (default="indirect")
-        The access method to use. One of "direct" or "indirect". Use direct when running on AWS, use indirect when running on a local machine.
-    load : bool (default=False)
-        Create an xarray dataset with indexes and lazy loaded data.
-
-        When true, creates a lazy loaded, numpy/dask backed xarray dataset with indexes. Note that when `load=True` all the data is now available to access but not loaded into memory. When `load=False` a virtual xarray dataset is created with ManifestArrays. This virtual dataset is a view over the underlying metadata and chunks and allows creation and concatenation of zarr reference files. This virtual dataset cannot load data on it's own and see https://virtualizarr.readthedocs.io/en/latest/ for more information on virtual xarray datasets.
-    preprocess : callable (default=None)
-        A function to apply to each virtual dataset before combining
-    parallel : bool (default=True)
-        Open the virtual datasets in parallel (using dask.delayed)
-    xr_combine_nested_kwargs : dict
-        Keyword arguments for xarray.combine_nested.
-        See https://docs.xarray.dev/en/stable/generated/xarray.combine_nested.html
-
-    Returns:
-    ----------
-    xr.Dataset
-
-    Examplea:
-    ----------
-    >>> results = earthaccess.search_data(count=5, temporal=("2024"), short_name="MUR-JPL-L4-GLOB-v4.1")
-    >>> vds = open_virtual_mfdataset(results, access="indirect", load=False, concat_dim="time", coords='minimal', compat='override', combine_attrs="drop_conflicts")
-    >>> vds
-    <xarray.Dataset> Size: 29GB
-    Dimensions:  (time: 5, lat: 17999, lon: 36000)
-    Coordinates:
-        time     (time) int32 20B ManifestArray<...>
-        ...
-    >>> vds = open_virtual_mfdataset(results, access="indirect", load=True, concat_dim="time", coords='minimal', compat='override', combine_attrs="drop_conflicts")
-    >>> vds
-    <xarray.Dataset> Size: 143GB
-    Dimensions:           (time: 5, lat: 17999, lon: 36000)
-    Coordinates:
-      * lat               (lat) float32 72kB -89.99 -89.98 -89.97 ... 89.98 89.99
-      * lon               (lon) float32 144kB -180.0 -180.0 -180.0 ... 180.0 180.0
-      * time              (time) datetime64[ns] 40B 2024-01-01T09:00:00 ... 2024-...
-    Data variables:
-        analysed_sst      (time, lat, lon) float64 26GB dask.array<...>
-        analysis_error    (time, lat, lon) float64 26GB dask.array<...>
-        dt_1km_data       (time, lat, lon) timedelta64[ns] 26GB dask.array<...>
-        mask              (time, lat, lon) float32 13GB dask.array<...>
-        sea_ice_fraction  (time, lat, lon) float64 26GB dask.array<...>
-        sst_anomaly       (time, lat, lon) float64 26GB dask.array<...>
-    Attributes: (12/42)
-        Conventions:      CF-1.7
-        title:            Daily MUR SST, Final product
-    """
-    import virtualizarr as vz
-    import xarray as xr
-
-    if access == "direct":
-        fs = earthaccess.get_s3_filesystem(results=granules[0])
-        fs.storage_options["anon"] = False  # type: ignore
-    else:
-        fs = earthaccess.get_fsspec_https_session()
-    if parallel:
-        import dask
-
-        # wrap _open_virtual_dataset and preprocess with delayed
-        open_ = dask.delayed(vz.open_virtual_dataset)  # type: ignore
-        if preprocess is not None:
-            preprocess = dask.delayed(preprocess)  # type: ignore
-    else:
-        open_ = vz.open_virtual_dataset  # type: ignore
-    vdatasets = []
-    # Get list of virtual datasets (or dask delayed objects)
-    for g in granules:
-        vdatasets.append(
-            open_(
-                filepath=g.data_links(access=access)[0] + ".dmrpp",
-                filetype="dmrpp",  # type: ignore
-                group=group,
-                indexes={},
-                reader_options={"storage_options": fs.storage_options},  # type: ignore
-            )
-        )
-    if preprocess is not None:
-        vdatasets = [preprocess(ds) for ds in vdatasets]
-    if parallel:
-        vdatasets = dask.compute(vdatasets)[0]  # type: ignore
-    if len(vdatasets) == 1:
-        vds = vdatasets[0]
-    else:
-        vds = xr.combine_nested(vdatasets, **xr_combine_nested_kwargs)
-    if load:
-        refs = vds.virtualize.to_kerchunk(filepath=None, format="dict")
-        return xr.open_dataset(
-            "reference://",
-            engine="zarr",
-            chunks={},
-            backend_kwargs={
-                "consolidated": False,
-                "storage_options": {
-                    "fo": refs,  # codespell:ignore
-                    "remote_protocol": fs.protocol,
-                    "remote_options": fs.storage_options,  # type: ignore
-                },
-            },
-        )
-    return vds
-
-
-def open_virtual_dataset(
-    granule: earthaccess.DataGranule,
-    group: str | None = None,
-    access: str = "indirect",
-    load: bool = False,
-) -> xr.Dataset:
-    """Open a granule as a single virtual xarray Dataset. WARNING: This feature is current experimental and may change in the future. This feature relies on dmr++ metadata files which may not always be present for your dataset.
-
-    Uses DMR++ metadata files to create a virtual xarray dataset with ManifestArrays. This virtual dataset can be used to create zarr reference files. See https://virtualizarr.readthedocs.io/en/latest/ for more information on virtual xarray datasets.
-
-    Parameters
-    ----------
-    granule : earthaccess.DataGranule
-        The granule to open
-    group : str or None (default=None)
-        The group to open in the DMR++. If groups are present in the DMR++ files, this will open the specified group. If None, the root group will be opened.
-        If the DMR++ file does not have groups, this parameter is ignored.
-    access : str (default="indirect")
-        The access method to use. One of "direct" or "indirect". Use direct when running on AWS, use indirect when running on a local machine.
-    load: bool (default=False)
-        Create an xarray dataset with indexes and lazy loaded data.
-
-        When true, creates a lazy loaded, numpy/dask backed xarray dataset with indexes. Note that when `load=True` all the data is now available to access but not loaded into memory. When `load=False` a virtual xarray dataset is created with ManifestArrays.
-        This virtual dataset is a view over the underlying metadata and chunks and allows creation and concatenation of zarr reference files. This virtual dataset cannot load data on it's own and see https://virtualizarr.readthedocs.io/en/latest/ for more information on virtual xarray datasets.
-
-
-    Returns:
-    ----------
-    xr.Dataset
-
-    Examples:
-    ----------
-    >>> results = earthaccess.search_data(count=2, temporal=("2023"), short_name="SWOT_L2_LR_SSH_Expert_2.0")
-    >>> open_virtual_dataset(results[0], access="indirect", load=False)
-    <xarray.Dataset> Size: 149MB
-    Dimensions:                (num_lines: 9866, num_pixels: 69, num_sides: 2)
-    Coordinates:
-        longitude              (num_lines, num_pixels) int32 3MB ...
-        latitude               (num_lines, num_pixels) int32 3MB ...
-        latitude_nadir         (num_lines) int32 39kB ManifestArr...
-        longitude_nadir        (num_lines) int32 39kB ManifestArr...
-    Dimensions without coordinates: num_lines, num_pixels, num_sides
-    Data variables: (12/98)
-        height_cor_xover_qual  (num_lines, num_pixels) uint8 681kB ManifestArray<...>
-    """
-    return open_virtual_mfdataset(
-        granules=[granule],
-        group=group,
-        access=access,
-        load=load,
-        parallel=False,
-        preprocess=None,
-    )
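
---

Below is a minimal end-to-end sketch of the API this patch documents, assuming an authenticated Earthdata Login session and a collection that publishes DMR++ sidecar files; the collection short name, date range, and output filename are illustrative:

```python
import earthaccess

earthaccess.login()

# Granules must have .dmrpp sidecar files next to their data links;
# otherwise open_virtual_mfdataset raises FileNotFoundError.
results = earthaccess.search_data(
    short_name="MUR-JPL-L4-GLOB-v4.1",  # illustrative collection
    temporal=("2024-01-01", "2024-01-05"),
    count=5,
)

# load=False: a virtual dataset of ManifestArrays; no data is read yet.
vds = earthaccess.open_virtual_mfdataset(
    results,
    access="indirect",  # use "direct" when running inside AWS
    load=False,
    concat_dim="time",
    coords="minimal",
    compat="override",
    combine_attrs="drop_conflicts",
)

# Persist the combined zarr references for later kerchunk-style reads.
vds.virtualize.to_kerchunk("mur_jan2024_refs.json", format="json")
```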