Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add metadata consolidation utility #278

Merged
merged 13 commits into from
Dec 1, 2023
1 change: 1 addition & 0 deletions earthaccess/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
search_datasets,
)
from .auth import Auth
from .kerchunk import consolidate_metadata
from .search import DataCollections, DataGranules
from .store import Store

Expand Down
3 changes: 1 addition & 2 deletions earthaccess/api.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from typing import Any, Dict, List, Optional, Type, Union

import earthaccess
import requests
import s3fs
from fsspec import AbstractFileSystem

import earthaccess

from .auth import Auth
from .search import CollectionQuery, DataCollections, DataGranules, GranuleQuery
from .store import Store
Expand Down
56 changes: 56 additions & 0 deletions earthaccess/kerchunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from __future__ import annotations

import earthaccess
import fsspec
import s3fs


def _get_chunk_metadata(
granuale: earthaccess.results.DataGranule,
fs: fsspec.AbstractFileSystem | s3fs.S3FileSystem,
) -> list[dict]:
from kerchunk.hdf import SingleHdf5ToZarr

metadata = []
access = "direct" if isinstance(fs, s3fs.S3FileSystem) else "indirect"
for url in granuale.data_links(access=access):
with fs.open(url) as inf:
h5chunks = SingleHdf5ToZarr(inf, url)
m = h5chunks.translate()
metadata.append(m)
return metadata


def consolidate_metadata(
granuales: list[earthaccess.results.DataGranule],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if we try to kerchunk and consolidate non uniform datasets kerchunk will fail, if we try to kerchunk 1 granule kerchunk succeeds but MultiZarrToZarr requires a variable mapping, if we don't pass it fails, for one granule we should use kerchunk_options={"coo_map": {}} and then kerchunks stays happy.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that the same as not doing any combination at all?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess? or maybe we can just skip the MultiZarrToZarr when the input is only 1 file. I wanted to do a quick test on a granule from https://podaac.jpl.nasa.gov/MEaSUREs-MUR and ran into this issue.

kerchunk_options: dict | None = None,
access: str = "direct",
outfile: str | None = None,
storage_options: dict | None = None,
) -> str | dict:
try:
import dask
from kerchunk.combine import MultiZarrToZarr
except ImportError as e:
raise ImportError(
"`earthaccess.consolidate_metadata` requires `dask` and `kerchunk` to be be installed"
) from e

if access == "direct":
fs = earthaccess.get_s3fs_session(provider=granuales[0]["meta"]["provider-id"])
else:
fs = earthaccess.get_fsspec_https_session()

# Get metadata for each granuale
get_chunk_metadata = dask.delayed(_get_chunk_metadata)
chunks = dask.compute(*[get_chunk_metadata(g, fs) for g in granuales])
chunks = sum(chunks, start=[])

# Get combined metadata object
mzz = MultiZarrToZarr(chunks, **(kerchunk_options or {}))
if outfile is not None:
output = fsspec.utils.stringify_path(outfile)
mzz.translate(outfile, storage_options=storage_options or {})
return output
else:
return mzz.translate()
5 changes: 2 additions & 3 deletions earthaccess/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,16 @@
from functools import lru_cache
from itertools import chain
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union
from uuid import uuid4

import earthaccess
import fsspec
import requests
import s3fs
from multimethod import multimethod as singledispatchmethod
from pqdm.threads import pqdm

import earthaccess

from .daac import DAAC_TEST_URLS, find_provider
from .results import DataGranule
from .search import DataCollections
Expand Down
173 changes: 172 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,11 @@ s3fs = ">=2021.11, <2024"
fsspec = ">=2022.1"
tinynetrc = "^1.3.1"
multimethod = ">=1.8"
kerchunk = { version = ">=0.1.2", optional = true }
dask = { version = ">=2022.1.0", optional = true }

[tool.poetry.extras]
kerchunk = ["kerchunk", "dask"]

[tool.poetry.dev-dependencies]
python-magic = ">=0.4"
Expand Down
Loading