Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/nsidc/earthaccess into cach…
Browse files Browse the repository at this point in the history
…e-s3-creds
  • Loading branch information
jrbourbeau committed Nov 29, 2023
2 parents ce741a8 + 69f9e46 commit ad85cd4
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 118 deletions.
67 changes: 0 additions & 67 deletions .github/workflows/documentation.yml

This file was deleted.

22 changes: 15 additions & 7 deletions .github/workflows/issue-manager.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,28 @@ name: "Issue Manager"

on:
schedule:
- cron: "0 0 * * *"
- cron: "0 0 * * *"
issue_comment:
types:
- "created"
issues:
types:
- "labeled"
pull_request_target:
types:
- "labeled"

jobs:
issue-manager:
runs-on: ubuntu-latest
runs-on: "ubuntu-latest"
steps:
- uses: tiangolo/issue-manager@master
- uses: "tiangolo/issue-manager@0.4.0"
with:
token: ${{ secrets.GITHUB_TOKEN }}
token: "${{ secrets.GITHUB_TOKEN }}"
config: >
{
"answered": {
"users": ["betolink"],
"feedback requested": {
"delay": 864000,
"message": "Assuming the original issue was solved, it will be automatically closed now. But feel free to add more comments or create new issues."
"message": "Closing after 10 days of waiting for feedback. If you feel this was in error, please re-open, `@` a maintainer, or create new issues."
}
}
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog

## [unreleased]
* Fix zero granules being reported for restricted datasets

## [v0.7.1] 2023-11-08
* Bug Fixes:
* Treat granules without `RelatedUrls` as not cloud-hosted.
Expand Down
21 changes: 20 additions & 1 deletion earthaccess/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def hits(self) -> int:
Returns:
number of results reproted by CMR
number of results reported by CMR
"""
return super().hits()

Expand Down Expand Up @@ -318,6 +318,25 @@ def __init__(self, auth: Any = None, *args: Any, **kwargs: Any) -> None:

self._debug = False

def hits(self) -> int:
    """Return the number of hits the current query will return.

    This is done by making a lightweight query to CMR and inspecting the
    returned headers.

    Returns:
        number of results reported by CMR

    Raises:
        RuntimeError: if the response carries an HTTP error status. The
            message is the response body, and the original HTTP error is
            chained as the cause.
    """
    url = self._build_url()

    # Only the response headers are read below, so the request is sent with
    # page_size=0.
    response = self.session.get(url, headers=self.headers, params={"page_size": 0})

    try:
        response.raise_for_status()
    except exceptions.HTTPError as ex:
        # Chain explicitly so the underlying HTTP error stays attached as
        # __cause__ instead of only appearing as implicit context.
        raise RuntimeError(ex.response.text) from ex

    return int(response.headers["CMR-Hits"])

def parameters(self, **kwargs: Any) -> Type[CollectionQuery]:
"""Provide query parameters as keyword arguments. The keyword needs to match the name
of the method, and the value should either be the value or a tuple of values.
Expand Down
73 changes: 30 additions & 43 deletions earthaccess/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def open(
self,
granules: Union[List[str], List[DataGranule]],
provider: Optional[str] = None,
) -> Union[List[Any], None]:
) -> List[Any]:
"""Returns a list of fsspec file-like objects that can be used to access files
hosted on S3 or HTTPS by third party libraries like xarray.
Expand All @@ -289,15 +289,14 @@ def open(
"""
if len(granules):
return self._open(granules, provider)
print("The granules list is empty, moving on...")
return None
return []

@singledispatchmethod
def _open(
self,
granules: Union[List[str], List[DataGranule]],
provider: Optional[str] = None,
) -> Union[List[Any], None]:
) -> List[Any]:
"""Returns a list of fsspec file-like objects that can be used to access files
hosted on S3 or HTTPS by third party libraries like xarray.
Expand All @@ -314,17 +313,16 @@ def _open_granules(
granules: List[DataGranule],
provider: Optional[str] = None,
threads: Optional[int] = 8,
) -> Union[List[Any], None]:
) -> List[Any]:
fileset: List = []
data_links: List = []
total_size = round(sum([granule.size() for granule in granules]) / 1024, 2)
print(f" Opening {len(granules)} granules, approx size: {total_size} GB")
print(f"Opening {len(granules)} granules, approx size: {total_size} GB")

if self.auth is None:
print(
raise ValueError(
"A valid Earthdata login instance is required to retrieve credentials"
)
return None

if self.running_in_aws:
if granules[0].cloud_hosted:
Expand Down Expand Up @@ -356,13 +354,12 @@ def _open_granules(
fs=s3_fs,
threads=threads,
)
except Exception:
print(
"An exception occurred while trying to access remote files on S3: "
"This may be caused by trying to access the data outside the us-west-2 region"
except Exception as e:
raise RuntimeError(
"An exception occurred while trying to access remote files on S3. "
"This may be caused by trying to access the data outside the us-west-2 region."
f"Exception: {traceback.format_exc()}"
)
return None
) from e
else:
fileset = self._open_urls_https(data_links, granules, threads=threads)
return fileset
Expand All @@ -382,7 +379,7 @@ def _open_urls(
granules: List[str],
provider: Optional[str] = None,
threads: Optional[int] = 8,
) -> Union[List[Any], None]:
) -> List[Any]:
fileset: List = []
data_links: List = []

Expand All @@ -393,15 +390,13 @@ def _open_urls(
provider = provider
data_links = granules
else:
print(
raise ValueError(
f"Schema for {granules[0]} is not recognized, must be an HTTP or S3 URL"
)
return None
if self.auth is None:
print(
raise ValueError(
"A valid Earthdata login instance is required to retrieve S3 credentials"
)
return None

if self.running_in_aws and granules[0].startswith("s3"):
if provider is not None:
Expand All @@ -414,27 +409,24 @@ def _open_urls(
fs=s3_fs,
threads=threads,
)
except Exception:
print(
"An exception occurred while trying to access remote files on S3: "
"This may be caused by trying to access the data outside the us-west-2 region"
except Exception as e:
raise RuntimeError(
"An exception occurred while trying to access remote files on S3. "
"This may be caused by trying to access the data outside the us-west-2 region."
f"Exception: {traceback.format_exc()}"
)
return None
) from e
else:
print(f"Provider {provider} has no valid cloud credentials")
return fileset
else:
print(
raise ValueError(
"earthaccess cannot derive the DAAC provider from URLs only, a provider is needed e.g. POCLOUD"
)
return None
else:
if granules[0].startswith("s3"):
print(
raise ValueError(
"We cannot open S3 links when we are not in-region, try using HTTPS links"
)
return None
fileset = self._open_urls_https(data_links, granules, threads)
return fileset

Expand All @@ -444,7 +436,7 @@ def get(
local_path: Optional[str] = None,
provider: Optional[str] = None,
threads: int = 8,
) -> Union[None, List[str]]:
) -> List[str]:
"""Retrieves data granules from a remote storage system.
* If we run this in the cloud we are moving data from S3 to a cloud compute instance (EC2, AWS Lambda)
Expand Down Expand Up @@ -472,8 +464,7 @@ def get(
files = self._get(granules, local_path, provider, threads)
return files
else:
print("List of URLs or DataGranule isntances expected")
return None
raise ValueError("List of URLs or DataGranule isntances expected")

@singledispatchmethod
def _get(
Expand All @@ -482,7 +473,7 @@ def _get(
local_path: str,
provider: Optional[str] = None,
threads: int = 8,
) -> Union[None, List[str]]:
) -> List[str]:
"""Retrieves data granules from a remote storage system.
* If we run this in the cloud we are moving data from S3 to a cloud compute instance (EC2, AWS Lambda)
Expand All @@ -500,8 +491,7 @@ def _get(
Returns:
None
"""
print("List of URLs or DataGranule isntances expected")
return None
raise NotImplementedError(f"Cannot _get {granules}")

@_get.register
def _get_urls(
Expand All @@ -510,15 +500,14 @@ def _get_urls(
local_path: str,
provider: Optional[str] = None,
threads: int = 8,
) -> Union[None, List[str]]:
) -> List[str]:
data_links = granules
downloaded_files: List = []
if provider is None and self.running_in_aws and "cumulus" in data_links[0]:
print(
raise ValueError(
"earthaccess can't yet guess the provider for cloud collections, "
"we need to use one from earthaccess.list_cloud_providers()"
)
return None
if self.running_in_aws and data_links[0].startswith("s3"):
print(f"Accessing cloud dataset using provider: {provider}")
s3_fs = self.get_s3fs_session(provider=provider)
Expand All @@ -541,7 +530,7 @@ def _get_granules(
local_path: str,
provider: Optional[str] = None,
threads: int = 8,
) -> Union[None, List[str]]:
) -> List[str]:
data_links: List = []
downloaded_files: List = []
provider = granules[0]["meta"]["provider-id"]
Expand Down Expand Up @@ -624,13 +613,11 @@ def _download_onprem_granules(
:returns: None
"""
if urls is None:
print("The granules didn't provide a valid GET DATA link")
return None
raise ValueError("The granules didn't provide a valid GET DATA link")
if self.auth is None:
print(
raise ValueError(
"We need to be logged into NASA EDL in order to download data granules"
)
return []
if not os.path.exists(directory):
os.makedirs(directory)

Expand Down

0 comments on commit ad85cd4

Please sign in to comment.