diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml deleted file mode 100644 index 975ae16b..00000000 --- a/.github/workflows/documentation.yml +++ /dev/null @@ -1,67 +0,0 @@ -name: Documentation - -on: - push: - branches: - - main - - documentation - paths: - - earthaccess/** - - tests/** - - docs/** - - notebooks/** - - .github/workflows/documentation.yml - pull_request: - paths: - - earthaccess/** - - tests/** - - docs/** - - notebooks/** - - .github/workflows/documentation.yml - types: [opened, synchronize] - -jobs: - documentation: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Set up Python - id: setup-python - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - name: Install and configure Poetry - uses: snok/install-poetry@v1 - with: - version: 1.5.1 - virtualenvs-create: true - virtualenvs-in-project: false - virtualenvs-path: .venv - installer-parallel: true - - name: Set up cache - uses: actions/cache@v1 - id: cache - with: - path: .venv - key: venv-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} - - name: Ensure cache is healthy - if: steps.cache.outputs.cache-hit == 'true' - run: poetry run pip --version >/dev/null 2>&1 || rm -rf .venv - - name: Install Dependencies - if: steps.cache.outputs.cache-hit != 'true' - run: poetry install --no-interaction - - name: Build docs - run: poetry run bash scripts/build-docs.sh - env: - EARTHDATA_USERNAME: ${{ secrets.EDL_USERNAME }} - EARTHDATA_PASSWORD: ${{ secrets.EDL_PASSWORD }} - - - name: Deploy - if: | - github.event_name == 'push' - && (github.ref == 'refs/heads/main' || github.ref == 'ref/heads/documentation') - && github.repository == 'nsidc/earthaccess' - uses: peaceiris/actions-gh-pages@v3 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - publish_dir: ./site diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml index 46c544b2..8a3e027a 100644 --- 
a/.github/workflows/issue-manager.yml +++ b/.github/workflows/issue-manager.yml @@ -2,20 +2,28 @@ name: "Issue Manager" on: schedule: - - cron: "0 0 * * *" + - cron: "0 0 * * *" + issue_comment: + types: + - "created" + issues: + types: + - "labeled" + pull_request_target: + types: + - "labeled" jobs: issue-manager: - runs-on: ubuntu-latest + runs-on: "ubuntu-latest" steps: - - uses: tiangolo/issue-manager@master + - uses: "tiangolo/issue-manager@0.4.0" with: - token: ${{ secrets.GITHUB_TOKEN }} + token: "${{ secrets.GITHUB_TOKEN }}" config: > { - "answered": { - "users": ["betolink"], + "feedback requested": { "delay": 864000, - "message": "Assuming the original issue was solved, it will be automatically closed now. But feel free to add more comments or create new issues." + "message": "Closing after 10 days of waiting for feedback. If you feel this was in error, please re-open, `@` a maintainer, or create new issues." } } diff --git a/CHANGELOG.md b/CHANGELOG.md index d7fd3a1e..6aefdb21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## [unreleased] +* Fix zero granules being reported for restricted datasets + ## [v0.7.1] 2023-11-08 * Bug Fixes: * Treat granules without `RelatedUrls` as not cloud-hosted. diff --git a/earthaccess/search.py b/earthaccess/search.py index 3fd70684..0ed3b61d 100644 --- a/earthaccess/search.py +++ b/earthaccess/search.py @@ -58,7 +58,7 @@ def hits(self) -> int: Returns: - number of results reproted by CMR + number of results reported by CMR """ return super().hits() @@ -318,6 +318,25 @@ def __init__(self, auth: Any = None, *args: Any, **kwargs: Any) -> None: self._debug = False + def hits(self) -> int: + """ + Returns the number of hits the current query will return. This is done by + making a lightweight query to CMR and inspecting the returned headers. 
+ + :returns: number of results reported by CMR + """ + + url = self._build_url() + + response = self.session.get(url, headers=self.headers, params={"page_size": 0}) + + try: + response.raise_for_status() + except exceptions.HTTPError as ex: + raise RuntimeError(ex.response.text) + + return int(response.headers["CMR-Hits"]) + def parameters(self, **kwargs: Any) -> Type[CollectionQuery]: """Provide query parameters as keyword arguments. The keyword needs to match the name of the method, and the value should either be the value or a tuple of values. diff --git a/earthaccess/store.py b/earthaccess/store.py index 69bcda6a..f9a02908 100644 --- a/earthaccess/store.py +++ b/earthaccess/store.py @@ -278,7 +278,7 @@ def open( self, granules: Union[List[str], List[DataGranule]], provider: Optional[str] = None, - ) -> Union[List[Any], None]: + ) -> List[Any]: """Returns a list of fsspec file-like objects that can be used to access files hosted on S3 or HTTPS by third party libraries like xarray. @@ -289,15 +289,14 @@ def open( """ if len(granules): return self._open(granules, provider) - print("The granules list is empty, moving on...") - return None + return [] @singledispatchmethod def _open( self, granules: Union[List[str], List[DataGranule]], provider: Optional[str] = None, - ) -> Union[List[Any], None]: + ) -> List[Any]: """Returns a list of fsspec file-like objects that can be used to access files hosted on S3 or HTTPS by third party libraries like xarray. 
@@ -314,17 +313,16 @@ def _open_granules( granules: List[DataGranule], provider: Optional[str] = None, threads: Optional[int] = 8, - ) -> Union[List[Any], None]: + ) -> List[Any]: fileset: List = [] data_links: List = [] total_size = round(sum([granule.size() for granule in granules]) / 1024, 2) - print(f" Opening {len(granules)} granules, approx size: {total_size} GB") + print(f"Opening {len(granules)} granules, approx size: {total_size} GB") if self.auth is None: - print( + raise ValueError( "A valid Earthdata login instance is required to retrieve credentials" ) - return None if self.running_in_aws: if granules[0].cloud_hosted: @@ -356,13 +354,12 @@ def _open_granules( fs=s3_fs, threads=threads, ) - except Exception: - print( - "An exception occurred while trying to access remote files on S3: " - "This may be caused by trying to access the data outside the us-west-2 region" + except Exception as e: + raise RuntimeError( + "An exception occurred while trying to access remote files on S3. " + "This may be caused by trying to access the data outside the us-west-2 region. " 
f"Exception: {traceback.format_exc()}" - ) - return None + ) from e else: fileset = self._open_urls_https(data_links, granules, threads=threads) return fileset @@ -382,7 +379,7 @@ def _open_urls( granules: List[str], provider: Optional[str] = None, threads: Optional[int] = 8, - ) -> Union[List[Any], None]: + ) -> List[Any]: fileset: List = [] data_links: List = [] @@ -393,15 +390,13 @@ def _open_urls( provider = provider data_links = granules else: - print( + raise ValueError( f"Schema for {granules[0]} is not recognized, must be an HTTP or S3 URL" ) - return None if self.auth is None: - print( + raise ValueError( "A valid Earthdata login instance is required to retrieve S3 credentials" ) - return None if self.running_in_aws and granules[0].startswith("s3"): if provider is not None: @@ -414,27 +409,24 @@ def _open_urls( fs=s3_fs, threads=threads, ) - except Exception: - print( - "An exception occurred while trying to access remote files on S3: " - "This may be caused by trying to access the data outside the us-west-2 region" + except Exception as e: + raise RuntimeError( + "An exception occurred while trying to access remote files on S3. " + "This may be caused by trying to access the data outside the us-west-2 region. " f"Exception: {traceback.format_exc()}" - ) - return None + ) from e else: print(f"Provider {provider} has no valid cloud credentials") return fileset else: - print( + raise ValueError( "earthaccess cannot derive the DAAC provider from URLs only, a provider is needed e.g. POCLOUD" ) - return None else: if granules[0].startswith("s3"): - print( + raise ValueError( "We cannot open S3 links when we are not in-region, try using HTTPS links" ) - return None fileset = self._open_urls_https(data_links, granules, threads) return fileset @@ -444,7 +436,7 @@ def get( local_path: Optional[str] = None, provider: Optional[str] = None, threads: int = 8, - ) -> Union[None, List[str]]: + ) -> List[str]: """Retrieves data granules from a remote storage system. 
* If we run this in the cloud we are moving data from S3 to a cloud compute instance (EC2, AWS Lambda) @@ -472,8 +464,7 @@ def get( files = self._get(granules, local_path, provider, threads) return files else: - print("List of URLs or DataGranule isntances expected") - return None + raise ValueError("List of URLs or DataGranule instances expected") @singledispatchmethod def _get( @@ -482,7 +473,7 @@ def _get( local_path: str, provider: Optional[str] = None, threads: int = 8, - ) -> Union[None, List[str]]: + ) -> List[str]: """Retrieves data granules from a remote storage system. * If we run this in the cloud we are moving data from S3 to a cloud compute instance (EC2, AWS Lambda) @@ -500,8 +491,7 @@ def _get( Returns: None """ - print("List of URLs or DataGranule isntances expected") - return None + raise NotImplementedError(f"Cannot _get {granules}") @_get.register def _get_urls( @@ -510,15 +500,14 @@ def _get_urls( local_path: str, provider: Optional[str] = None, threads: int = 8, - ) -> Union[None, List[str]]: + ) -> List[str]: data_links = granules downloaded_files: List = [] if provider is None and self.running_in_aws and "cumulus" in data_links[0]: - print( + raise ValueError( "earthaccess can't yet guess the provider for cloud collections, " "we need to use one from earthaccess.list_cloud_providers()" ) - return None if self.running_in_aws and data_links[0].startswith("s3"): print(f"Accessing cloud dataset using provider: {provider}") s3_fs = self.get_s3fs_session(provider=provider) @@ -541,7 +530,7 @@ def _get_granules( local_path: str, provider: Optional[str] = None, threads: int = 8, - ) -> Union[None, List[str]]: + ) -> List[str]: data_links: List = [] downloaded_files: List = [] provider = granules[0]["meta"]["provider-id"] @@ -624,13 +613,11 @@ def _download_onprem_granules( :returns: None """ if urls is None: - print("The granules didn't provide a valid GET DATA link") - return None + raise ValueError("The granules didn't provide a valid GET DATA 
link") if self.auth is None: - print( + raise ValueError( "We need to be logged into NASA EDL in order to download data granules" ) - return [] if not os.path.exists(directory): os.makedirs(directory)