Merge branch 'main' of https://github.com/nsidc/earthaccess into cache-s3-creds
nsidc · Nov 29, 2023 · ad85cd4 · ad85cd4
2 parents ce741a8 + 69f9e46
commit ad85cd4
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 118 deletions.
diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml
@@ -2,20 +2,28 @@ name: "Issue Manager"
 
 on:
   schedule:
-  - cron: "0 0 * * *"
+    - cron: "0 0 * * *"
+  issue_comment:
+    types:
+      - "created"
+  issues:
+    types:
+      - "labeled"
+  pull_request_target:
+    types:
+      - "labeled"
 
 jobs:
   issue-manager:
-    runs-on: ubuntu-latest
+    runs-on: "ubuntu-latest"
     steps:
-    - uses: tiangolo/issue-manager@master
+    - uses: "tiangolo/issue-manager@0.4.0"
       with:
-        token: ${{ secrets.GITHUB_TOKEN }}
+        token: "${{ secrets.GITHUB_TOKEN }}"
         config: >
             {
-              "answered": {
-                "users": ["betolink"],
+              "feedback requested": {
                 "delay": 864000,
-                "message": "Assuming the original issue was solved, it will be automatically closed now. But feel free to add more comments or create new issues."
+                "message": "Closing after 10 days of waiting for feedback. If you feel this was in error, please re-open, `@` a maintainer, or create new issues."
               }
             }
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,8 @@
 # Changelog
 
+## [unreleased]
+* Fix zero granules being reported for restricted datasets
+
 ## [v0.7.1] 2023-11-08
 * Bug Fixes:
     * Treat granules without `RelatedUrls` as not cloud-hosted.

diff --git a/earthaccess/search.py b/earthaccess/search.py
@@ -58,7 +58,7 @@ def hits(self) -> int:
 
 
         Returns:
-            number of results reproted by CMR
+            number of results reported by CMR
         """
         return super().hits()
 
@@ -318,6 +318,25 @@ def __init__(self, auth: Any = None, *args: Any, **kwargs: Any) -> None:
 
         self._debug = False
 
+    def hits(self) -> int:
+        """
+        Returns the number of hits the current query will return. This is done by
+        making a lightweight query to CMR and inspecting the returned headers.
+
+        :returns: number of results reported by CMR
+        """
+
+        url = self._build_url()
+
+        response = self.session.get(url, headers=self.headers, params={"page_size": 0})
+
+        try:
+            response.raise_for_status()
+        except exceptions.HTTPError as ex:
+            raise RuntimeError(ex.response.text)
+
+        return int(response.headers["CMR-Hits"])
+
     def parameters(self, **kwargs: Any) -> Type[CollectionQuery]:
         """Provide query parameters as keyword arguments. The keyword needs to match the name
         of the method, and the value should either be the value or a tuple of values.

diff --git a/earthaccess/store.py b/earthaccess/store.py
@@ -278,7 +278,7 @@ def open(
         self,
         granules: Union[List[str], List[DataGranule]],
         provider: Optional[str] = None,
-    ) -> Union[List[Any], None]:
+    ) -> List[Any]:
         """Returns a list of fsspec file-like objects that can be used to access files
         hosted on S3 or HTTPS by third party libraries like xarray.
 
@@ -289,15 +289,14 @@ def open(
         """
         if len(granules):
             return self._open(granules, provider)
-        print("The granules list is empty, moving on...")
-        return None
+        return []
 
     @singledispatchmethod
     def _open(
         self,
         granules: Union[List[str], List[DataGranule]],
         provider: Optional[str] = None,
-    ) -> Union[List[Any], None]:
+    ) -> List[Any]:
         """Returns a list of fsspec file-like objects that can be used to access files
         hosted on S3 or HTTPS by third party libraries like xarray.
 
@@ -314,17 +313,16 @@ def _open_granules(
         granules: List[DataGranule],
         provider: Optional[str] = None,
         threads: Optional[int] = 8,
-    ) -> Union[List[Any], None]:
+    ) -> List[Any]:
         fileset: List = []
         data_links: List = []
         total_size = round(sum([granule.size() for granule in granules]) / 1024, 2)
-        print(f" Opening {len(granules)} granules, approx size: {total_size} GB")
+        print(f"Opening {len(granules)} granules, approx size: {total_size} GB")
 
         if self.auth is None:
-            print(
+            raise ValueError(
                 "A valid Earthdata login instance is required to retrieve credentials"
             )
-            return None
 
         if self.running_in_aws:
             if granules[0].cloud_hosted:
@@ -356,13 +354,12 @@ def _open_granules(
                         fs=s3_fs,
                         threads=threads,
                     )
-                except Exception:
-                    print(
-                        "An exception occurred while trying to access remote files on S3: "
-                        "This may be caused by trying to access the data outside the us-west-2 region"
+                except Exception as e:
+                    raise RuntimeError(
+                        "An exception occurred while trying to access remote files on S3. "
+                        "This may be caused by trying to access the data outside the us-west-2 region."
                         f"Exception: {traceback.format_exc()}"
-                    )
-                    return None
+                    ) from e
             else:
                 fileset = self._open_urls_https(data_links, granules, threads=threads)
             return fileset
@@ -382,7 +379,7 @@ def _open_urls(
         granules: List[str],
         provider: Optional[str] = None,
         threads: Optional[int] = 8,
-    ) -> Union[List[Any], None]:
+    ) -> List[Any]:
         fileset: List = []
         data_links: List = []
 
@@ -393,15 +390,13 @@ def _open_urls(
             provider = provider
             data_links = granules
         else:
-            print(
+            raise ValueError(
                 f"Schema for {granules[0]} is not recognized, must be an HTTP or S3 URL"
             )
-            return None
         if self.auth is None:
-            print(
+            raise ValueError(
                 "A valid Earthdata login instance is required to retrieve S3 credentials"
             )
-            return None
 
         if self.running_in_aws and granules[0].startswith("s3"):
             if provider is not None:
@@ -414,27 +409,24 @@ def _open_urls(
                             fs=s3_fs,
                             threads=threads,
                         )
-                    except Exception:
-                        print(
-                            "An exception occurred while trying to access remote files on S3: "
-                            "This may be caused by trying to access the data outside the us-west-2 region"
+                    except Exception as e:
+                        raise RuntimeError(
+                            "An exception occurred while trying to access remote files on S3. "
+                            "This may be caused by trying to access the data outside the us-west-2 region."
                             f"Exception: {traceback.format_exc()}"
-                        )
-                        return None
+                        ) from e
                 else:
                     print(f"Provider {provider} has no valid cloud credentials")
                 return fileset
             else:
-                print(
+                raise ValueError(
                     "earthaccess cannot derive the DAAC provider from URLs only, a provider is needed e.g. POCLOUD"
                 )
-                return None
         else:
             if granules[0].startswith("s3"):
-                print(
+                raise ValueError(
                     "We cannot open S3 links when we are not in-region, try using HTTPS links"
                 )
-                return None
             fileset = self._open_urls_https(data_links, granules, threads)
             return fileset
 
@@ -444,7 +436,7 @@ def get(
         local_path: Optional[str] = None,
         provider: Optional[str] = None,
         threads: int = 8,
-    ) -> Union[None, List[str]]:
+    ) -> List[str]:
         """Retrieves data granules from a remote storage system.
 
            * If we run this in the cloud we are moving data from S3 to a cloud compute instance (EC2, AWS Lambda)
@@ -472,8 +464,7 @@ def get(
             files = self._get(granules, local_path, provider, threads)
             return files
         else:
-            print("List of URLs or DataGranule isntances expected")
-            return None
+            raise ValueError("List of URLs or DataGranule isntances expected")
 
     @singledispatchmethod
     def _get(
@@ -482,7 +473,7 @@ def _get(
         local_path: str,
         provider: Optional[str] = None,
         threads: int = 8,
-    ) -> Union[None, List[str]]:
+    ) -> List[str]:
         """Retrieves data granules from a remote storage system.
 
            * If we run this in the cloud we are moving data from S3 to a cloud compute instance (EC2, AWS Lambda)
@@ -500,8 +491,7 @@ def _get(
         Returns:
             None
         """
-        print("List of URLs or DataGranule isntances expected")
-        return None
+        raise NotImplementedError(f"Cannot _get {granules}")
 
     @_get.register
     def _get_urls(
@@ -510,15 +500,14 @@ def _get_urls(
         local_path: str,
         provider: Optional[str] = None,
         threads: int = 8,
-    ) -> Union[None, List[str]]:
+    ) -> List[str]:
         data_links = granules
         downloaded_files: List = []
         if provider is None and self.running_in_aws and "cumulus" in data_links[0]:
-            print(
+            raise ValueError(
                 "earthaccess can't yet guess the provider for cloud collections, "
                 "we need to use one from earthaccess.list_cloud_providers()"
             )
-            return None
         if self.running_in_aws and data_links[0].startswith("s3"):
             print(f"Accessing cloud dataset using provider: {provider}")
             s3_fs = self.get_s3fs_session(provider=provider)
@@ -541,7 +530,7 @@ def _get_granules(
         local_path: str,
         provider: Optional[str] = None,
         threads: int = 8,
-    ) -> Union[None, List[str]]:
+    ) -> List[str]:
         data_links: List = []
         downloaded_files: List = []
         provider = granules[0]["meta"]["provider-id"]
@@ -624,13 +613,11 @@ def _download_onprem_granules(
         :returns: None
         """
         if urls is None:
-            print("The granules didn't provide a valid GET DATA link")
-            return None
+            raise ValueError("The granules didn't provide a valid GET DATA link")
         if self.auth is None:
-            print(
+            raise ValueError(
                 "We need to be logged into NASA EDL in order to download data granules"
             )
-            return []
         if not os.path.exists(directory):
             os.makedirs(directory)