From dc87dd7b29de67673e726f197230ea85633c166d Mon Sep 17 00:00:00 2001 From: Sherwin-14 Date: Tue, 21 May 2024 16:34:44 +0000 Subject: [PATCH] ignored the D1* errors --- earthaccess/__init__.py | 3 +- earthaccess/api.py | 94 ++++-- earthaccess/auth.py | 24 +- earthaccess/formatters.py | 2 +- earthaccess/results.py | 88 +++--- earthaccess/search.py | 368 +++++++++++++++------- earthaccess/store.py | 71 +++-- pyproject.toml | 2 +- tests/integration/test_cloud_download.py | 9 +- tests/integration/test_cloud_open.py | 9 +- tests/integration/test_onprem_download.py | 9 +- tests/integration/test_onprem_open.py | 9 +- tests/unit/test_results.py | 33 +- 13 files changed, 461 insertions(+), 260 deletions(-) diff --git a/earthaccess/__init__.py b/earthaccess/__init__.py index 6db81f92..8d399945 100644 --- a/earthaccess/__init__.py +++ b/earthaccess/__init__.py @@ -61,8 +61,7 @@ def __getattr__(name): # type: ignore - """ - Module-level getattr to handle automatic authentication when accessing + """Module-level getattr to handle automatic authentication when accessing `earthaccess.__auth__` and `earthaccess.__store__`. Other unhandled attributes raise as `AttributeError` as expected. diff --git a/earthaccess/api.py b/earthaccess/api.py index 5e82fd60..991c4749 100644 --- a/earthaccess/api.py +++ b/earthaccess/api.py @@ -1,3 +1,4 @@ + import logging import requests @@ -37,7 +38,8 @@ def search_datasets(count: int = -1, **kwargs: Any) -> List[DataCollection]: [https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html](https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html) - Parameters: + Parameters + ---------- count: Number of records to get, -1 = all kwargs (Dict): arguments to CMR: @@ -52,20 +54,24 @@ def search_datasets(count: int = -1, **kwargs: Any) -> List[DataCollection]: * **bounding_box**: a tuple representing spatial bounds in the form `(lower_left_lon, lower_left_lat, upper_right_lon, upper_right_lat)` - Returns: + Returns + ------- A list of DataCollection results that can be used to get information about a dataset, e.g. concept_id, doi, etc. - Raises: + Raises + ------ RuntimeError: The CMR query failed. - Examples: + Examples + -------- ```python datasets = earthaccess.search_datasets( keyword="sea surface anomaly", cloud_hosted=True ) ``` + """ if not validate.valid_dataset_parameters(**kwargs): logger.warn("A valid set of parameters is needed to search for datasets on CMR") @@ -86,7 +92,8 @@ def search_data(count: int = -1, **kwargs: Any) -> List[DataGranule]: [https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html](https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html) - Parameters: + Parameters + ---------- count: Number of records to get, -1 = all kwargs (Dict): arguments to CMR: @@ -101,14 +108,17 @@ def search_data(count: int = -1, **kwargs: Any) -> List[DataGranule]: * **bounding_box**: a tuple representing spatial bounds in the form `(lower_left_lon, lower_left_lat, upper_right_lon, upper_right_lat)` - Returns: + Returns + ------- a list of DataGranules that can be used to access the granule files by using `download()` or `open()`. - Raises: + Raises + ------ RuntimeError: The CMR query failed. - Examples: + Examples + -------- ```python datasets = earthaccess.search_data( doi="10.5067/SLREF-CDRV2", @@ -116,6 +126,7 @@ def search_data(count: int = -1, **kwargs: Any) -> List[DataGranule]: temporal=("2002-01-01", "2002-12-31") ) ``` + """ if earthaccess.__auth__.authenticated: query = DataGranules(earthaccess.__auth__).parameters(**kwargs) @@ -131,7 +142,8 @@ def search_data(count: int = -1, **kwargs: Any) -> List[DataGranule]: def login(strategy: str = "all", persist: bool = False, system: System = PROD) -> Auth: """Authenticate with Earthdata login (https://urs.earthdata.nasa.gov/). - Parameters: + Parameters + ---------- strategy: An authentication method. @@ -142,8 +154,10 @@ def login(strategy: str = "all", persist: bool = False, system: System = PROD) - persist: will persist credentials in a .netrc file system: the Earthdata system to access, defaults to PROD - Returns: + Returns + ------- An instance of Auth. + """ # Set the underlying Auth object's earthdata system, # before triggering the getattr function for `__auth__`. @@ -181,17 +195,21 @@ def download( * If we run it outside AWS (us-west-2 region) and the dataset is cloud hosted, we'll use HTTP links. - Parameters: + Parameters + ---------- granules: a granule, list of granules, a granule link (HTTP), or a list of granule links (HTTP) local_path: local directory to store the remote data granules provider: if we download a list of URLs, we need to specify the provider. threads: parallel number of threads to use to download the files, adjust as necessary, default = 8 - Returns: + Returns + ------- List of downloaded files - Raises: + Raises + ------ Exception: A file download failed. + """ provider = _normalize_location(provider) if isinstance(granules, DataGranule): @@ -205,6 +223,7 @@ def download( f"{err}: You must call earthaccess.login() before you can download data" ) return [] + return results @@ -215,13 +234,16 @@ def open( """Returns a list of fsspec file-like objects that can be used to access files hosted on S3 or HTTPS by third party libraries like xarray. - Parameters: + Parameters + ---------- granules: a list of granule instances **or** list of URLs, e.g. `s3://some-granule`. If a list of URLs is passed, we need to specify the data provider. provider: e.g. POCLOUD, NSIDC_CPRD, etc. - Returns: + Returns + ------- a list of s3fs "file pointers" to s3 files. + """ provider = _normalize_location(provider) results = earthaccess.__store__.open(granules=granules, provider=provider) @@ -238,13 +260,16 @@ def get_s3_credentials( If we use results, earthaccess will use the metadata on the response to get the credentials, which is useful for missions that do not use the same endpoint as their DAACs, e.g. SWOT. - Parameters: + Parameters + ---------- daac: a DAAC short_name like NSIDC or PODAAC, etc. provider: if we know the provider for the DAAC, e.g. POCLOUD, LPCLOUD etc. results: List of results from search_data() - Returns: + Returns + ------- a dictionary with S3 credentials for the DAAC or provider + """ daac = _normalize_location(daac) provider = _normalize_location(provider) @@ -257,8 +282,10 @@ def get_s3_credentials( def collection_query() -> CollectionQuery: """Returns a query builder instance for NASA collections (datasets). - Returns: + Returns + ------- a query builder instance for data collections. + """ if earthaccess.__auth__.authenticated: query_builder = DataCollections(earthaccess.__auth__) @@ -268,10 +295,12 @@ def collection_query() -> CollectionQuery: def granule_query() -> GranuleQuery: - """Returns a query builder instance for data granules + """Returns a query builder instance for data granules. - Returns: + Returns + ------- a query builder instance for data granules. + """ if earthaccess.__auth__.authenticated: query_builder = DataGranules(earthaccess.__auth__) @@ -283,10 +312,12 @@ def granule_query() -> GranuleQuery: def get_fsspec_https_session() -> AbstractFileSystem: """Returns a fsspec session that can be used to access datafiles across many different DAACs. - Returns: + Returns + ------- An fsspec instance able to access data across DAACs. - Examples: + Examples + -------- ```python import earthaccess @@ -295,6 +326,7 @@ def get_fsspec_https_session() -> AbstractFileSystem: with fs.open(DAAC_GRANULE) as f: f.read(10) ``` + """ session = earthaccess.__store__.get_fsspec_session() return session @@ -305,10 +337,12 @@ def get_requests_https_session() -> requests.Session: This is useful for making requests to restricted URLs, such as data granules or services that require authentication with NASA EDL. - Returns: + Returns + ------- An authenticated requests Session instance. - Examples: + Examples + -------- ```python import earthaccess @@ -318,6 +352,7 @@ def get_requests_https_session() -> requests.Session: data = req_session.get(granule_url, headers = {"Range": "bytes=0-100"}) ``` + """ session = earthaccess.__store__.get_requests_session() return session @@ -330,15 +365,18 @@ def get_s3fs_session( ) -> s3fs.S3FileSystem: """Returns a fsspec s3fs file session for direct access when we are in us-west-2. - Parameters: + Parameters + ---------- daac: Any DAAC short name e.g. NSIDC, GES_DISC provider: Each DAAC can have a cloud provider. If the DAAC is specified, there is no need to use provider. results: A list of results from search_data(). `earthaccess` will use the metadata from CMR to obtain the S3 Endpoint. - Returns: + Returns + ------- An authenticated s3fs session valid for 1 hour. + """ daac = _normalize_location(daac) provider = _normalize_location(provider) @@ -354,8 +392,10 @@ def get_s3fs_session( def get_edl_token() -> str: """Returns the current token used for EDL. - Returns: + Returns + ------- EDL token + """ token = earthaccess.__auth__.token return token diff --git a/earthaccess/auth.py b/earthaccess/auth.py index 9c103325..8007cd57 100644 --- a/earthaccess/auth.py +++ b/earthaccess/auth.py @@ -25,8 +25,7 @@ class SessionWithHeaderRedirection(requests.Session): - """ - Requests removes auth headers if the redirect happens outside the + """Requests removes auth headers if the redirect happens outside the original req domain. """ @@ -85,7 +84,8 @@ def login( ) -> Any: """Authenticate with Earthdata login. - Parameters: + Parameters + ---------- strategy: The authentication method. @@ -96,8 +96,10 @@ def login( persist: Will persist credentials in a `.netrc` file. system (Env): the EDL endpoint to log in to Earthdata, defaults to PROD - Returns: + Returns + ------- An instance of Auth. + """ if system is not None: self._set_earthdata_system(system) @@ -191,13 +193,16 @@ def get_s3_credentials( """Gets AWS S3 credentials for a given NASA cloud provider. The easier way is to use the DAAC short name; provider is optional if we know it. - Parameters: + Parameters + ---------- daac: The name of a NASA DAAC, e.g. NSIDC or PODAAC. provider: A valid cloud provider. Each DAAC has a provider code for their cloud distributions. endpoint: Getting the credentials directly from the S3Credentials URL. - Returns: + Returns + ------- A Python dictionary with the temporary AWS S3 credentials. + """ if self.authenticated: session = SessionWithHeaderRedirection(self.username, self.password) @@ -246,11 +251,14 @@ def get_s3_credentials( def get_session(self, bearer_token: bool = True) -> requests.Session: """Returns a new request session instance. - Parameters: + Parameters + ---------- bearer_token: whether to include bearer token - Returns: + Returns + ------- class Session instance with Auth and bearer token headers + """ session = SessionWithHeaderRedirection() if bearer_token and self.authenticated: diff --git a/earthaccess/formatters.py b/earthaccess/formatters.py index d022e9ee..0317ee79 100644 --- a/earthaccess/formatters.py +++ b/earthaccess/formatters.py @@ -7,7 +7,7 @@ def _load_static_files() -> List[str]: - """Load styles""" + """Load styles.""" return [ importlib_resources.files("earthaccess.css").joinpath(fname).read_text("utf8") for fname in STATIC_FILES diff --git a/earthaccess/results.py b/earthaccess/results.py index 6d91ca7e..bfaae2e2 100644 --- a/earthaccess/results.py +++ b/earthaccess/results.py @@ -76,8 +76,10 @@ class DataCollection(CustomDict): def summary(self) -> Dict[str, Any]: """Summary containing short_name, concept-id, file-type, and cloud-info (if cloud-hosted). - Returns: + Returns + ------- A summary of the collection metadata. + """ # we can print only the concept-id @@ -94,29 +96,30 @@ def summary(self) -> Dict[str, Any]: return summary_dict def get_umm(self, umm_field: str) -> Union[str, Dict[str, Any]]: - """ - Parameters: + """Parameters umm_field: Valid UMM item, i.e. `TemporalExtent` - Returns: + Returns + ------- The value of a given field inside the UMM (Unified Metadata Model). + """ if umm_field in self["umm"]: return self["umm"][umm_field] return "" def concept_id(self) -> str: - """ - Returns: - A collection's `concept_id`. - This id is the most relevant search field on granule queries. + """Returns + A collection's `concept_id`. + This id is the most relevant search field on granule queries. + """ return self["meta"]["concept-id"] def data_type(self) -> str: - """ - Returns: - The collection data type, i.e. HDF5, CSV etc., if available. + """Returns + The collection data type, i.e. HDF5, CSV etc., if available. + """ if "ArchiveAndDistributionInformation" in self["umm"]: return str( @@ -127,27 +130,27 @@ def data_type(self) -> str: return "" def version(self) -> str: - """ - Returns: - The collection's version. + """Returns + The collection's version. + """ if "Version" in self["umm"]: return self["umm"]["Version"] return "" def abstract(self) -> str: - """ - Returns: - The abstract of a collection + """Returns + The abstract of a collection + """ if "Abstract" in self["umm"]: return self["umm"]["Abstract"] return "" def landing_page(self) -> str: - """ - Returns: - The first landing page for the collection (can be many), if available. + """Returns + The first landing page for the collection (can be many), if available. + """ links = self._filter_related_links("LANDING PAGE") if len(links) > 0: @@ -155,18 +158,18 @@ def landing_page(self) -> str: return "" def get_data(self) -> List[str]: - """ - Returns: - The GET DATA links (usually a landing page link, a DAAC portal, or an FTP location). + """Returns + The GET DATA links (usually a landing page link, a DAAC portal, or an FTP location). + """ links = self._filter_related_links("GET DATA") return links def s3_bucket(self) -> Dict[str, Any]: - """ - Returns: - The S3 bucket information if the collection has it. - (**cloud hosted collections only**) + """Returns + The S3 bucket information if the collection has it. + (**cloud hosted collections only**) + """ if "DirectDistributionInformation" in self["umm"]: return self["umm"]["DirectDistributionInformation"] @@ -214,9 +217,9 @@ def __init__( self.render_dict = self._filter_fields_(fields) def __repr__(self) -> str: - """ - Returns: - A basic representation of a data granule. + """Returns + A basic representation of a data granule. + """ data_links = [link for link in self.data_links()] rep_str = f""" @@ -229,9 +232,9 @@ def __repr__(self) -> str: return rep_str def _repr_html_(self) -> str: - """ - Returns: - A rich representation for a data granule if we are in a Jupyter notebook. + """Returns + A rich representation for a data granule if we are in a Jupyter notebook. + """ granule_html_repr = _repr_granule_html(self) return granule_html_repr @@ -243,9 +246,9 @@ def get_s3_credentials_endpoint(self) -> Union[str, None]: return None def size(self) -> float: - """ - Returns: - The total size for the granule in MB. + """Returns + The total size for the granule in MB. + """ try: data_granule = self["umm"]["DataGranule"] @@ -286,14 +289,17 @@ def data_links( ) -> List[str]: """Returns the data links from a granule. - Parameters: + Parameters + ---------- access: direct or external. Direct means in-region access for cloud-hosted collections. in_region: True if we are running in us-west-2. It is meant for the store class. - Returns: + Returns + ------- The data links for the requested access type. + """ https_links = self._filter_related_links("GET DATA") s3_links = self._filter_related_links("GET DATA VIA DIRECT ACCESS") @@ -325,9 +331,9 @@ def data_links( return https_links def dataviz_links(self) -> List[str]: - """ - Returns: - The data visualization links, usually the browse images. + """Returns + The data visualization links, usually the browse images. + """ links = self._filter_related_links("GET RELATED VISUALIZATION") return links diff --git a/earthaccess/search.py b/earthaccess/search.py index 13f8c315..5b715188 100644 --- a/earthaccess/search.py +++ b/earthaccess/search.py @@ -33,24 +33,26 @@ def get_results( query: Union[CollectionQuery, GranuleQuery], limit: int = 2000, ) -> List[Any]: - """ - Get all results up to some limit, even if spanning multiple pages. + """Get all results up to some limit, even if spanning multiple pages. ???+ Tip The default page size is 2000, if the supplied value is greater then the Search-After header will be used to iterate across multiple requests until either the limit has been reached or there are no more results. - Parameters: + Parameters + ---------- limit: The number of results to return - Returns: + Returns + ------- query results as a list - Raises: + Raises + ------ RuntimeError: The CMR query failed. - """ + """ page_size = min(limit, 2000) url = query._build_url() @@ -79,11 +81,10 @@ def get_results( class DataCollections(CollectionQuery): - """ - ???+ Info - The DataCollection class queries against - https://cmr.earthdata.nasa.gov/search/collections.umm_json, - the response has to be in umm_json to use the result classes. + """???+ Info + The DataCollection class queries against + https://cmr.earthdata.nasa.gov/search/collections.umm_json, + the response has to be in umm_json to use the result classes. """ _fields: Optional[List[str]] = None @@ -92,9 +93,11 @@ class DataCollections(CollectionQuery): def __init__(self, auth: Optional[Auth] = None, *args: Any, **kwargs: Any) -> None: """Builds an instance of DataCollections to query the CMR. - Parameters: + Parameters + ---------- auth: An authenticated `Auth` instance. This is an optional parameter for queries that need authentication, e.g. restricted datasets. + """ super().__init__(*args, **kwargs) @@ -119,11 +122,14 @@ def hits(self) -> int: making a lightweight query to CMR and inspecting the returned headers. Restricted datasets will always return zero results even if there are results. - Returns: + Returns + ------- The number of results reported by the CMR. - Raises: + Raises + ------ RuntimeError: The CMR query failed. + """ url = self._build_url() @@ -147,16 +153,19 @@ def get(self, limit: int = 2000) -> List[DataCollection]: issue with granules than collections as they can be potentially millions of them. - Parameters: + Parameters + ---------- limit: The number of results to return - Returns: + Returns + ------- Query results as a (possibly empty) list of `DataCollection` instances. - Raises: + Raises + ------ RuntimeError: The CMR query failed. - """ + """ return [ DataCollection(collection, self._fields) for collection in get_results(self.session, self, limit) @@ -177,14 +186,18 @@ def concept_id(self, IDs: Sequence[str]) -> Self: * If providing a tool's concept ID, it will uniquely identify those tools. * If providing a service's concept ID, it will uniquely identify those services. - Parameters: + Parameters + ---------- IDs: ID(s) to search by. Can be provided as a string or list of strings. - Returns: + Returns + ------- self - Raises: + Raises + ------ ValueError: An ID does not start with a valid prefix. + """ return super().concept_id(IDs) @@ -194,11 +207,14 @@ def keyword(self, text: str) -> Self: a CMR collection record. This allows for searching against fields like summary and science keywords. - Parameters: + Parameters + ---------- text: text to search for - Returns: + Returns + ------- self + """ return super().keyword(text) @@ -210,14 +226,18 @@ def doi(self, doi: str) -> Self: only at the dataset level but not the granule (data) level. We need to search by DOI, grab the concept_id and then get the data. - Parameters: + Parameters + ---------- doi: DOI of a datasets, e.g. 10.5067/AQR50-3Q7CS - Returns: + Returns + ------- self - Raises: + Raises + ------ TypeError: `doi` is not of type `str`. + """ if not isinstance(doi, str): raise TypeError("doi must be of type str") @@ -232,14 +252,18 @@ def instrument(self, instrument: str) -> Self: Not all datasets have an associated instrument. This works only at the dataset level but not the granule (data) level. - Parameters: + Parameters + ---------- instrument (String): instrument of a datasets, e.g. instrument=GEDI - Returns: + Returns + ------- self - Raises: + Raises + ------ TypeError: `instrument` is not of type `str`. + """ if not isinstance(instrument, str): raise TypeError("instrument must be of type str") @@ -255,14 +279,18 @@ def project(self, project: str) -> Self: only at the dataset level but not the granule (data) level. Will return datasets across DAACs matching the project. - Parameters: + Parameters + ---------- project (String): associated project of a datasets, e.g. project=EMIT - Returns: + Returns + ------- self - Raises: + Raises + ------ TypeError: `project` is not of type `str`. + """ if not isinstance(project, str): raise TypeError("project must be of type str") @@ -284,13 +312,16 @@ def parameters(self, **kwargs: Any) -> Self: ) ``` - Returns: + Returns + ------- self - Raises: + Raises + ------ ValueError: The name of a keyword argument is not the name of a method. TypeError: The value of a keyword argument is not an argument or tuple of arguments matching the number and type(s) of the method's parameters. + """ methods = dict(getmembers(self, predicate=ismethod)) @@ -316,12 +347,15 @@ def print_help(self, method: str = "fields") -> None: def fields(self, fields: Optional[List[str]] = None) -> Self: """Masks the response by only showing the fields included in this list. - Parameters: + Parameters + ---------- fields (List): list of fields to show. These fields come from the UMM model (e.g. Abstract, Title). - Returns: + Returns + ------- self + """ self._fields = fields return self @@ -330,11 +364,14 @@ def debug(self, debug: bool = True) -> Self: """If True, prints the actual query to CMR. Note that the pagination happens in the headers. - Parameters: + Parameters + ---------- debug (Boolean): If `True`, print the CMR query. - Returns: + Returns + ------- self + """ self._debug = debug return self @@ -347,14 +384,18 @@ def cloud_hosted(self, cloud_hosted: bool = True) -> Self: Cloud hosted collections can be public or restricted. Restricted collections will not be matched using this parameter - Parameters: + Parameters + ---------- cloud_hosted: If `True`, obtain only cloud-hosted collections. - Returns: + Returns + ------- self - Raises: + Raises + ------ TypeError: `cloud_hosted` is not of type `bool`. + """ if not isinstance(cloud_hosted, bool): raise TypeError("cloud_hosted must be of type bool") @@ -373,11 +414,14 @@ def provider(self, provider: str) -> Self: E.g., PODAAC is a data center or DAAC; PODAAC is the default provider for on-premises data, POCLOUD is the PODAAC provider for their data in the cloud. - Parameters: + Parameters + ---------- provider: a provider code for any DAAC, e.g. POCLOUD, NSIDC_CPRD, etc. - Returns: + Returns + ------- self + """ self.params["provider"] = provider return self @@ -385,11 +429,14 @@ def provider(self, provider: str) -> Self: def data_center(self, data_center_name: str) -> Self: """An alias for the `daac` method. - Parameters: + Parameters + ---------- data_center_name: DAAC shortname, e.g. NSIDC, PODAAC, GESDISC - Returns: + Returns + ------- self + """ return self.daac(data_center_name) @@ -397,11 +444,14 @@ def daac(self, daac_short_name: str) -> Self: """Only match collections for a given DAAC, by default the on-prem collections for the DAAC. - Parameters: + Parameters + ---------- daac_short_name: a DAAC shortname, e.g. NSIDC, PODAAC, GESDISC - Returns: + Returns + ------- self + """ if "cloud_hosted" in self.params: cloud_hosted = self.params["cloud_hosted"] @@ -427,22 +477,25 @@ def temporal( Using `datetime.datetime(YYYY, MM, DD)` is different, because `datetime.datetime` objects have `00:00:00` as their built-in default. - Parameters: + Parameters + ---------- date_from: start of temporal range date_to: end of temporal range exclude_boundary: whether or not to exclude the date_from/to in the matched range. - Returns: + Returns + ------- self - Raises: + Raises + ------ ValueError: `date_from` or `date_to` is a non-`None` value that is neither a datetime object nor a string that can be parsed as a datetime object; or `date_from` and `date_to` are both datetime objects (or parsable as such) and `date_from` is after `date_to`. - """ + """ return super().temporal(date_from, date_to, exclude_boundary) @@ -475,13 +528,15 @@ def hits(self) -> int: This is done by making a lightweight query to CMR and inspecting the returned headers. - Returns: + Returns + ------- The number of results reported by the CMR. - Raises: + Raises + ------ RuntimeError: The CMR query failed. - """ + """ url = self._build_url() response = self.session.get(url, headers=self.headers, params={"page_size": 0}) @@ -507,14 +562,18 @@ def get(self, limit: int = 2000) -> List[DataGranule]: issue with granules than collections as they can be potentially millions of them. - Parameters: + Parameters + ---------- limit: The number of results to return. - Returns: + Returns + ------- Query results as a (possibly empty) list of `DataGranules` instances. - Raises: + Raises + ------ RuntimeError: The CMR query failed. + """ response = get_results(self.session, self, limit) @@ -537,13 +596,16 @@ def parameters(self, **kwargs: Any) -> Self: ) ``` - Returns: + Returns + ------- self - Raises: + Raises + ------ ValueError: The name of a keyword argument is not the name of a method. TypeError: The value of a keyword argument is not an argument or tuple of arguments matching the number and type(s) of the method's parameters. + """ methods = {} for name, func in getmembers(self, predicate=ismethod): @@ -570,11 +632,14 @@ def provider(self, provider: str) -> Self: PODAAC is the default provider for on-prem data, and POCLOUD is the PODAAC provider for their data in the cloud. - Parameters: + Parameters + ---------- provider: a provider code for any DAAC, e.g. POCLOUD, NSIDC_CPRD, etc. - Returns: + Returns + ------- self + """ self.params["provider"] = provider return self @@ -582,11 +647,14 @@ def provider(self, provider: str) -> Self: def data_center(self, data_center_name: str) -> Self: """An alias for the `daac` method. - Parameters: + Parameters + ---------- data_center_name (String): DAAC shortname, e.g. NSIDC, PODAAC, GESDISC - Returns: + Returns + ------- self + """ return self.daac(data_center_name) @@ -594,11 +662,14 @@ def daac(self, daac_short_name: str) -> Self: """Only match collections for a given DAAC. Default to on-prem collections for the DAAC. - Parameters: + Parameters + ---------- daac_short_name: a DAAC shortname, e.g. NSIDC, PODAAC, GESDISC - Returns: + Returns + ------- self + """ if "cloud_hosted" in self.params: cloud_hosted = self.params["cloud_hosted"] @@ -621,8 +692,10 @@ def orbit_number( orbit1: orbit to target (lower limit of range when orbit2 is provided) orbit2: upper limit of range - Returns: + Returns + ------- self + """ return super().orbit_number(orbit1, orbit2) @@ -635,14 +708,18 @@ def cloud_hosted(self, cloud_hosted: bool = True) -> Self: Cloud-hosted collections can be public or restricted. Restricted collections will not be matched using this parameter. - Parameters: + Parameters + ---------- cloud_hosted: If `True`, obtain only granules from cloud-hosted collections. - Returns: + Returns + ------- self - Raises: + Raises + ------ TypeError: `cloud_hosted` is not of type `bool`. + """ if not isinstance(cloud_hosted, bool): raise TypeError("cloud_hosted must be of type bool") @@ -663,14 +740,18 @@ def granule_name(self, granule_name: str) -> Self: We can use wildcards on a granule name to further refine our search, e.g. `MODGRNLD.*.daily.*`. - Parameters: + Parameters + ---------- granule_name: granule name (accepts wildcards) - Returns: + Returns + ------- self - Raises: + Raises + ------ TypeError: if `granule_name` is not of type `str` + """ if not isinstance(granule_name, str): raise TypeError("granule_name must be of type string") @@ -684,15 +765,19 @@ def online_only(self, online_only: bool = True) -> Self: """Only match granules that are listed online and not available for download. The inverse of this method is `downloadable`. - Parameters: + Parameters + ---------- online_only: If `True`, obtain only granules that are online (not downloadable) - Returns: + Returns + ------- self - Raises: + Raises + ------ TypeError: `online_only` is not of type `bool`. + """ return super().online_only(online_only) @@ -700,16 +785,20 @@ def online_only(self, online_only: bool = True) -> Self: def day_night_flag(self, day_night_flag: str) -> Self: """Filter by period of the day the granule was collected during. - Parameters: + Parameters + ---------- day_night_flag: "day", "night", or "unspecified" - Returns: + Returns + ------- self - Raises: + Raises + ------ TypeError: `day_night_flag` is not of type `str`. ValueError: `day_night_flag` is not one of `"day"`, `"night"`, or `"unspecified"`. + """ return super().day_night_flag(day_night_flag) @@ -717,14 +806,18 @@ def day_night_flag(self, day_night_flag: str) -> Self: def instrument(self, instrument: str) -> Self: """Filter by the instrument associated with the granule. - Parameters: + Parameters + ---------- instrument: name of the instrument - Returns: + Returns + ------- self - Raises: + Raises + ------ ValueError: `instrument` is not a non-empty string. + """ return super().instrument(instrument) @@ -732,14 +825,18 @@ def instrument(self, instrument: str) -> Self: def platform(self, platform: str) -> Self: """Filter by the satellite platform the granule came from. - Parameters: + Parameters + ---------- platform: name of the satellite - Returns: + Returns + ------- self - Raises: + Raises + ------ ValueError: `platform` is not a non-empty string. + """ return super().platform(platform) @@ -751,16 +848,20 @@ def cloud_cover( ) -> Self: """Filter by the percentage of cloud cover present in the granule. - Parameters: + Parameters + ---------- min_cover: minimum percentage of cloud cover max_cover: maximum percentage of cloud cover - Returns: + Returns + ------- self - Raises: + Raises + ------ ValueError: `min_cover` or `max_cover` is not convertible to a float, or `min_cover` is greater than `max_cover`. + """ return super().cloud_cover(min_cover, max_cover) @@ -791,11 +892,14 @@ def _is_cloud_hosted(self, granule: Any) -> bool: def short_name(self, short_name: str) -> Self: """Filter by short name (aka product or collection name). - Parameters: + Parameters + ---------- short_name: name of a collection - Returns: + Returns + ------- self + """ return super().short_name(short_name) @@ -803,11 +907,14 @@ def debug(self, debug: bool = True) -> Self: """If True, prints the actual query to CMR, notice that the pagination happens in the headers. - Parameters: + Parameters + ---------- debug: If `True`, print the CMR query. - Returns: + Returns + ------- self + """ self._debug = debug return self @@ -828,21 +935,24 @@ def temporal( Using `datetime.datetime(YYYY, MM, DD)` is different, because `datetime.datetime` objects have `00:00:00` as their built-in default. - Parameters: + Parameters + ---------- date_from: start of temporal range date_to: end of temporal range exclude_boundary: whether to exclude the date_from and date_to in the matched range - Returns: + Returns + ------- self - Raises: + Raises + ------ ValueError: `date_from` or `date_to` is a non-`None` value that is neither a datetime object nor a string that can be parsed as a datetime object; or `date_from` and `date_to` are both datetime objects (or parsable as such) and `date_from` is after `date_to`. - """ + """ return super().temporal(date_from, date_to, exclude_boundary) @override @@ -850,11 +960,14 @@ def version(self, version: str) -> Self: """Filter by version. Note that CMR defines this as a string. For example, MODIS version 6 products must be searched for with "006". - Parameters: + Parameters + ---------- version: version string - Returns: + Returns + ------- self + """ return super().version(version) @@ -862,15 +975,19 @@ def version(self, version: str) -> Self: def point(self, lon: FloatLike, lat: FloatLike) -> Self: """Filter by granules that include a geographic point. - Parameters: + Parameters + ---------- lon: longitude of geographic point lat: latitude of geographic point - Returns: + Returns + ------- self - Raises: + Raises + ------ ValueError: `lon` or `lat` cannot be converted to a float. + """ return super().point(lon, lat) @@ -879,16 +996,20 @@ def polygon(self, coordinates: Sequence[PointLike]) -> Self: """Filter by granules that overlap a polygonal area. Must be used in combination with a collection filtering parameter such as short_name or entry_title. - Parameters: + Parameters + ---------- coordinates: list of (lon, lat) tuples - Returns: + Returns + ------- self - Raises: + Raises + ------ ValueError: `coordinates` is not a sequence of at least 4 coordinate pairs, any of the coordinates cannot be converted to a float, or the first and last coordinate pairs are not equal. + """ return super().polygon(coordinates) @@ -903,17 +1024,21 @@ def bounding_box( """Filter by granules that overlap a bounding box. Must be used in combination with a collection filtering parameter such as short_name or entry_title. - Parameters: + Parameters + ---------- lower_left_lon: lower left longitude of the box lower_left_lat: lower left latitude of the box upper_right_lon: upper right longitude of the box upper_right_lat: upper right latitude of the box - Returns: + Returns + ------- self - Raises: + Raises + ------ ValueError: A coordinate could not be converted to a float. + """ return super().bounding_box( lower_left_lon, lower_left_lat, upper_right_lon, upper_right_lat @@ -925,15 +1050,19 @@ def line(self, coordinates: Sequence[PointLike]) -> Self: in combination with a collection filtering parameter such as short_name or entry_title. - Parameters: + Parameters + ---------- coordinates: a list of (lon, lat) tuples - Returns: + Returns + ------- self - Raises: + Raises + ------ ValueError: `coordinates` is not a sequence of at least 2 coordinate pairs, or a coordinate could not be converted to a float. + """ return super().line(coordinates) @@ -942,14 +1071,18 @@ def downloadable(self, downloadable: bool = True) -> Self: """Only match granules that are available for download. The inverse of this method is `online_only`. - Parameters: + Parameters + ---------- downloadable: If `True`, obtain only granules that are downloadable. - Returns: + Returns + ------- self - Raises: + Raises + ------ TypeError: `downloadable` is not of type `bool`. + """ return super().downloadable(downloadable) @@ -960,16 +1093,19 @@ def doi(self, doi: str) -> Self: Not all datasets have an associated DOI, internally if a DOI is found earthaccess will grab the concept_id for the query to CMR. - Parameters: + Parameters + ---------- doi: DOI of a dataset, e.g. 10.5067/AQR50-3Q7CS - Returns: + Returns + ------- self - Raises: + Raises + ------ RuntimeError: The CMR query to get the collection for the DOI fails. - """ + """ # TODO consider deferring this query until the search is executed collection = DataCollections().doi(doi).get() diff --git a/earthaccess/store.py b/earthaccess/store.py index fa0e8a99..a7103d54 100644 --- a/earthaccess/store.py +++ b/earthaccess/store.py @@ -95,8 +95,10 @@ class Store(object): def __init__(self, auth: Any, pre_authorize: bool = False) -> None: """Store is the class to access data. - Parameters: + Parameters + ---------- auth: Auth instance to download and access data. + """ if auth.authenticated is True: self.auth = auth @@ -169,13 +171,16 @@ def set_requests_session( This HTTPS session can be used to download granules if we want to use a direct, lower level API. - Parameters: + Parameters + ---------- url: used to test the credentials and populate the class auth cookies method: HTTP method to test, default: "GET" bearer_token: if true, will be used for authenticated queries on CMR - Returns: + Returns + ------- fsspec HTTPFileSystem (aiohttp client session) + """ if not hasattr(self, "_http_session"): self._http_session = self.auth.get_session(bearer_token) @@ -205,13 +210,16 @@ def get_s3fs_session( ) -> s3fs.S3FileSystem: """Returns a s3fs instance for a given cloud provider / DAAC. - Parameters: + Parameters + ---------- daac: any of the DAACs, e.g. NSIDC, PODAAC provider: a data provider if we know them, e.g. PODAAC -> POCLOUD endpoint: pass the URL for the credentials directly - Returns: + Returns + ------- a s3fs file instance + """ if self.auth is None: raise ValueError( @@ -268,8 +276,10 @@ def get_fsspec_session(self) -> fsspec.AbstractFileSystem: This HTTPS session can be used to download granules if we want to use a direct, lower level API. - Returns: + Returns + ------- fsspec HTTPFileSystem (aiohttp client session) + """ token = self.auth.token["access_token"] client_kwargs = { @@ -286,11 +296,14 @@ def get_requests_session(self, bearer_token: bool = True) -> requests.Session: This HTTPS session can be used to download granules if we want to use a direct, lower level API. - Parameters: + Parameters + ---------- bearer_token: if true, will be used for authenticated queries on CMR - Returns: + Returns + ------- requests Session + """ return self.auth.get_session() @@ -302,13 +315,16 @@ def open( """Returns a list of fsspec file-like objects that can be used to access files hosted on S3 or HTTPS by third party libraries like xarray. - Parameters: + Parameters + ---------- granules: a list of granules(DataGranule) instances or list of URLs, e.g. s3://some-granule provider: an option - Returns: + Returns + ------- A list of s3fs "file pointers" to s3 files. + """ if len(granules): return self._open(granules, provider) @@ -323,13 +339,16 @@ def _open( """Returns a list of fsspec file-like objects that can be used to access files hosted on S3 or HTTPS by third party libraries like xarray. - Parameters: + Parameters + ---------- granules: a list of granules(DataGranule) instances or list of URLs, e.g. s3://some-granule provider: an option - Returns: + Returns + ------- A list of s3fs "file pointers" to s3 files. + """ raise NotImplementedError("granules should be a list of DataGranule or URLs") @@ -458,15 +477,18 @@ def get( * If we request data granules from an on-prem collection, the data will be effectively downloaded to a local directory. - Parameters: + Parameters + ---------- granules: A list of granules(DataGranule) instances or a list of granule links (HTTP). local_path: Local directory to store the remote data granules. provider: a valid cloud provider, each DAAC has a provider code for their cloud distributions threads: Parallel number of threads to use to download the files; adjust as necessary, default = 8. - Returns: + Returns + ------- List of downloaded files + """ if local_path is None: today = datetime.datetime.today().strftime("%Y-%m-%d") @@ -498,15 +520,18 @@ def _get( * If we request data granules from an on-prem collection, the data will be effectively downloaded to a local directory. - Parameters: + Parameters + ---------- granules: A list of granules (DataGranule) instances or a list of granule links (HTTP). local_path: Local directory to store the remote data granules provider: a valid cloud provider, each DAAC has a provider code for their cloud distributions threads: Parallel number of threads to use to download the files; adjust as necessary, default = 8. - Returns: + Returns + ------- None + """ raise NotImplementedError(f"Cannot _get {granules}") @@ -592,12 +617,15 @@ def _get_granules( def _download_file(self, url: str, directory: Path) -> str: """Download a single file from an on-prem location, a DAAC data center. - Parameters: + Parameters + ---------- url: the granule url directory: local directory - Returns: + Returns + ------- A local filepath or an exception. + """ # If the get data link is an Opendap location if "opendap" in url and url.endswith(".html"): @@ -629,14 +657,17 @@ def _download_onprem_granules( ) -> List[Any]: """Downloads a list of URLS into the data directory. - Parameters: + Parameters + ---------- urls: list of granule URLs from an on-prem collection directory: local directory to store the downloaded files threads: parallel number of threads to use to download the files; adjust as necessary, default = 8 - Returns: + Returns + ------- A list of local filepaths to which the files were downloaded. + """ if urls is None: raise ValueError("The granules didn't provide a valid GET DATA link") diff --git a/pyproject.toml b/pyproject.toml index b4b6a4a4..fb261ca0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -126,7 +126,7 @@ src = ["earthaccess", "stubs", "tests"] [tool.ruff.lint] extend-select = ["I", "T20", "D"] - +ignore = ["D1", "D205", "D401", "D417"] [tool.ruff.lint.isort] combine-as-imports = true diff --git a/tests/integration/test_cloud_download.py b/tests/integration/test_cloud_download.py index a9b9432c..4e8f9519 100644 --- a/tests/integration/test_cloud_download.py +++ b/tests/integration/test_cloud_download.py @@ -72,9 +72,8 @@ def get_sample_granules(granules, sample_size, max_granule_size): - """ - returns a list with sample granules and their size in MB if - the total size is less than the max_granule_size + """Returns a list with sample granules and their size in MB if + the total size is less than the max_granule_size. """ files_to_download = [] total_size = 0 @@ -98,9 +97,7 @@ def get_sample_granules(granules, sample_size, max_granule_size): @pytest.mark.parametrize("daac", daac_list) def test_earthaccess_can_download_cloud_collection_granules(daac): - """ - Tests that we can download cloud collections using HTTPS links - """ + """Tests that we can download cloud collections using HTTPS links.""" daac_shortname = daac["short_name"] collections_count = daac["collections_count"] collections_sample_size = daac["collections_sample_size"] diff --git a/tests/integration/test_cloud_open.py b/tests/integration/test_cloud_open.py index 78050f22..b69eba15 100644 --- a/tests/integration/test_cloud_open.py +++ b/tests/integration/test_cloud_open.py @@ -71,9 +71,8 @@ def get_sample_granules(granules, sample_size, max_granule_size): - """ - returns a list with sample granules and their size in MB if - the total size is less than the max_granule_size + """Returns a list with sample granules and their size in MB if + the total size is less than the max_granule_size. """ files_to_download = [] total_size = 0 @@ -104,9 +103,7 @@ def supported_collection(data_links): @pytest.mark.parametrize("daac", daacs_list) def test_earthaccess_can_open_onprem_collection_granules(daac): - """ - Tests that we can download cloud collections using HTTPS links - """ + """Tests that we can download cloud collections using HTTPS links.""" daac_shortname = daac["short_name"] collections_count = daac["collections_count"] collections_sample_size = daac["collections_sample_size"] diff --git a/tests/integration/test_onprem_download.py b/tests/integration/test_onprem_download.py index d54404c8..242a3c26 100644 --- a/tests/integration/test_onprem_download.py +++ b/tests/integration/test_onprem_download.py @@ -64,9 +64,8 @@ def get_sample_granules(granules, sample_size, max_granule_size): - """ - returns a list with sample granules and their size in MB if - the total size is less than the max_granule_size + """Returns a list with sample granules and their size in MB if + the total size is less than the max_granule_size. """ files_to_download = [] total_size = 0 @@ -97,9 +96,7 @@ def supported_collection(data_links): @pytest.mark.parametrize("daac", daacs_list) def test_earthaccess_can_download_onprem_collection_granules(daac): - """ - Tests that we can download cloud collections using HTTPS links - """ + """Tests that we can download cloud collections using HTTPS links.""" daac_shortname = daac["short_name"] collections_count = daac["collections_count"] collections_sample_size = daac["collections_sample_size"] diff --git a/tests/integration/test_onprem_open.py b/tests/integration/test_onprem_open.py index 02a2c60a..2a455c44 100644 --- a/tests/integration/test_onprem_open.py +++ b/tests/integration/test_onprem_open.py @@ -63,9 +63,8 @@ def get_sample_granules(granules, sample_size, max_granule_size): - """ - returns a list with sample granules and their size in MB if - the total size is less than the max_granule_size + """Returns a list with sample granules and their size in MB if + the total size is less than the max_granule_size. """ files_to_download = [] total_size = 0 @@ -96,9 +95,7 @@ def supported_collection(data_links): @pytest.mark.parametrize("daac", daacs_list) def test_earthaccess_can_open_onprem_collection_granules(daac): - """ - Tests that we can download cloud collections using HTTPS links - """ + """Tests that we can download cloud collections using HTTPS links.""" daac_shortname = daac["short_name"] collections_count = daac["collections_count"] collections_sample_size = daac["collections_sample_size"] diff --git a/tests/unit/test_results.py b/tests/unit/test_results.py index c669e8d7..b55fd32f 100644 --- a/tests/unit/test_results.py +++ b/tests/unit/test_results.py @@ -14,8 +14,7 @@ def unique_results(results): - """ - When we invoke a search request multiple times we want to ensure that we don't + """When we invoke a search request multiple times we want to ensure that we don't get the same results back. This is a one shot test as the results are preserved by VCR but still useful. """ @@ -120,10 +119,9 @@ def test_data_links(self): ) def test_get_more_than_2000(self): - """ - If we execute a get with a limit of more than 2000 + """If we execute a get with a limit of more than 2000 then we expect multiple invocations of a cmr granule search and - to not fetch back more results than we ask for + to not fetch back more results than we ask for. """ granules = earthaccess.search_data(short_name="MOD02QKM", count=3000) @@ -133,10 +131,9 @@ def test_get_more_than_2000(self): self.assertTrue(unique_results(granules)) def test_get(self): - """ - If we execute a get with no arguments then we expect + """If we execute a get with no arguments then we expect to get the maximum no. of granules from a single CMR call (2000) - in a single request + in a single request. """ granules = earthaccess.search_data(short_name="MOD02QKM", count=2000) @@ -146,10 +143,9 @@ def test_get(self): self.assertTrue(unique_results(granules)) def test_get_all_less_than_2k(self): - """ - If we execute a get_all then we expect multiple + """If we execute a get_all then we expect multiple invocations of a cmr granule search and - to not fetch back more results than we ask for + to not fetch back more results than we ask for. """ granules = earthaccess.search_data( short_name="TELLUS_GRAC_L3_JPL_RL06_LND_v04", count=2000 @@ -161,10 +157,9 @@ def test_get_all_less_than_2k(self): self.assertTrue(unique_results(granules)) def test_get_all_more_than_2k(self): - """ - If we execute a get_all then we expect multiple + """If we execute a get_all then we expect multiple invocations of a cmr granule search and - to not fetch back more results than we ask for + to not fetch back more results than we ask for. """ granules = earthaccess.search_data( short_name="CYGNSS_NOAA_L2_SWSP_25KM_V1.2", count=3000 @@ -182,10 +177,9 @@ def test_get_all_more_than_2k(self): self.assertTrue(unique_results(granules)) def test_collections_less_than_2k(self): - """ - If we execute a get_all then we expect multiple + """If we execute a get_all then we expect multiple invocations of a cmr granule search and - to not fetch back more results than we ask for + to not fetch back more results than we ask for. """ query = DataCollections().daac("PODAAC").cloud_hosted(True) collections = query.get(20) @@ -197,10 +191,9 @@ def test_collections_less_than_2k(self): self.assert_is_using_search_after(self.cassette) def test_collections_more_than_2k(self): - """ - If we execute a get_all then we expect multiple + """If we execute a get_all then we expect multiple invocations of a cmr granule search and - to not fetch back more results than we ask for + to not fetch back more results than we ask for. """ query = DataCollections() collections = query.get(3000)