diff --git a/doc/source/user_guide/documentation/classes_dev_uml.svg b/doc/source/user_guide/documentation/classes_dev_uml.svg index 09c112f5c..84fb2a450 100644 --- a/doc/source/user_guide/documentation/classes_dev_uml.svg +++ b/doc/source/user_guide/documentation/classes_dev_uml.svg @@ -235,7 +235,7 @@ Read -_filelist : list +_filelist _out_obj : Dataset _product _read_vars diff --git a/doc/source/user_guide/documentation/components.rst b/doc/source/user_guide/documentation/components.rst index 76ffcbb2d..7d81c190c 100644 --- a/doc/source/user_guide/documentation/components.rst +++ b/doc/source/user_guide/documentation/components.rst @@ -27,14 +27,6 @@ granules :undoc-members: :show-inheritance: -is2cat ------- - -.. automodule:: icepyx.core.is2cat - :members: - :undoc-members: - :show-inheritance: - is2ref ------ diff --git a/doc/source/user_guide/documentation/query.rst b/doc/source/user_guide/documentation/query.rst index 804de27e5..df82aa35b 100644 --- a/doc/source/user_guide/documentation/query.rst +++ b/doc/source/user_guide/documentation/query.rst @@ -23,7 +23,6 @@ Attributes Query.cycles Query.dates Query.end_time - Query.file_vars Query.granules Query.order_vars Query.product diff --git a/icepyx/core/is2ref.py b/icepyx/core/is2ref.py index c51c631be..66ceaec1c 100644 --- a/icepyx/core/is2ref.py +++ b/icepyx/core/is2ref.py @@ -378,8 +378,10 @@ def extract_product(filepath, auth=None): # ATL14 saves the short_name as an array ['ATL14'] product = product[0] product = _validate_product(product) - except KeyError: - raise "Unable to parse the product name from file metadata" + except KeyError as e: + raise Exception( + "Unable to parse the product name from file metadata" + ).with_traceback(e.__traceback__) # Close the file reader f.close() @@ -421,8 +423,10 @@ def extract_version(filepath, auth=None): if isinstance(version, bytes): version = version.decode() - except KeyError: - raise "Unable to parse the version from file metadata" + except KeyError as e: + raise Exception( + "Unable to parse the version from file metadata" + ).with_traceback(e.__traceback__) # Close the file reader f.close() diff --git a/icepyx/core/read.py b/icepyx/core/read.py index e11015935..8cc5afd21 100644 --- a/icepyx/core/read.py +++ b/icepyx/core/read.py @@ -1,4 +1,3 @@ -import fnmatch import glob import os import sys @@ -36,7 +35,8 @@ def _make_np_datetime(df, keyword): Example ------- - >>> ds = xr.Dataset({"time": ("time_idx", [b'2019-01-11T05:26:31.323722Z'])}, coords={"time_idx": [0]}) + >>> ds = xr.Dataset({"time": ("time_idx", [b'2019-01-11T05:26:31.323722Z'])}, + ... coords={"time_idx": [0]}) >>> _make_np_datetime(ds, "time") Dimensions: (time_idx: 1) @@ -48,7 +48,8 @@ def _make_np_datetime(df, keyword): """ if df[keyword].str.endswith("Z"): - # manually remove 'Z' from datetime to allow conversion to np.datetime64 object (support for timezones is deprecated and causes a seg fault) + # manually remove 'Z' from datetime to allow conversion to np.datetime64 object + # (support for timezones is deprecated and causes a seg fault) df.update({keyword: df[keyword].str[:-1].astype(np.datetime64)}) else: @@ -100,165 +101,51 @@ def _get_track_type_str(grp_path) -> (str, str, str): return track_str, spot_dim_name, spot_var_name -# Dev note: function fully tested (except else, which don't know how to get to) -def _check_datasource(filepath): +def _parse_source(data_source, glob_kwargs={}) -> list: """ - Determine if the input is from a local system or is an s3 bucket. 
- Then, validate the inputs (for those on the local system; s3 sources are not validated currently) - """ - - from pathlib import Path - - import fsspec - from fsspec.implementations.local import LocalFileSystem - - source_types = ["is2_local", "is2_s3"] - - if not isinstance(filepath, Path) and not isinstance(filepath, str): - raise TypeError("filepath must be a string or Path") - - fsmap = fsspec.get_mapper(str(filepath)) - output_fs = fsmap.fs - - if "s3" in output_fs.protocol: - return source_types[1] - elif isinstance(output_fs, LocalFileSystem): - assert _validate_source(filepath) - return source_types[0] - else: - raise ValueError("Could not confirm the datasource type.") + Parse the user's data_source input based on type. - """ - Could also use: os.path.splitext(f.name)[1].lower() to get file extension - - If ultimately want to handle mixed types, save the valid paths in a dict with "s3" or "local" as the keys and the list of the files as the values. - Then the dict can also contain a catalog key with a dict of catalogs for each of those types of inputs ("s3" or "local") - In general, the issue we'll run into with multiple files is going to be merging during the read in, - so it could be beneficial to not hide this too much and mandate users handle this intentionally outside the read in itself. - - this function was derived with some of the following resources, based on echopype - https://github.com/OSOceanAcoustics/echopype/blob/ab5128fb8580f135d875580f0469e5fba3193b84/echopype/utils/io.py - - https://filesystem-spec.readthedocs.io/en/latest/api.html?highlight=get_map#fsspec.spec.AbstractFileSystem.glob - - https://filesystem-spec.readthedocs.io/en/latest/_modules/fsspec/implementations/local.html - - https://github.com/OSOceanAcoustics/echopype/blob/ab5128fb8580f135d875580f0469e5fba3193b84/echopype/convert/api.py#L380 - - https://echopype.readthedocs.io/en/stable/convert.html - """ - - -# Dev note: function fully tested as currently written -def _validate_source(source): - """ - Check that the entered data source paths on the local file system are valid - - Currently, s3 data source paths are not validated. - """ - - # acceptable inputs (for now) are a single file or directory - # would ultimately like to make a Path (from pathlib import Path; isinstance(source, Path)) an option - # see https://github.com/OSOceanAcoustics/echopype/blob/ab5128fb8580f135d875580f0469e5fba3193b84/echopype/utils/io.py#L82 - assert isinstance(source, str), "You must enter your input as a string." - assert ( - os.path.isdir(source) is True or os.path.isfile(source) is True - ), "Your data source string is not a valid data source." - return True - - -# Dev Note: function is tested (at least loosely) -def _run_fast_scandir(dir, fn_glob): - """ - Quickly scan nested directories to get a list of filenames that match the fn_glob string. - Modified from https://stackoverflow.com/a/59803793/2441026 - (faster than os.walk or glob methods, and allows filename matching in subdirectories). 
- - Parameters - ---------- - dir : str - full path to the input directory - - fn_glob : str - glob-style filename pattern - - Outputs + Returns ------- - subfolders : list - list of strings of all nested subdirectories - - files : list - list of strings containing full paths to each file matching the filename pattern - """ - - subfolders, files = [], [] - - for f in os.scandir(dir): - if any(f.name.startswith(s) for s in ["__", "."]): - continue - if f.is_dir(): - subfolders.append(f.path) - if f.is_file(): - if fnmatch.fnmatch(f.name, fn_glob): - files.append(f.path) - - for dir in list(subfolders): - sf, f = _run_fast_scandir(dir, fn_glob) - subfolders.extend(sf) - files.extend(f) - - return subfolders, files - - -# Need to post on intake's page to see if this would be a useful contribution... -# https://github.com/intake/intake/blob/0.6.4/intake/source/utils.py#L216 -def _pattern_to_glob(pattern): + filelist : list of str + List of granule (filenames) to be read in """ - Adapted from intake.source.utils.path_to_glob to convert a path as pattern into a glob style path - that uses the pattern's indicated number of '?' instead of '*' where an int was specified. - Returns pattern if pattern is not a string. - - Parameters - ---------- - pattern : str - Path as pattern optionally containing format_strings + from pathlib import Path - Returns - ------- - glob_path : str - Path with int format strings replaced with the proper number of '?' and '*' otherwise. + if isinstance(data_source, list): + assert [isinstance(f, (str, Path)) for f in data_source] + # if data_source is a list pass that directly to _filelist + filelist = data_source + elif os.path.isdir(data_source): + # if data_source is a directory glob search the directory and assign to _filelist + data_source = os.path.join(data_source, "*") + filelist = glob.glob(data_source, **glob_kwargs) + elif isinstance(data_source, str) or isinstance(data_source, Path): + if data_source.startswith("s3"): + # if the string is an s3 path put it in the _filelist without globbing + filelist = [data_source] + else: + # data_source is a globable string + filelist = glob.glob(data_source, **glob_kwargs) + else: + raise TypeError( + "data_source should be a list of files, a directory, the path to a file, " + "or a glob string." + ) - Examples - -------- - >>> _pattern_to_glob('{year}/{month}/{day}.csv') - '*/*/*.csv' - >>> _pattern_to_glob('{year:4}/{month:2}/{day:2}.csv') - '????/??/??.csv' - >>> _pattern_to_glob('data/{year:4}{month:02}{day:02}.csv') - 'data/????????.csv' - >>> _pattern_to_glob('data/*.csv') - 'data/*.csv' - """ - from string import Formatter + # Remove any directories from the list (these get generated during recursive + # glob search) + filelist = [f for f in filelist if not os.path.isdir(f)] - if not isinstance(pattern, str): - return pattern + # Make sure a non-zero number of files were found + if len(filelist) == 0: + raise KeyError( + "No files found matching the specified `data_source`. Check your glob " + "string or file list." + ) - fmt = Formatter() - glob_path = "" - # prev_field_name = None - for literal_text, field_name, format_specs, _ in fmt.parse(format_string=pattern): - glob_path += literal_text - if field_name and (glob_path != "*"): - try: - glob_path += "?" 
* int(format_specs) - except ValueError: - glob_path += "*" - # alternatively, you could use bits=utils._get_parts_of_format_string(resolved_string, literal_texts, format_specs) - # and then use len(bits[i]) to get the length of each format_spec - # print(glob_path) - return glob_path + return filelist def _confirm_proceed(): @@ -282,8 +169,8 @@ class Read(EarthdataAuthMixin): Parameters ---------- - data_source : string, List - A string or list which specifies the files to be read. + data_source : string, Path, List + A string, pathlib.Path object, or list which specifies the files to be read. The string can be either: 1) the path of a single file 2) the path to a directory or @@ -291,7 +178,8 @@ class Read(EarthdataAuthMixin): The List must be a list of strings, each of which is the path of a single file. glob_kwargs : dict, default {} - Additional arguments to be passed into the [glob.glob()](https://docs.python.org/3/library/glob.html#glob.glob)function + Additional arguments to be passed into the + [glob.glob()](https://docs.python.org/3/library/glob.html#glob.glob)function out_obj_type : object, default xarray.Dataset The desired format for the data to be read in. @@ -326,7 +214,8 @@ class Read(EarthdataAuthMixin): Reading all files in a directory >>> ipx.Read('/path/to/data/') # doctest: +SKIP - Reading files that match a particular pattern (here, all .h5 files that start with `processed_ATL06_`). + Reading files that match a particular pattern + (here, all .h5 files that start with `processed_ATL06_`). >>> ipx.Read('/path/to/data/processed_ATL06_*.h5') # doctest: +SKIP Reading a specific list of files @@ -370,29 +259,7 @@ def __init__( "Please use the `data_source` argument to specify your dataset instead." ) - if isinstance(data_source, list): - # if data_source is a list pass that directly to _filelist - self._filelist = data_source - elif os.path.isdir(data_source): - # if data_source is a directory glob search the directory and assign to _filelist - data_source = os.path.join(data_source, "*") - self._filelist = glob.glob(data_source, **glob_kwargs) - elif isinstance(data_source, str): - if data_source.startswith("s3"): - # if the string is an s3 path put it in the _filelist without globbing - self._filelist = [data_source] - else: - # data_source is a globable string - self._filelist = glob.glob(data_source, **glob_kwargs) - else: - raise TypeError( - "data_source should be a list of files, a directory, the path to a file, " - "or a glob string." - ) - - # Remove any directories from the list (these get generated during recursive - # glob search) - self._filelist = [f for f in self._filelist if not os.path.isdir(f)] + self._filelist = _parse_source(data_source, glob_kwargs) # Create a dictionary of the products as read from the metadata product_dict = {} @@ -423,7 +290,7 @@ def __init__( ) _confirm_proceed() - # Raise warnings or errors for multiple products or products not matching the user-specified product + # Raise error if multiple products given all_products = list(set(product_dict.values())) if len(all_products) > 1: raise TypeError( @@ -431,14 +298,9 @@ def __init__( "Please provide a valid `data_source` parameter indicating files of a single " "product" ) - elif len(all_products) == 0: - raise TypeError( - "No files found matching the specified `data_source`. Check your glob " - "string or file list." 
- ) - else: - # Assign the identified product to the property - self._product = all_products[0] + + # Assign the identified product to the property + self._product = all_products[0] if out_obj_type is not None: print( @@ -454,7 +316,8 @@ def __init__( def vars(self): """ Return the variables object associated with the data being read in. - This instance is generated from the source file or first file in a list of input files (when source is a directory). + This instance is generated from the source file or first file in a list of input files + (when source is a directory). See Also -------- @@ -507,7 +370,8 @@ def _add_vars_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict): the second list contains the second portion of the group name, etc. "none" is used to fill in where paths are shorter than the longest path. wanted_dict : dict - Dictionary with variable names as keys and a list of group + variable paths containing those variables as values. + Dictionary with variable names as keys and a list of group + + variable paths containing those variables as values. Returns ------- @@ -600,7 +464,8 @@ def _add_vars_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict): ) ) - # for the subgoups where there is 1d delta time data, make sure that the cycle number is still a coordinate for merging + # for the subgoups where there is 1d delta time data, + # make sure that the cycle number is still a coordinate for merging try: ds = ds.assign_coords( { @@ -643,14 +508,16 @@ def _combine_nested_vars(is2ds, ds, grp_path, wanted_dict): grp_path : str hdf5 group path read into ds wanted_dict : dict - Dictionary with variable names as keys and a list of group + variable paths containing those variables as values. + Dictionary with variable names as keys and a list of group + + variable paths containing those variables as values. Returns ------- Xarray Dataset with variables from the ds variable group added. """ - # Dev Goal: improve this type of iterating to minimize amount of looping required. Would a path handling library be useful here? + # Dev Goal: improve this type of iterating to minimize amount of looping required. + # Would a path handling library be useful here? grp_spec_vars = [ k for k, v in wanted_dict.items() if any(f"{grp_path}/{k}" in x for x in v) ] @@ -682,7 +549,8 @@ def _combine_nested_vars(is2ds, ds, grp_path, wanted_dict): def load(self): """ - Create a single Xarray Dataset containing the data from one or more files and/or ground tracks. + Create a single Xarray Dataset containing the data from one or more + files and/or ground tracks. Uses icepyx's ICESat-2 data product awareness and Xarray's `combine_by_coords` function. All items in the wanted variables list will be loaded from the files into memory. @@ -778,8 +646,6 @@ def _build_dataset_template(self, file): It may be possible to expand this function to provide multiple templates. """ - # NOTE: use the hdf5 library to grab the attr for the product specifier - # can ultimately then use it to check against user specified one or merge strategies (or to return a list of ds) is2ds = xr.Dataset( coords=dict( @@ -798,7 +664,8 @@ def _read_single_grp(self, file, grp_path): ---------- file : str Full path to ICESat-2 data file. - Currently tested for locally downloaded files; untested but hopefully works for s3 stored cloud files. + Currently tested for locally downloaded files; + untested but hopefully works for s3 stored cloud files. grp_path : str Full string to a variable group. E.g. 
'gt1l/land_ice_segments' @@ -818,23 +685,27 @@ def _read_single_grp(self, file, grp_path): def _build_single_file_dataset(self, file, groups_list): """ - Create a single xarray dataset with all of the wanted variables/groups from the wanted var list for a single data file/url. + Create a single xarray dataset with all of the wanted variables/groups + from the wanted var list for a single data file/url. Parameters ---------- file : str Full path to ICESat-2 data file. - Currently tested for locally downloaded files; untested but hopefully works for s3 stored cloud files. + Currently tested for locally downloaded files; + untested but hopefully works for s3 stored cloud files. groups_list : list of strings List of full paths to data variables within the file. - e.g. ['orbit_info/sc_orient', 'gt1l/land_ice_segments/h_li', 'gt1l/land_ice_segments/latitude', 'gt1l/land_ice_segments/longitude'] + e.g. ['orbit_info/sc_orient', 'gt1l/land_ice_segments/h_li', + 'gt1l/land_ice_segments/latitude', 'gt1l/land_ice_segments/longitude'] Returns ------- Xarray Dataset """ - # DEVNOTE: if and elif does not actually apply wanted variable list, and has not been tested for merging multiple files into one ds + # DEVNOTE: if and elif does not actually apply wanted variable list, + # and has not been tested for merging multiple files into one ds # if a gridded product # TODO: all products need to be tested, and quicklook products added or explicitly excluded # Level 3b, gridded (netcdf): ATL14, 15, 16, 17, 18, 19, 20, 21 @@ -861,13 +732,14 @@ def _build_single_file_dataset(self, file, groups_list): ) wanted_groups_set = set(wanted_groups) - # orbit_info is used automatically as the first group path so the info is available for the rest of the groups + # orbit_info is used automatically as the first group path + # so the info is available for the rest of the groups # wanted_groups_set.remove("orbit_info") wanted_groups_set.remove("ancillary_data") # Note: the sorting is critical for datasets with highly nested groups wanted_groups_list = ["ancillary_data"] + sorted(wanted_groups_set) - # returns the wanted groups as a list of lists with group path string elements separated + # returns wanted groups as a list of lists with group path string elements separated _, wanted_groups_tiered = Variables.parse_var_list( groups_list, tiered=True, tiered_vars=True ) @@ -892,14 +764,15 @@ def _build_single_file_dataset(self, file, groups_list): groups_list, tiered=False ) wanted_groups_set = set(wanted_groups) - # orbit_info is used automatically as the first group path so the info is available for the rest of the groups + # orbit_info is used automatically as the first group path + # so the info is available for the rest of the groups wanted_groups_set.remove("orbit_info") wanted_groups_set.remove("ancillary_data") # Note: the sorting is critical for datasets with highly nested groups wanted_groups_list = ["orbit_info", "ancillary_data"] + sorted( wanted_groups_set ) - # returns the wanted groups as a list of lists with group path string elements separated + # returns wanted groups as a list of lists with group path string elements separated _, wanted_groups_tiered = Variables.parse_var_list( groups_list, tiered=True, tiered_vars=True ) @@ -912,7 +785,8 @@ def _build_single_file_dataset(self, file, groups_list): is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict ) - # if there are any deeper nested variables, get those so they have actual coordinates and add them + # if there are any deeper nested variables, + # get those 
so they have actual coordinates and add them # this may apply to (at a minimum): ATL08 if any(grp_path in grp_path2 for grp_path2 in wanted_groups_list): for grp_path2 in wanted_groups_list: diff --git a/icepyx/tests/test_read.py b/icepyx/tests/test_read.py index 67b29b598..20807c410 100644 --- a/icepyx/tests/test_read.py +++ b/icepyx/tests/test_read.py @@ -1,97 +1,85 @@ import pytest -from icepyx.core.read import Read import icepyx.core.read as read -def test_check_datasource_type(): - ermesg = "filepath must be a string or Path" +# note isdir will issue a TypeError if a tuple is passed +def test_parse_source_bad_input_type(): + ermesg = ( + "data_source should be a list of files, a directory, the path to a file, " + "or a glob string." + ) with pytest.raises(TypeError, match=ermesg): - read._check_datasource(246) - - -@pytest.mark.parametrize( - "filepath, expect", - [ - ("./", "is2_local"), - ( - """s3://nsidc-cumulus-prod-protected/ATLAS/ - ATL03/006/2019/11/30/ATL03_20191130221008_09930503_006_01.h5""", - "is2_s3", - ), - ], -) -def test_check_datasource(filepath, expect): - source_type = read._check_datasource(filepath) - assert source_type == expect - - -# not sure what I could enter here would get to the else... -# def test_unknown_datasource_type(): -# ermesg = "Could not confirm the datasource type." -# with pytest.raises(ValueError, match=ermesg): -# read._check_datasource("") - + read._parse_source(150) + read._parse_source({"myfiles": "./my_valid_path/file.h5"}) -def test_validate_source_str_given_as_list(): - ermesg = "You must enter your input as a string." - with pytest.raises(AssertionError, match=ermesg): - read._validate_source(["/path/to/valid/ATL06_file.py"]) - -def test_validate_source_str_not_a_dir_or_file(): - ermesg = "Your data source string is not a valid data source." - with pytest.raises(AssertionError, match=ermesg): - read._validate_source("./fake/dirpath") - read._validate_source("./fake_file.h5") +def test_parse_source_no_files(): + ermesg = ( + "No files found matching the specified `data_source`. Check your glob " + "string or file list." 
+ ) + with pytest.raises(KeyError, match=ermesg): + read._parse_source("./icepyx/bogus_glob") @pytest.mark.parametrize( - "dir, fn_glob, expect", + "source, expect", [ - ( - "./icepyx/", - "is2*.py", - ( - sorted( - [ - "./icepyx/core", - "./icepyx/quest", - "./icepyx/quest/dataset_scripts", - "./icepyx/tests", - ] - ), - sorted( - [ - "./icepyx/core/is2ref.py", - "./icepyx/tests/is2class_query.py", - ] - ), + ( # check list input + [ + "./icepyx/core/is2ref.py", + "./icepyx/tests/is2class_query.py", + ], + sorted( + [ + "./icepyx/core/is2ref.py", + "./icepyx/tests/is2class_query.py", + ] ), ), - ( - "./icepyx/core", - "is2*.py", - ([], ["./icepyx/core/is2ref.py"]), + ( # check dir input + "./examples", + [ + "./examples/README.md", + ], ), - ( - "./icepyx", - "bogus_glob", - ( + ( # check filename string with glob pattern input + "./icepyx/**/is2*.py", + sorted( [ - "./icepyx/core", - "./icepyx/quest", - "./icepyx/quest/dataset_scripts", - "./icepyx/tests", - ], - [], + "./icepyx/core/is2ref.py", + "./icepyx/tests/is2class_query.py", + ] + ), + ), + ( # check filename string without glob pattern input + "./icepyx/core/is2ref.py", + [ + "./icepyx/core/is2ref.py", + ], + ), + ( # check s3 filename string + ( + "s3://nsidc-cumulus-prod-protected/ATLAS/" + "ATL03/006/2019/11/30/ATL03_20191130221008_09930503_006_01.h5" ), + [ + ( + "s3://nsidc-cumulus-prod-protected/ATLAS/" + "ATL03/006/2019/11/30/ATL03_20191130221008_09930503_006_01.h5" + ), + ], + ), + ( + "./icepyx/core/is2*.py", + ["./icepyx/core/is2ref.py"], ), ], ) -def test_check_run_fast_scandir(dir, fn_glob, expect): - (subfolders, files) = read._run_fast_scandir(dir, fn_glob) - assert (sorted(subfolders), sorted(files)) == expect +def test_parse_source(source, expect): + filelist = read._parse_source(source, glob_kwargs={"recursive": True}) + assert (sorted(filelist)) == expect @pytest.mark.parametrize( @@ -114,18 +102,3 @@ def test_get_track_type_str( exp_spot_dim_name, exp_spot_var_name, ) - - -# Best way to test this may be by including a small sample file with the repo -# (which can be used for testing some of the catalog/read-in functions as well) -# def test_invalid_filename_pattern_in_file(): -# ermesg = "Your input filename does not match the specified pattern." -# default_pattern = Read("/path/to/valid/source/file")._filename_pattern -# with pytest.raises(AssertionError, match=ermesg): -# read._validate_source('/valid/filepath/with/non-default/filename/pattern.h5', default_pattern) - -# def test_invalid_filename_pattern_in_dir(): -# ermesg = "None of your filenames match the specified pattern." 
-# default_pattern = Read("/path/to/valid/dir/")._filename_pattern -# with pytest.raises(AssertionError, match=ermesg): -# read._validate_source('/valid/dirpath/with/non-default/filename/pattern.h5', default_pattern) diff --git a/icepyx/tests/test_validate_inputs.py b/icepyx/tests/test_validate_inputs.py index 0b5f2f2eb..4d0ea0bd5 100644 --- a/icepyx/tests/test_validate_inputs.py +++ b/icepyx/tests/test_validate_inputs.py @@ -1,7 +1,4 @@ import pytest -import warnings -import datetime as dt -import numpy as np import icepyx.core.validate_inputs as val @@ -70,3 +67,35 @@ def test_tracks_valid(): val.tracks(1388) # check that warning message matches expected assert record[0].message.args[0] == expmsg + + +@pytest.mark.parametrize( + "filepath, expect", + [ + ("./", "./"), + ( + """s3://nsidc-cumulus-prod-protected/ATLAS/ + ATL03/006/2019/11/30/ATL03_20191130221008_09930503_006_01.h5""", + """s3://nsidc-cumulus-prod-protected/ATLAS/ + ATL03/006/2019/11/30/ATL03_20191130221008_09930503_006_01.h5""", + ), + ], +) +def test_check_s3bucket(filepath, expect): + verified_path = val.check_s3bucket(filepath) + assert verified_path == expect + + +def test_wrong_s3bucket(): + filepath = """s3://notnsidc-cumulus-prod-protected/ATLAS/ + ATL03/006/2019/11/30/ATL03_20191130221008_09930503_006_01.h5""" + + expmsg = ( + "s3 data being read from outside the NSIDC data bucket. Icepyx can " + "read this data, but available data lists may not be accurate." + ) + + with pytest.warns(UserWarning) as record: + val.check_s3bucket(filepath) + + assert record[0].message.args[0] == expmsg
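Note (not part of the patch): a minimal usage sketch of the new read._parse_source helper introduced above, mirroring the cases exercised in tests/test_read.py::test_parse_source. The /path/to/data paths and ATL06 filenames are hypothetical placeholders, and the behavior comments assume those paths exist locally; the s3 URI is the one used in the tests.

    # Reviewer sketch, not part of the patch: exercises read._parse_source with
    # placeholder paths ("/path/to/data", processed_ATL06_*.h5 are hypothetical).
    import icepyx.core.read as read

    # A list of file paths (str or pathlib.Path) is passed through unchanged.
    files = read._parse_source(
        [
            "/path/to/data/processed_ATL06_file1.h5",
            "/path/to/data/processed_ATL06_file2.h5",
        ]
    )

    # A directory is globbed for its immediate contents (a trailing "*" is appended).
    files = read._parse_source("/path/to/data/")

    # A glob string is expanded; glob_kwargs are forwarded to glob.glob, so "**"
    # patterns need {"recursive": True}.
    files = read._parse_source(
        "/path/to/data/**/processed_ATL06_*.h5", glob_kwargs={"recursive": True}
    )

    # An s3 URI is returned as a one-element list without globbing.
    files = read._parse_source(
        "s3://nsidc-cumulus-prod-protected/ATLAS/"
        "ATL03/006/2019/11/30/ATL03_20191130221008_09930503_006_01.h5"
    )

    # Any other input type raises TypeError; directories returned by the glob are
    # filtered out of the result, and an empty match raises KeyError.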