From 07afb1d57bb93d72c623f7a48d9e84956bf2f7dc Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Fri, 26 Jan 2024 15:29:28 -0500 Subject: [PATCH 01/22] convert _validate_source() to _parse_source() and update tests --- icepyx/core/read.py | 106 ++++++++++++++------------------------ icepyx/tests/test_read.py | 46 ++++++----------- 2 files changed, 53 insertions(+), 99 deletions(-) diff --git a/icepyx/core/read.py b/icepyx/core/read.py index e11015935..e4289de56 100644 --- a/icepyx/core/read.py +++ b/icepyx/core/read.py @@ -149,9 +149,44 @@ def _check_datasource(filepath): """ -# Dev note: function fully tested as currently written -def _validate_source(source): +def _parse_source(data_source, glob_kwargs) -> list: """ + Parse the data_source input based on type. + + Returns + ------- + filelist : list of str + List of granule (filenames) to be read in + """ + + if isinstance(data_source, list): + # if data_source is a list pass that directly to _filelist + filelist = data_source + elif os.path.isdir(data_source): + # if data_source is a directory glob search the directory and assign to _filelist + data_source = os.path.join(data_source, "*") + filelist = glob.glob(data_source, **glob_kwargs) + elif isinstance(data_source, str): + if data_source.startswith("s3"): + # if the string is an s3 path put it in the _filelist without globbing + filelist = [data_source] + else: + # data_source is a globable string + filelist = glob.glob(data_source, **glob_kwargs) + else: + raise TypeError( + "data_source should be a list of files, a directory, the path to a file, " + "or a glob string." + ) + + # Remove any directories from the list (these get generated during recursive + # glob search) + filelist = [f for f in filelist if not os.path.isdir(f)] + + return filelist + + """ + Check that the entered data source paths on the local file system are valid Currently, s3 data source paths are not validated. @@ -167,49 +202,6 @@ def _validate_source(source): return True -# Dev Note: function is tested (at least loosely) -def _run_fast_scandir(dir, fn_glob): - """ - Quickly scan nested directories to get a list of filenames that match the fn_glob string. - Modified from https://stackoverflow.com/a/59803793/2441026 - (faster than os.walk or glob methods, and allows filename matching in subdirectories). - - Parameters - ---------- - dir : str - full path to the input directory - - fn_glob : str - glob-style filename pattern - - Outputs - ------- - subfolders : list - list of strings of all nested subdirectories - - files : list - list of strings containing full paths to each file matching the filename pattern - """ - - subfolders, files = [], [] - - for f in os.scandir(dir): - if any(f.name.startswith(s) for s in ["__", "."]): - continue - if f.is_dir(): - subfolders.append(f.path) - if f.is_file(): - if fnmatch.fnmatch(f.name, fn_glob): - files.append(f.path) - - for dir in list(subfolders): - sf, f = _run_fast_scandir(dir, fn_glob) - subfolders.extend(sf) - files.extend(f) - - return subfolders, files - - # Need to post on intake's page to see if this would be a useful contribution... # https://github.com/intake/intake/blob/0.6.4/intake/source/utils.py#L216 def _pattern_to_glob(pattern): @@ -370,29 +362,7 @@ def __init__( "Please use the `data_source` argument to specify your dataset instead." ) - if isinstance(data_source, list): - # if data_source is a list pass that directly to _filelist - self._filelist = data_source - elif os.path.isdir(data_source): - # if data_source is a directory glob search the directory and assign to _filelist - data_source = os.path.join(data_source, "*") - self._filelist = glob.glob(data_source, **glob_kwargs) - elif isinstance(data_source, str): - if data_source.startswith("s3"): - # if the string is an s3 path put it in the _filelist without globbing - self._filelist = [data_source] - else: - # data_source is a globable string - self._filelist = glob.glob(data_source, **glob_kwargs) - else: - raise TypeError( - "data_source should be a list of files, a directory, the path to a file, " - "or a glob string." - ) - - # Remove any directories from the list (these get generated during recursive - # glob search) - self._filelist = [f for f in self._filelist if not os.path.isdir(f)] + self._filelist = _parse_source(data_source, glob_kwargs) # Create a dictionary of the products as read from the metadata product_dict = {} diff --git a/icepyx/tests/test_read.py b/icepyx/tests/test_read.py index 67b29b598..147472593 100644 --- a/icepyx/tests/test_read.py +++ b/icepyx/tests/test_read.py @@ -50,48 +50,32 @@ def test_validate_source_str_not_a_dir_or_file(): "dir, fn_glob, expect", [ ( - "./icepyx/", + "./icepyx/**/", "is2*.py", - ( - sorted( - [ - "./icepyx/core", - "./icepyx/quest", - "./icepyx/quest/dataset_scripts", - "./icepyx/tests", - ] - ), - sorted( - [ - "./icepyx/core/is2ref.py", - "./icepyx/tests/is2class_query.py", - ] - ), + sorted( + [ + "./icepyx/core/is2ref.py", + "./icepyx/tests/is2class_query.py", + ] ), ), ( - "./icepyx/core", + "./icepyx/core/", "is2*.py", - ([], ["./icepyx/core/is2ref.py"]), + ["./icepyx/core/is2ref.py"], ), ( - "./icepyx", + "./icepyx/", "bogus_glob", - ( - [ - "./icepyx/core", - "./icepyx/quest", - "./icepyx/quest/dataset_scripts", - "./icepyx/tests", - ], - [], - ), + [], ), ], ) -def test_check_run_fast_scandir(dir, fn_glob, expect): - (subfolders, files) = read._run_fast_scandir(dir, fn_glob) - assert (sorted(subfolders), sorted(files)) == expect +def test_parse_source(dir, fn_glob, expect): + filelist = read._parse_source(dir + fn_glob, glob_kwargs={"recursive": True}) + print(filelist) + print(expect) + assert (sorted(filelist)) == expect @pytest.mark.parametrize( From 615f7a244d902897762bb9843320c5fe2a90cc27 Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Fri, 26 Jan 2024 15:42:42 -0500 Subject: [PATCH 02/22] add Path as accepted data_source type to read --- icepyx/core/read.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/icepyx/core/read.py b/icepyx/core/read.py index e4289de56..70642234e 100644 --- a/icepyx/core/read.py +++ b/icepyx/core/read.py @@ -159,6 +159,8 @@ def _parse_source(data_source, glob_kwargs) -> list: List of granule (filenames) to be read in """ + from pathlib import Path + if isinstance(data_source, list): # if data_source is a list pass that directly to _filelist filelist = data_source @@ -166,7 +168,7 @@ def _parse_source(data_source, glob_kwargs) -> list: # if data_source is a directory glob search the directory and assign to _filelist data_source = os.path.join(data_source, "*") filelist = glob.glob(data_source, **glob_kwargs) - elif isinstance(data_source, str): + elif isinstance(data_source, str) or isinstance(data_source, Path): if data_source.startswith("s3"): # if the string is an s3 path put it in the _filelist without globbing filelist = [data_source] @@ -274,8 +276,8 @@ class Read(EarthdataAuthMixin): Parameters ---------- - data_source : string, List - A string or list which specifies the files to be read. + data_source : string, Path, List + A string, pathlib.Path object, or list which specifies the files to be read. The string can be either: 1) the path of a single file 2) the path to a directory or From 9ef66a9b9f507118695796d49ac399755c43a781 Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Fri, 26 Jan 2024 15:45:15 -0500 Subject: [PATCH 03/22] remove _check_datasource function (for deprecated keyword inputs) --- icepyx/core/read.py | 65 --------------------------------------- icepyx/tests/test_read.py | 42 +++++++++---------------- 2 files changed, 15 insertions(+), 92 deletions(-) diff --git a/icepyx/core/read.py b/icepyx/core/read.py index 70642234e..949e28de0 100644 --- a/icepyx/core/read.py +++ b/icepyx/core/read.py @@ -100,55 +100,6 @@ def _get_track_type_str(grp_path) -> (str, str, str): return track_str, spot_dim_name, spot_var_name -# Dev note: function fully tested (except else, which don't know how to get to) -def _check_datasource(filepath): - """ - Determine if the input is from a local system or is an s3 bucket. - Then, validate the inputs (for those on the local system; s3 sources are not validated currently) - """ - - from pathlib import Path - - import fsspec - from fsspec.implementations.local import LocalFileSystem - - source_types = ["is2_local", "is2_s3"] - - if not isinstance(filepath, Path) and not isinstance(filepath, str): - raise TypeError("filepath must be a string or Path") - - fsmap = fsspec.get_mapper(str(filepath)) - output_fs = fsmap.fs - - if "s3" in output_fs.protocol: - return source_types[1] - elif isinstance(output_fs, LocalFileSystem): - assert _validate_source(filepath) - return source_types[0] - else: - raise ValueError("Could not confirm the datasource type.") - - """ - Could also use: os.path.splitext(f.name)[1].lower() to get file extension - - If ultimately want to handle mixed types, save the valid paths in a dict with "s3" or "local" as the keys and the list of the files as the values. - Then the dict can also contain a catalog key with a dict of catalogs for each of those types of inputs ("s3" or "local") - In general, the issue we'll run into with multiple files is going to be merging during the read in, - so it could be beneficial to not hide this too much and mandate users handle this intentionally outside the read in itself. - - this function was derived with some of the following resources, based on echopype - https://github.com/OSOceanAcoustics/echopype/blob/ab5128fb8580f135d875580f0469e5fba3193b84/echopype/utils/io.py - - https://filesystem-spec.readthedocs.io/en/latest/api.html?highlight=get_map#fsspec.spec.AbstractFileSystem.glob - - https://filesystem-spec.readthedocs.io/en/latest/_modules/fsspec/implementations/local.html - - https://github.com/OSOceanAcoustics/echopype/blob/ab5128fb8580f135d875580f0469e5fba3193b84/echopype/convert/api.py#L380 - - https://echopype.readthedocs.io/en/stable/convert.html - """ - - def _parse_source(data_source, glob_kwargs) -> list: """ Parse the data_source input based on type. @@ -187,22 +138,6 @@ def _parse_source(data_source, glob_kwargs) -> list: return filelist - """ - - Check that the entered data source paths on the local file system are valid - - Currently, s3 data source paths are not validated. - """ - - # acceptable inputs (for now) are a single file or directory - # would ultimately like to make a Path (from pathlib import Path; isinstance(source, Path)) an option - # see https://github.com/OSOceanAcoustics/echopype/blob/ab5128fb8580f135d875580f0469e5fba3193b84/echopype/utils/io.py#L82 - assert isinstance(source, str), "You must enter your input as a string." - assert ( - os.path.isdir(source) is True or os.path.isfile(source) is True - ), "Your data source string is not a valid data source." - return True - # Need to post on intake's page to see if this would be a useful contribution... # https://github.com/intake/intake/blob/0.6.4/intake/source/utils.py#L216 diff --git a/icepyx/tests/test_read.py b/icepyx/tests/test_read.py index 147472593..2047357f2 100644 --- a/icepyx/tests/test_read.py +++ b/icepyx/tests/test_read.py @@ -4,33 +4,21 @@ import icepyx.core.read as read -def test_check_datasource_type(): - ermesg = "filepath must be a string or Path" - with pytest.raises(TypeError, match=ermesg): - read._check_datasource(246) - - -@pytest.mark.parametrize( - "filepath, expect", - [ - ("./", "is2_local"), - ( - """s3://nsidc-cumulus-prod-protected/ATLAS/ - ATL03/006/2019/11/30/ATL03_20191130221008_09930503_006_01.h5""", - "is2_s3", - ), - ], -) -def test_check_datasource(filepath, expect): - source_type = read._check_datasource(filepath) - assert source_type == expect - - -# not sure what I could enter here would get to the else... -# def test_unknown_datasource_type(): -# ermesg = "Could not confirm the datasource type." -# with pytest.raises(ValueError, match=ermesg): -# read._check_datasource("") +# use as basis for test s3 function +# @pytest.mark.parametrize( +# "filepath, expect", +# [ +# ("./", "is2_local"), +# ( +# """s3://nsidc-cumulus-prod-protected/ATLAS/ +# ATL03/006/2019/11/30/ATL03_20191130221008_09930503_006_01.h5""", +# "is2_s3", +# ), +# ], +# ) +# def test_check_datasource(filepath, expect): +# source_type = read._check_datasource(filepath) +# assert source_type == expect def test_validate_source_str_given_as_list(): From a031a7a201575dc5df033dde72db6eb27d8b751e Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Fri, 26 Jan 2024 16:57:45 -0500 Subject: [PATCH 04/22] add check_s3_bucket function tests --- icepyx/tests/test_read.py | 17 --------------- icepyx/tests/test_validate_inputs.py | 32 ++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/icepyx/tests/test_read.py b/icepyx/tests/test_read.py index 2047357f2..8ff200632 100644 --- a/icepyx/tests/test_read.py +++ b/icepyx/tests/test_read.py @@ -4,23 +4,6 @@ import icepyx.core.read as read -# use as basis for test s3 function -# @pytest.mark.parametrize( -# "filepath, expect", -# [ -# ("./", "is2_local"), -# ( -# """s3://nsidc-cumulus-prod-protected/ATLAS/ -# ATL03/006/2019/11/30/ATL03_20191130221008_09930503_006_01.h5""", -# "is2_s3", -# ), -# ], -# ) -# def test_check_datasource(filepath, expect): -# source_type = read._check_datasource(filepath) -# assert source_type == expect - - def test_validate_source_str_given_as_list(): ermesg = "You must enter your input as a string." with pytest.raises(AssertionError, match=ermesg): diff --git a/icepyx/tests/test_validate_inputs.py b/icepyx/tests/test_validate_inputs.py index 0b5f2f2eb..b776a7fc8 100644 --- a/icepyx/tests/test_validate_inputs.py +++ b/icepyx/tests/test_validate_inputs.py @@ -70,3 +70,35 @@ def test_tracks_valid(): val.tracks(1388) # check that warning message matches expected assert record[0].message.args[0] == expmsg + + +@pytest.mark.parametrize( + "filepath, expect", + [ + ("./", "./"), + ( + """s3://nsidc-cumulus-prod-protected/ATLAS/ + ATL03/006/2019/11/30/ATL03_20191130221008_09930503_006_01.h5""", + """s3://nsidc-cumulus-prod-protected/ATLAS/ + ATL03/006/2019/11/30/ATL03_20191130221008_09930503_006_01.h5""", + ), + ], +) +def test_check_s3bucket(filepath, expect): + verified_path = val.check_s3bucket(filepath) + assert verified_path == expect + + +def test_wrong_s3bucket(): + filepath = """s3://notnsidc-cumulus-prod-protected/ATLAS/ + ATL03/006/2019/11/30/ATL03_20191130221008_09930503_006_01.h5""" + + expmsg = ( + "s3 data being read from outside the NSIDC data bucket. Icepyx can " + "read this data, but available data lists may not be accurate." + ) + + with pytest.warns(UserWarning) as record: + val.check_s3bucket(filepath) + + assert record[0].message.args[0] == expmsg From 465895e7ff77f8aded790fffba16f058e9dc8e18 Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Mon, 29 Jan 2024 16:22:32 -0500 Subject: [PATCH 05/22] remove pattern_to_glob function --- icepyx/core/read.py | 51 --------------------------------------- icepyx/tests/test_read.py | 2 -- 2 files changed, 53 deletions(-) diff --git a/icepyx/core/read.py b/icepyx/core/read.py index 949e28de0..7582dbd0a 100644 --- a/icepyx/core/read.py +++ b/icepyx/core/read.py @@ -139,57 +139,6 @@ def _parse_source(data_source, glob_kwargs) -> list: return filelist -# Need to post on intake's page to see if this would be a useful contribution... -# https://github.com/intake/intake/blob/0.6.4/intake/source/utils.py#L216 -def _pattern_to_glob(pattern): - """ - Adapted from intake.source.utils.path_to_glob to convert a path as pattern into a glob style path - that uses the pattern's indicated number of '?' instead of '*' where an int was specified. - - Returns pattern if pattern is not a string. - - Parameters - ---------- - pattern : str - Path as pattern optionally containing format_strings - - Returns - ------- - glob_path : str - Path with int format strings replaced with the proper number of '?' and '*' otherwise. - - Examples - -------- - >>> _pattern_to_glob('{year}/{month}/{day}.csv') - '*/*/*.csv' - >>> _pattern_to_glob('{year:4}/{month:2}/{day:2}.csv') - '????/??/??.csv' - >>> _pattern_to_glob('data/{year:4}{month:02}{day:02}.csv') - 'data/????????.csv' - >>> _pattern_to_glob('data/*.csv') - 'data/*.csv' - """ - from string import Formatter - - if not isinstance(pattern, str): - return pattern - - fmt = Formatter() - glob_path = "" - # prev_field_name = None - for literal_text, field_name, format_specs, _ in fmt.parse(format_string=pattern): - glob_path += literal_text - if field_name and (glob_path != "*"): - try: - glob_path += "?" * int(format_specs) - except ValueError: - glob_path += "*" - # alternatively, you could use bits=utils._get_parts_of_format_string(resolved_string, literal_texts, format_specs) - # and then use len(bits[i]) to get the length of each format_spec - # print(glob_path) - return glob_path - - def _confirm_proceed(): """ Ask the user if they wish to proceed with processing. If 'y', or 'yes', then continue. Any diff --git a/icepyx/tests/test_read.py b/icepyx/tests/test_read.py index 8ff200632..439b2950c 100644 --- a/icepyx/tests/test_read.py +++ b/icepyx/tests/test_read.py @@ -44,8 +44,6 @@ def test_validate_source_str_not_a_dir_or_file(): ) def test_parse_source(dir, fn_glob, expect): filelist = read._parse_source(dir + fn_glob, glob_kwargs={"recursive": True}) - print(filelist) - print(expect) assert (sorted(filelist)) == expect From 0635d0c59cbc7a9ab2f77d8950c591e74b67f159 Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Mon, 29 Jan 2024 17:20:27 -0500 Subject: [PATCH 06/22] add more tests for all cases in _parse_source --- icepyx/tests/test_read.py | 55 +++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/icepyx/tests/test_read.py b/icepyx/tests/test_read.py index 439b2950c..fe8d6406f 100644 --- a/icepyx/tests/test_read.py +++ b/icepyx/tests/test_read.py @@ -18,11 +18,13 @@ def test_validate_source_str_not_a_dir_or_file(): @pytest.mark.parametrize( - "dir, fn_glob, expect", + "source, expect", [ - ( - "./icepyx/**/", - "is2*.py", + ( # check list input + [ + "./icepyx/core/is2ref.py", + "./icepyx/tests/is2class_query.py", + ], sorted( [ "./icepyx/core/is2ref.py", @@ -30,20 +32,51 @@ def test_validate_source_str_not_a_dir_or_file(): ] ), ), + ( # check dir input + "./examples", + [ + "./examples/README.md", + ], + ), + ( # check filename string with glob pattern input + "./icepyx/**/is2*.py", + sorted( + [ + "./icepyx/core/is2ref.py", + "./icepyx/tests/is2class_query.py", + ] + ), + ), + ( # check filename string without glob pattern input + "./icepyx/core/is2ref.py", + [ + "./icepyx/core/is2ref.py", + ], + ), + ( # check s3 filename string + ( + "s3://nsidc-cumulus-prod-protected/ATLAS/" + "ATL03/006/2019/11/30/ATL03_20191130221008_09930503_006_01.h5" + ), + [ + ( + "s3://nsidc-cumulus-prod-protected/ATLAS/" + "ATL03/006/2019/11/30/ATL03_20191130221008_09930503_006_01.h5" + ), + ], + ), ( - "./icepyx/core/", - "is2*.py", + "./icepyx/core/is2*.py", ["./icepyx/core/is2ref.py"], ), - ( - "./icepyx/", - "bogus_glob", + ( # check bad input + "./icepyx/bogus_glob", [], ), ], ) -def test_parse_source(dir, fn_glob, expect): - filelist = read._parse_source(dir + fn_glob, glob_kwargs={"recursive": True}) +def test_parse_source(source, expect): + filelist = read._parse_source(source, glob_kwargs={"recursive": True}) assert (sorted(filelist)) == expect From 12e6f420b99019af4462e70de8df92cd17c59cfb Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Mon, 29 Jan 2024 17:21:14 -0500 Subject: [PATCH 07/22] remove unneeded commented tests --- icepyx/tests/test_read.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/icepyx/tests/test_read.py b/icepyx/tests/test_read.py index fe8d6406f..d6431cc67 100644 --- a/icepyx/tests/test_read.py +++ b/icepyx/tests/test_read.py @@ -100,18 +100,3 @@ def test_get_track_type_str( exp_spot_dim_name, exp_spot_var_name, ) - - -# Best way to test this may be by including a small sample file with the repo -# (which can be used for testing some of the catalog/read-in functions as well) -# def test_invalid_filename_pattern_in_file(): -# ermesg = "Your input filename does not match the specified pattern." -# default_pattern = Read("/path/to/valid/source/file")._filename_pattern -# with pytest.raises(AssertionError, match=ermesg): -# read._validate_source('/valid/filepath/with/non-default/filename/pattern.h5', default_pattern) - -# def test_invalid_filename_pattern_in_dir(): -# ermesg = "None of your filenames match the specified pattern." -# default_pattern = Read("/path/to/valid/dir/")._filename_pattern -# with pytest.raises(AssertionError, match=ermesg): -# read._validate_source('/valid/dirpath/with/non-default/filename/pattern.h5', default_pattern) From f75fca5ac6b4014f1f6ba537798bb9c552d2ce4e Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Mon, 29 Jan 2024 17:42:24 -0500 Subject: [PATCH 08/22] update bad input type test --- icepyx/core/read.py | 5 +++-- icepyx/tests/test_read.py | 20 +++++++++----------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/icepyx/core/read.py b/icepyx/core/read.py index 7582dbd0a..367843f3d 100644 --- a/icepyx/core/read.py +++ b/icepyx/core/read.py @@ -100,9 +100,9 @@ def _get_track_type_str(grp_path) -> (str, str, str): return track_str, spot_dim_name, spot_var_name -def _parse_source(data_source, glob_kwargs) -> list: +def _parse_source(data_source, glob_kwargs=None) -> list: """ - Parse the data_source input based on type. + Parse the user's data_source input based on type. Returns ------- @@ -113,6 +113,7 @@ def _parse_source(data_source, glob_kwargs) -> list: from pathlib import Path if isinstance(data_source, list): + assert [isinstance(f, (str, Path)) for f in data_source] # if data_source is a list pass that directly to _filelist filelist = data_source elif os.path.isdir(data_source): diff --git a/icepyx/tests/test_read.py b/icepyx/tests/test_read.py index d6431cc67..50182e37c 100644 --- a/icepyx/tests/test_read.py +++ b/icepyx/tests/test_read.py @@ -4,17 +4,15 @@ import icepyx.core.read as read -def test_validate_source_str_given_as_list(): - ermesg = "You must enter your input as a string." - with pytest.raises(AssertionError, match=ermesg): - read._validate_source(["/path/to/valid/ATL06_file.py"]) - - -def test_validate_source_str_not_a_dir_or_file(): - ermesg = "Your data source string is not a valid data source." - with pytest.raises(AssertionError, match=ermesg): - read._validate_source("./fake/dirpath") - read._validate_source("./fake_file.h5") +# note isdir will issue a TypeError if a tuple is passed +def test_parse_source_bad_input_type(): + ermesg = ( + "data_source should be a list of files, a directory, the path to a file, " + "or a glob string." + ) + with pytest.raises(TypeError, match=ermesg): + read._parse_source(150) + read._parse_source({"myfiles": "./my_valid_path/file.h5"}) @pytest.mark.parametrize( From f1fa1b32e4289a60a4d7c5895249ee0853c6b690 Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Mon, 29 Jan 2024 17:51:22 -0500 Subject: [PATCH 09/22] remove some other notes and old comments --- icepyx/core/read.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/icepyx/core/read.py b/icepyx/core/read.py index 367843f3d..c0e8fa8d8 100644 --- a/icepyx/core/read.py +++ b/icepyx/core/read.py @@ -635,8 +635,6 @@ def _build_dataset_template(self, file): It may be possible to expand this function to provide multiple templates. """ - # NOTE: use the hdf5 library to grab the attr for the product specifier - # can ultimately then use it to check against user specified one or merge strategies (or to return a list of ds) is2ds = xr.Dataset( coords=dict( From 6a1bcedb2c98d53088c94e6b2ef9469c8ba5293c Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Mon, 29 Jan 2024 18:20:59 -0500 Subject: [PATCH 10/22] remove unneeded import --- icepyx/tests/test_read.py | 1 - 1 file changed, 1 deletion(-) diff --git a/icepyx/tests/test_read.py b/icepyx/tests/test_read.py index 50182e37c..c6fb50ea7 100644 --- a/icepyx/tests/test_read.py +++ b/icepyx/tests/test_read.py @@ -1,6 +1,5 @@ import pytest -from icepyx.core.read import Read import icepyx.core.read as read From 2ba4cd3c3b2b55cdb0e959b8f6114b7053b939d6 Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Mon, 29 Jan 2024 19:06:22 -0500 Subject: [PATCH 11/22] shorten a few lines for linting --- icepyx/core/read.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/icepyx/core/read.py b/icepyx/core/read.py index c0e8fa8d8..d40a41e16 100644 --- a/icepyx/core/read.py +++ b/icepyx/core/read.py @@ -36,7 +36,8 @@ def _make_np_datetime(df, keyword): Example ------- - >>> ds = xr.Dataset({"time": ("time_idx", [b'2019-01-11T05:26:31.323722Z'])}, coords={"time_idx": [0]}) + >>> ds = xr.Dataset({"time": ("time_idx", [b'2019-01-11T05:26:31.323722Z'])}, + >>> coords={"time_idx": [0]}) >>> _make_np_datetime(ds, "time") Dimensions: (time_idx: 1) @@ -48,7 +49,8 @@ def _make_np_datetime(df, keyword): """ if df[keyword].str.endswith("Z"): - # manually remove 'Z' from datetime to allow conversion to np.datetime64 object (support for timezones is deprecated and causes a seg fault) + # manually remove 'Z' from datetime to allow conversion to np.datetime64 object + # (support for timezones is deprecated and causes a seg fault) df.update({keyword: df[keyword].str[:-1].astype(np.datetime64)}) else: @@ -170,7 +172,8 @@ class Read(EarthdataAuthMixin): The List must be a list of strings, each of which is the path of a single file. glob_kwargs : dict, default {} - Additional arguments to be passed into the [glob.glob()](https://docs.python.org/3/library/glob.html#glob.glob)function + Additional arguments to be passed into the + [glob.glob()](https://docs.python.org/3/library/glob.html#glob.glob)function out_obj_type : object, default xarray.Dataset The desired format for the data to be read in. @@ -311,7 +314,8 @@ def __init__( def vars(self): """ Return the variables object associated with the data being read in. - This instance is generated from the source file or first file in a list of input files (when source is a directory). + This instance is generated from the source file or first file in a list of input files + (when source is a directory). See Also -------- From 420acb3c850f3ea1011af56683d3400e1337f3cd Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Tue, 30 Jan 2024 11:03:24 -0500 Subject: [PATCH 12/22] use ellipses in two-line docstring code example --- icepyx/core/read.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/icepyx/core/read.py b/icepyx/core/read.py index d40a41e16..f2cbd53ad 100644 --- a/icepyx/core/read.py +++ b/icepyx/core/read.py @@ -37,7 +37,7 @@ def _make_np_datetime(df, keyword): Example ------- >>> ds = xr.Dataset({"time": ("time_idx", [b'2019-01-11T05:26:31.323722Z'])}, - >>> coords={"time_idx": [0]}) + ... coords={"time_idx": [0]}) >>> _make_np_datetime(ds, "time") Dimensions: (time_idx: 1) From 487f5a500256ad1e4ae255c34ea1396025527f18 Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Thu, 1 Feb 2024 13:55:59 -0500 Subject: [PATCH 13/22] remove previously removed modules from docs --- doc/source/user_guide/documentation/components.rst | 8 -------- doc/source/user_guide/documentation/query.rst | 1 - 2 files changed, 9 deletions(-) diff --git a/doc/source/user_guide/documentation/components.rst b/doc/source/user_guide/documentation/components.rst index dea41a970..93cfb0397 100644 --- a/doc/source/user_guide/documentation/components.rst +++ b/doc/source/user_guide/documentation/components.rst @@ -26,14 +26,6 @@ granules :members: :undoc-members: :show-inheritance: - -is2cat ------- - -.. automodule:: icepyx.core.is2cat - :members: - :undoc-members: - :show-inheritance: is2ref ------ diff --git a/doc/source/user_guide/documentation/query.rst b/doc/source/user_guide/documentation/query.rst index 168986548..9973c2dab 100644 --- a/doc/source/user_guide/documentation/query.rst +++ b/doc/source/user_guide/documentation/query.rst @@ -23,7 +23,6 @@ Attributes Query.cycles Query.dates Query.end_time - Query.file_vars Query.granules Query.order_vars Query.product From 86a4ad70db514c82275a29142492041ceac604e1 Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Mon, 5 Feb 2024 22:04:14 -0500 Subject: [PATCH 14/22] fix longline errors except for deprecations --- icepyx/core/read.py | 47 +++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/icepyx/core/read.py b/icepyx/core/read.py index f2cbd53ad..03d9228a1 100644 --- a/icepyx/core/read.py +++ b/icepyx/core/read.py @@ -1,4 +1,3 @@ -import fnmatch import glob import os import sys @@ -208,7 +207,8 @@ class Read(EarthdataAuthMixin): Reading all files in a directory >>> ipx.Read('/path/to/data/') # doctest: +SKIP - Reading files that match a particular pattern (here, all .h5 files that start with `processed_ATL06_`). + Reading files that match a particular pattern + (here, all .h5 files that start with `processed_ATL06_`). >>> ipx.Read('/path/to/data/processed_ATL06_*.h5') # doctest: +SKIP Reading a specific list of files @@ -368,7 +368,8 @@ def _add_vars_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict): the second list contains the second portion of the group name, etc. "none" is used to fill in where paths are shorter than the longest path. wanted_dict : dict - Dictionary with variable names as keys and a list of group + variable paths containing those variables as values. + Dictionary with variable names as keys and a list of group + + variable paths containing those variables as values. Returns ------- @@ -461,7 +462,8 @@ def _add_vars_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict): ) ) - # for the subgoups where there is 1d delta time data, make sure that the cycle number is still a coordinate for merging + # for the subgoups where there is 1d delta time data, + # make sure that the cycle number is still a coordinate for merging try: ds = ds.assign_coords( { @@ -504,14 +506,16 @@ def _combine_nested_vars(is2ds, ds, grp_path, wanted_dict): grp_path : str hdf5 group path read into ds wanted_dict : dict - Dictionary with variable names as keys and a list of group + variable paths containing those variables as values. + Dictionary with variable names as keys and a list of group + + variable paths containing those variables as values. Returns ------- Xarray Dataset with variables from the ds variable group added. """ - # Dev Goal: improve this type of iterating to minimize amount of looping required. Would a path handling library be useful here? + # Dev Goal: improve this type of iterating to minimize amount of looping required. + # Would a path handling library be useful here? grp_spec_vars = [ k for k, v in wanted_dict.items() if any(f"{grp_path}/{k}" in x for x in v) ] @@ -543,7 +547,8 @@ def _combine_nested_vars(is2ds, ds, grp_path, wanted_dict): def load(self): """ - Create a single Xarray Dataset containing the data from one or more files and/or ground tracks. + Create a single Xarray Dataset containing the data from one or more + files and/or ground tracks. Uses icepyx's ICESat-2 data product awareness and Xarray's `combine_by_coords` function. All items in the wanted variables list will be loaded from the files into memory. @@ -657,7 +662,8 @@ def _read_single_grp(self, file, grp_path): ---------- file : str Full path to ICESat-2 data file. - Currently tested for locally downloaded files; untested but hopefully works for s3 stored cloud files. + Currently tested for locally downloaded files; + untested but hopefully works for s3 stored cloud files. grp_path : str Full string to a variable group. E.g. 'gt1l/land_ice_segments' @@ -677,23 +683,27 @@ def _read_single_grp(self, file, grp_path): def _build_single_file_dataset(self, file, groups_list): """ - Create a single xarray dataset with all of the wanted variables/groups from the wanted var list for a single data file/url. + Create a single xarray dataset with all of the wanted variables/groups + from the wanted var list for a single data file/url. Parameters ---------- file : str Full path to ICESat-2 data file. - Currently tested for locally downloaded files; untested but hopefully works for s3 stored cloud files. + Currently tested for locally downloaded files; + untested but hopefully works for s3 stored cloud files. groups_list : list of strings List of full paths to data variables within the file. - e.g. ['orbit_info/sc_orient', 'gt1l/land_ice_segments/h_li', 'gt1l/land_ice_segments/latitude', 'gt1l/land_ice_segments/longitude'] + e.g. ['orbit_info/sc_orient', 'gt1l/land_ice_segments/h_li', + 'gt1l/land_ice_segments/latitude', 'gt1l/land_ice_segments/longitude'] Returns ------- Xarray Dataset """ - # DEVNOTE: if and elif does not actually apply wanted variable list, and has not been tested for merging multiple files into one ds + # DEVNOTE: if and elif does not actually apply wanted variable list, + # and has not been tested for merging multiple files into one ds # if a gridded product # TODO: all products need to be tested, and quicklook products added or explicitly excluded # Level 3b, gridded (netcdf): ATL14, 15, 16, 17, 18, 19, 20, 21 @@ -720,13 +730,14 @@ def _build_single_file_dataset(self, file, groups_list): ) wanted_groups_set = set(wanted_groups) - # orbit_info is used automatically as the first group path so the info is available for the rest of the groups + # orbit_info is used automatically as the first group path + # so the info is available for the rest of the groups # wanted_groups_set.remove("orbit_info") wanted_groups_set.remove("ancillary_data") # Note: the sorting is critical for datasets with highly nested groups wanted_groups_list = ["ancillary_data"] + sorted(wanted_groups_set) - # returns the wanted groups as a list of lists with group path string elements separated + # returns wanted groups as a list of lists with group path string elements separated _, wanted_groups_tiered = Variables.parse_var_list( groups_list, tiered=True, tiered_vars=True ) @@ -751,14 +762,15 @@ def _build_single_file_dataset(self, file, groups_list): groups_list, tiered=False ) wanted_groups_set = set(wanted_groups) - # orbit_info is used automatically as the first group path so the info is available for the rest of the groups + # orbit_info is used automatically as the first group path + # so the info is available for the rest of the groups wanted_groups_set.remove("orbit_info") wanted_groups_set.remove("ancillary_data") # Note: the sorting is critical for datasets with highly nested groups wanted_groups_list = ["orbit_info", "ancillary_data"] + sorted( wanted_groups_set ) - # returns the wanted groups as a list of lists with group path string elements separated + # returns wanted groups as a list of lists with group path string elements separated _, wanted_groups_tiered = Variables.parse_var_list( groups_list, tiered=True, tiered_vars=True ) @@ -771,7 +783,8 @@ def _build_single_file_dataset(self, file, groups_list): is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict ) - # if there are any deeper nested variables, get those so they have actual coordinates and add them + # if there are any deeper nested variables, + # get those so they have actual coordinates and add them # this may apply to (at a minimum): ATL08 if any(grp_path in grp_path2 for grp_path2 in wanted_groups_list): for grp_path2 in wanted_groups_list: From 734a55e79cbf3499d4e0e8769eb0bd133ea8f30f Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Mon, 5 Feb 2024 22:07:13 -0500 Subject: [PATCH 15/22] remove unused imports --- icepyx/tests/test_validate_inputs.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/icepyx/tests/test_validate_inputs.py b/icepyx/tests/test_validate_inputs.py index b776a7fc8..4d0ea0bd5 100644 --- a/icepyx/tests/test_validate_inputs.py +++ b/icepyx/tests/test_validate_inputs.py @@ -1,7 +1,4 @@ import pytest -import warnings -import datetime as dt -import numpy as np import icepyx.core.validate_inputs as val From 8fa662d4181758e10fff9d5bf45d45232940624c Mon Sep 17 00:00:00 2001 From: Rachel Wegener Date: Thu, 8 Feb 2024 15:23:11 +0000 Subject: [PATCH 16/22] move check 0 files to parse source --- icepyx/core/read.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/icepyx/core/read.py b/icepyx/core/read.py index 03d9228a1..398fee8bd 100644 --- a/icepyx/core/read.py +++ b/icepyx/core/read.py @@ -137,6 +137,13 @@ def _parse_source(data_source, glob_kwargs=None) -> list: # Remove any directories from the list (these get generated during recursive # glob search) filelist = [f for f in filelist if not os.path.isdir(f)] + + # Make sure a non-zero number of files were found + if len(filelist) == 0: + raise KeyError( + "No files found matching the specified `data_source`. Check your glob " + "string or file list." + ) return filelist @@ -261,6 +268,8 @@ def __init__( # If the path is an s3 path set the respective element of self.is_s3 to True if file_.startswith("s3"): self.is_s3[i] = True + # set up + if True in self.is_s3: auth = self.auth else: auth = None @@ -283,7 +292,7 @@ def __init__( ) _confirm_proceed() - # Raise warnings or errors for multiple products or products not matching the user-specified product + # Raise error if multiple products given all_products = list(set(product_dict.values())) if len(all_products) > 1: raise TypeError( @@ -291,14 +300,9 @@ def __init__( "Please provide a valid `data_source` parameter indicating files of a single " "product" ) - elif len(all_products) == 0: - raise TypeError( - "No files found matching the specified `data_source`. Check your glob " - "string or file list." - ) - else: - # Assign the identified product to the property - self._product = all_products[0] + + # Assign the identified product to the property + self._product = all_products[0] if out_obj_type is not None: print( From a9a46c5022ffe92d711b0a9ab9ee3e49e0c3fcdf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 8 Feb 2024 15:23:30 +0000 Subject: [PATCH 17/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- icepyx/core/read.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/icepyx/core/read.py b/icepyx/core/read.py index 398fee8bd..80caaf6d8 100644 --- a/icepyx/core/read.py +++ b/icepyx/core/read.py @@ -137,13 +137,13 @@ def _parse_source(data_source, glob_kwargs=None) -> list: # Remove any directories from the list (these get generated during recursive # glob search) filelist = [f for f in filelist if not os.path.isdir(f)] - + # Make sure a non-zero number of files were found if len(filelist) == 0: raise KeyError( - "No files found matching the specified `data_source`. Check your glob " - "string or file list." - ) + "No files found matching the specified `data_source`. Check your glob " + "string or file list." + ) return filelist @@ -268,7 +268,7 @@ def __init__( # If the path is an s3 path set the respective element of self.is_s3 to True if file_.startswith("s3"): self.is_s3[i] = True - # set up + # set up if True in self.is_s3: auth = self.auth else: @@ -300,7 +300,7 @@ def __init__( "Please provide a valid `data_source` parameter indicating files of a single " "product" ) - + # Assign the identified product to the property self._product = all_products[0] From 69717ad3c960f82f5378128d3a55a73466020d5f Mon Sep 17 00:00:00 2001 From: Rachel Wegener Date: Thu, 8 Feb 2024 15:26:22 +0000 Subject: [PATCH 18/22] remove a line I didn't mean to commit --- icepyx/core/read.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/icepyx/core/read.py b/icepyx/core/read.py index 398fee8bd..41293dcf6 100644 --- a/icepyx/core/read.py +++ b/icepyx/core/read.py @@ -268,8 +268,6 @@ def __init__( # If the path is an s3 path set the respective element of self.is_s3 to True if file_.startswith("s3"): self.is_s3[i] = True - # set up - if True in self.is_s3: auth = self.auth else: auth = None From c7cc03ed627fe22a37d51abeb8878cf51ccd634a Mon Sep 17 00:00:00 2001 From: Rachel Wegener Date: Thu, 8 Feb 2024 15:49:53 +0000 Subject: [PATCH 19/22] fix syntax error found while testing --- icepyx/core/is2ref.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/icepyx/core/is2ref.py b/icepyx/core/is2ref.py index c51c631be..66ceaec1c 100644 --- a/icepyx/core/is2ref.py +++ b/icepyx/core/is2ref.py @@ -378,8 +378,10 @@ def extract_product(filepath, auth=None): # ATL14 saves the short_name as an array ['ATL14'] product = product[0] product = _validate_product(product) - except KeyError: - raise "Unable to parse the product name from file metadata" + except KeyError as e: + raise Exception( + "Unable to parse the product name from file metadata" + ).with_traceback(e.__traceback__) # Close the file reader f.close() @@ -421,8 +423,10 @@ def extract_version(filepath, auth=None): if isinstance(version, bytes): version = version.decode() - except KeyError: - raise "Unable to parse the version from file metadata" + except KeyError as e: + raise Exception( + "Unable to parse the version from file metadata" + ).with_traceback(e.__traceback__) # Close the file reader f.close() From a64c5f99e4ac2569d75ecf8cd14a483b60da55ac Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Thu, 8 Feb 2024 15:53:28 +0000 Subject: [PATCH 20/22] GitHub action UML generation auto-update --- doc/source/user_guide/documentation/classes_dev_uml.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/documentation/classes_dev_uml.svg b/doc/source/user_guide/documentation/classes_dev_uml.svg index 09c112f5c..84fb2a450 100644 --- a/doc/source/user_guide/documentation/classes_dev_uml.svg +++ b/doc/source/user_guide/documentation/classes_dev_uml.svg @@ -235,7 +235,7 @@ Read -_filelist : list +_filelist _out_obj : Dataset _product _read_vars From cae97016e1ce5ba6796f5944e8c5ce146e975284 Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Thu, 8 Feb 2024 14:56:18 -0500 Subject: [PATCH 21/22] fix no files test --- icepyx/tests/test_read.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/icepyx/tests/test_read.py b/icepyx/tests/test_read.py index c6fb50ea7..20807c410 100644 --- a/icepyx/tests/test_read.py +++ b/icepyx/tests/test_read.py @@ -14,6 +14,15 @@ def test_parse_source_bad_input_type(): read._parse_source({"myfiles": "./my_valid_path/file.h5"}) +def test_parse_source_no_files(): + ermesg = ( + "No files found matching the specified `data_source`. Check your glob " + "string or file list." + ) + with pytest.raises(KeyError, match=ermesg): + read._parse_source("./icepyx/bogus_glob") + + @pytest.mark.parametrize( "source, expect", [ @@ -66,10 +75,6 @@ def test_parse_source_bad_input_type(): "./icepyx/core/is2*.py", ["./icepyx/core/is2ref.py"], ), - ( # check bad input - "./icepyx/bogus_glob", - [], - ), ], ) def test_parse_source(source, expect): From f8b7f0b592e0e33ece2e40cb2d699a4fabce826b Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Thu, 8 Feb 2024 15:57:00 -0500 Subject: [PATCH 22/22] update default glob_kwargs to dict --- icepyx/core/read.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/icepyx/core/read.py b/icepyx/core/read.py index ed8871637..8cc5afd21 100644 --- a/icepyx/core/read.py +++ b/icepyx/core/read.py @@ -101,7 +101,7 @@ def _get_track_type_str(grp_path) -> (str, str, str): return track_str, spot_dim_name, spot_var_name -def _parse_source(data_source, glob_kwargs=None) -> list: +def _parse_source(data_source, glob_kwargs={}) -> list: """ Parse the user's data_source input based on type.