clean up the read module after adding cloud-read capabilities #502

Merged
merged 26 commits into development from clean-read on Feb 8, 2024
Changes from 12 commits
Commits (26)
07afb1d convert _validate_source() to _parse_source() and update tests (JessicaS11, Jan 26, 2024)
615f7a2 add Path as accepted data_source type to read (JessicaS11, Jan 26, 2024)
9ef66a9 remove _check_datasource function (for deprecated keyword inputs) (JessicaS11, Jan 26, 2024)
a031a7a add check_s3_bucket function tests (JessicaS11, Jan 26, 2024)
465895e remove pattern_to_glob function (JessicaS11, Jan 29, 2024)
0635d0c add more tests for all cases in _parse_source (JessicaS11, Jan 29, 2024)
12e6f42 remove unneeded commented tests (JessicaS11, Jan 29, 2024)
f75fca5 update bad input type test (JessicaS11, Jan 29, 2024)
f1fa1b3 remove some other notes and old comments (JessicaS11, Jan 29, 2024)
6a1bced remove unneeded import (JessicaS11, Jan 29, 2024)
2ba4cd3 shorten a few lines for linting (JessicaS11, Jan 30, 2024)
420acb3 use ellipses in two-line docstring code example (JessicaS11, Jan 30, 2024)
487f5a5 remove previously removed modules from docs (JessicaS11, Feb 1, 2024)
737005a Merge branch 'development' into clean-read (JessicaS11, Feb 2, 2024)
86a4ad7 fix longline errors except for deprecations (JessicaS11, Feb 6, 2024)
734a55e remove unused imports (JessicaS11, Feb 6, 2024)
db4ccdb Merge branch 'development' into clean-read (JessicaS11, Feb 6, 2024)
8fa662d move check 0 files to parse source (rwegener2, Feb 8, 2024)
a9a46c5 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Feb 8, 2024)
69717ad remove a line I didn't mean to commit (rwegener2, Feb 8, 2024)
8cb9cd9 fix merge conflict (rwegener2, Feb 8, 2024)
c7cc03e fix syntax error found while testing (rwegener2, Feb 8, 2024)
a64c5f9 GitHub action UML generation auto-update (rwegener2, Feb 8, 2024)
cae9701 fix no files test (JessicaS11, Feb 8, 2024)
839d433 Merge branch 'development' into clean-read (JessicaS11, Feb 8, 2024)
f8b7f0b update default glob_kwargs to dict (JessicaS11, Feb 8, 2024)
223 changes: 41 additions & 182 deletions icepyx/core/read.py
@@ -36,7 +36,8 @@

Example
-------
>>> ds = xr.Dataset({"time": ("time_idx", [b'2019-01-11T05:26:31.323722Z'])}, coords={"time_idx": [0]})
>>> ds = xr.Dataset({"time": ("time_idx", [b'2019-01-11T05:26:31.323722Z'])},
... coords={"time_idx": [0]})
>>> _make_np_datetime(ds, "time")
<xarray.Dataset>
Dimensions: (time_idx: 1)
@@ -48,7 +49,8 @@
"""

if df[keyword].str.endswith("Z"):
# manually remove 'Z' from datetime to allow conversion to np.datetime64 object (support for timezones is deprecated and causes a seg fault)
# manually remove 'Z' from datetime to allow conversion to np.datetime64 object
# (support for timezones is deprecated and causes a seg fault)
df.update({keyword: df[keyword].str[:-1].astype(np.datetime64)})

else:
@@ -100,165 +102,44 @@
return track_str, spot_dim_name, spot_var_name


# Dev note: function fully tested (except the else branch, which we don't know how to reach)
def _check_datasource(filepath):
def _parse_source(data_source, glob_kwargs={}) -> list:
"""
Determine if the input is from a local system or an s3 bucket.
Then validate the input (local sources only; s3 sources are not currently validated).
"""

from pathlib import Path

import fsspec
from fsspec.implementations.local import LocalFileSystem

source_types = ["is2_local", "is2_s3"]

if not isinstance(filepath, Path) and not isinstance(filepath, str):
raise TypeError("filepath must be a string or Path")

fsmap = fsspec.get_mapper(str(filepath))
output_fs = fsmap.fs

if "s3" in output_fs.protocol:
return source_types[1]
elif isinstance(output_fs, LocalFileSystem):
assert _validate_source(filepath)
return source_types[0]
else:
raise ValueError("Could not confirm the datasource type.")

"""
Could also use: os.path.splitext(f.name)[1].lower() to get file extension

If we ultimately want to handle mixed types, save the valid paths in a dict with "s3" or "local" as the keys and the lists of files as the values.
The dict could also then contain a catalog key with a dict of catalogs for each of those input types ("s3" or "local").
In general, the issue we'll run into with multiple files is going to be merging them during read-in,
so it could be beneficial not to hide this too much and to mandate that users handle it intentionally outside the read-in itself.

This function was derived using some of the following resources, based on echopype:
https://github.com/OSOceanAcoustics/echopype/blob/ab5128fb8580f135d875580f0469e5fba3193b84/echopype/utils/io.py

https://filesystem-spec.readthedocs.io/en/latest/api.html?highlight=get_map#fsspec.spec.AbstractFileSystem.glob

https://filesystem-spec.readthedocs.io/en/latest/_modules/fsspec/implementations/local.html

https://github.com/OSOceanAcoustics/echopype/blob/ab5128fb8580f135d875580f0469e5fba3193b84/echopype/convert/api.py#L380

https://echopype.readthedocs.io/en/stable/convert.html
"""


# Dev note: function fully tested as currently written
def _validate_source(source):
"""
Check that the entered data source paths on the local file system are valid

Currently, s3 data source paths are not validated.
"""

# acceptable inputs (for now) are a single file or directory
# would ultimately like to make a Path (from pathlib import Path; isinstance(source, Path)) an option
# see https://github.com/OSOceanAcoustics/echopype/blob/ab5128fb8580f135d875580f0469e5fba3193b84/echopype/utils/io.py#L82
assert isinstance(source, str), "You must enter your input as a string."
assert (
os.path.isdir(source) is True or os.path.isfile(source) is True
), "Your data source string is not a valid data source."
return True


# Dev Note: function is tested (at least loosely)
def _run_fast_scandir(dir, fn_glob):
"""
Quickly scan nested directories to get a list of filenames that match the fn_glob string.
Modified from https://stackoverflow.com/a/59803793/2441026
(faster than os.walk or glob methods, and allows filename matching in subdirectories).

Parameters
----------
dir : str
full path to the input directory

fn_glob : str
glob-style filename pattern
Parse the user's data_source input based on type.

Outputs
Returns
-------
subfolders : list
list of strings of all nested subdirectories

files : list
list of strings containing full paths to each file matching the filename pattern
filelist : list of str
List of granule filenames to be read in.
"""

subfolders, files = [], []

for f in os.scandir(dir):
if any(f.name.startswith(s) for s in ["__", "."]):
continue
if f.is_dir():
subfolders.append(f.path)
if f.is_file():
if fnmatch.fnmatch(f.name, fn_glob):
files.append(f.path)

for dir in list(subfolders):
sf, f = _run_fast_scandir(dir, fn_glob)
subfolders.extend(sf)
files.extend(f)

return subfolders, files


# Need to post on intake's page to see if this would be a useful contribution...
# https://github.com/intake/intake/blob/0.6.4/intake/source/utils.py#L216
def _pattern_to_glob(pattern):
"""
Adapted from intake.source.utils.path_to_glob to convert a path as pattern into a glob style path
that uses the pattern's indicated number of '?' instead of '*' where an int was specified.

Returns pattern if pattern is not a string.

Parameters
----------
pattern : str
Path as pattern optionally containing format_strings

Returns
-------
glob_path : str
Path with int format strings replaced with the proper number of '?' and '*' otherwise.
from pathlib import Path

Examples
--------
>>> _pattern_to_glob('{year}/{month}/{day}.csv')
'*/*/*.csv'
>>> _pattern_to_glob('{year:4}/{month:2}/{day:2}.csv')
'????/??/??.csv'
>>> _pattern_to_glob('data/{year:4}{month:02}{day:02}.csv')
'data/????????.csv'
>>> _pattern_to_glob('data/*.csv')
'data/*.csv'
"""
from string import Formatter
if isinstance(data_source, list):
assert all(isinstance(f, (str, Path)) for f in data_source)
# if data_source is a list pass that directly to _filelist
filelist = data_source
elif os.path.isdir(data_source):
# if data_source is a directory glob search the directory and assign to _filelist
data_source = os.path.join(data_source, "*")
filelist = glob.glob(data_source, **glob_kwargs)
elif isinstance(data_source, (str, Path)):
if str(data_source).startswith("s3"):
# if the string is an s3 path put it in the _filelist without globbing
filelist = [data_source]
else:
# data_source is a globable string
filelist = glob.glob(data_source, **glob_kwargs)
else:
raise TypeError(
"data_source should be a list of files, a directory, the path to a file, "
"or a glob string."
)

if not isinstance(pattern, str):
return pattern
# Remove any directories from the list (these get generated during recursive
# glob search)
filelist = [f for f in filelist if not os.path.isdir(f)]

fmt = Formatter()
glob_path = ""
# prev_field_name = None
for literal_text, field_name, format_specs, _ in fmt.parse(format_string=pattern):
glob_path += literal_text
if field_name and (glob_path != "*"):
try:
glob_path += "?" * int(format_specs)
except ValueError:
glob_path += "*"
# alternatively, you could use bits=utils._get_parts_of_format_string(resolved_string, literal_texts, format_specs)
# and then use len(bits[i]) to get the length of each format_spec
# print(glob_path)
return glob_path
return filelist
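
For reference, a minimal sketch of how the two pass-through branches of _parse_source behave (the filenames here are hypothetical):

>>> _parse_source(["ATL06_A.h5", "ATL06_B.h5"])  # a list is passed through as-is
['ATL06_A.h5', 'ATL06_B.h5']
>>> _parse_source("s3://my-bucket/ATL06_A.h5")  # s3 paths are neither globbed nor validated
['s3://my-bucket/ATL06_A.h5']

Directory and glob-string inputs instead go through glob.glob, so the returned filelist depends on the files actually present.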


def _confirm_proceed():
@@ -282,16 +163,17 @@

Parameters
----------
data_source : string, List
A string or list which specifies the files to be read.
data_source : string, Path, List
A string, pathlib.Path object, or list which specifies the files to be read.
The string can be either:
1) the path of a single file,
2) the path to a directory, or
3) a [glob string](https://docs.python.org/3/library/glob.html).
The List must be a list of strings, each of which is the path of a single file.

glob_kwargs : dict, default {}
Additional arguments to be passed into the [glob.glob()](https://docs.python.org/3/library/glob.html#glob.glob)function
Additional arguments to be passed into the
[glob.glob()](https://docs.python.org/3/library/glob.html#glob.glob) function.

out_obj_type : object, default xarray.Dataset
The desired format for the data to be read in.
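
Because glob_kwargs is forwarded untouched to glob.glob, recursive searches come for free; a short sketch under the assumption that icepyx is imported as ipx (the directory is hypothetical):

>>> import icepyx as ipx
>>> reader = ipx.Read("/data/is2/**/*.h5", glob_kwargs={"recursive": True})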
@@ -370,29 +252,7 @@
"Please use the `data_source` argument to specify your dataset instead."
)

if isinstance(data_source, list):
# if data_source is a list pass that directly to _filelist
self._filelist = data_source
elif os.path.isdir(data_source):
# if data_source is a directory glob search the directory and assign to _filelist
data_source = os.path.join(data_source, "*")
self._filelist = glob.glob(data_source, **glob_kwargs)
elif isinstance(data_source, str):
if data_source.startswith("s3"):
# if the string is an s3 path put it in the _filelist without globbing
self._filelist = [data_source]
else:
# data_source is a globable string
self._filelist = glob.glob(data_source, **glob_kwargs)
else:
raise TypeError(
"data_source should be a list of files, a directory, the path to a file, "
"or a glob string."
)

# Remove any directories from the list (these get generated during recursive
# glob search)
self._filelist = [f for f in self._filelist if not os.path.isdir(f)]
self._filelist = _parse_source(data_source, glob_kwargs)

Codecov / codecov/patch check warning on line 255 in icepyx/core/read.py: Added line #L255 was not covered by tests.
JessicaS11 marked this conversation as resolved.
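
Since the constructor now delegates to _parse_source, a pathlib.Path (newly accepted per commit 615f7a2) flows through the same branches as a string; a sketch with a hypothetical path:

>>> from pathlib import Path
>>> reader = ipx.Read(Path("/data/is2/ATL06_A.h5"))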

# Create a dictionary of the products as read from the metadata
product_dict = {}
JessicaS11 marked this conversation as resolved.
@@ -454,7 +314,8 @@
def vars(self):
"""
Return the variables object associated with the data being read in.
This instance is generated from the source file or first file in a list of input files (when source is a directory).
This instance is generated from the source file or first file in a list of input files
(when source is a directory).

See Also
--------
@@ -778,8 +639,6 @@

It may be possible to expand this function to provide multiple templates.
"""
# NOTE: use the hdf5 library to grab the attr for the product specifier
# can ultimately then use it to check against user specified one or merge strategies (or to return a list of ds)

is2ds = xr.Dataset(
coords=dict(