From e591d83ed92d67ca368f10a3af709d5c9ff3cb80 Mon Sep 17 00:00:00 2001 From: Rachel Wegener <35503632+rwegener2@users.noreply.github.com> Date: Wed, 18 Oct 2023 14:02:15 -0400 Subject: [PATCH] update Read input arguments (#444) * add filelist and product properties to Read object * deprecate filename_pattern and product class Read inputs * transition to data_source input as a string (including glob string) or list * update tutorial with changes and user guidance for using glob --------- Co-authored-by: Jessica Scheick --- .../example_notebooks/IS2_data_read-in.ipynb | 182 +++++++++++----- .../documentation/classes_dev_uml.svg | 122 +++++------ .../documentation/classes_user_uml.svg | 21 +- doc/source/user_guide/documentation/read.rst | 2 + icepyx/core/is2ref.py | 5 +- icepyx/core/read.py | 200 +++++++++++++----- icepyx/tests/test_is2ref.py | 4 +- 7 files changed, 353 insertions(+), 183 deletions(-) diff --git a/doc/source/example_notebooks/IS2_data_read-in.ipynb b/doc/source/example_notebooks/IS2_data_read-in.ipynb index 115c63044..9bbac368b 100644 --- a/doc/source/example_notebooks/IS2_data_read-in.ipynb +++ b/doc/source/example_notebooks/IS2_data_read-in.ipynb @@ -63,9 +63,8 @@ "metadata": {}, "outputs": [], "source": [ - "path_root = '/full/path/to/your/data/'\n", - "pattern = \"processed_ATL{product:2}_{datetime:%Y%m%d%H%M%S}_{rgt:4}{cycle:2}{orbitsegment:2}_{version:3}_{revision:2}.h5\"\n", - "reader = ipx.Read(path_root, \"ATL06\", pattern) # or ipx.Read(filepath, \"ATLXX\") if your filenames match the default pattern" + "path_root = '/full/path/to/your/ATL06_data/'\n", + "reader = ipx.Read(path_root)" ] }, { @@ -111,10 +110,9 @@ "\n", "Reading in ICESat-2 data with icepyx happens in a few simple steps:\n", "1. Let icepyx know where to find your data (this might be local files or urls to data in cloud storage)\n", - "2. Tell icepyx how to interpret the filename format\n", - "3. Create an icepyx `Read` object\n", - "4. Make a list of the variables you want to read in (does not apply for gridded products)\n", - "5. Load your data into memory (or read it in lazily, if you're using Dask)\n", + "2. Create an icepyx `Read` object\n", + "3. Make a list of the variables you want to read in (does not apply for gridded products)\n", + "4. Load your data into memory (or read it in lazily, if you're using Dask)\n", "\n", "We go through each of these steps in more detail in this notebook." ] @@ -168,21 +166,18 @@ { "cell_type": "markdown", "id": "e8da42c1", - "metadata": {}, + "metadata": { + "user_expressions": [] + }, "source": [ "### Step 1: Set data source path\n", "\n", "Provide a full path to the data to be read in (i.e. opened).\n", "Currently accepted inputs are:\n", - "* a directory\n", - "* a single file\n", - "\n", - "All files to be read in *must* have a consistent filename pattern.\n", - "If a directory is supplied as the data source, all files in any subdirectories that match the filename pattern will be included.\n", - "\n", - "S3 bucket data access is currently under development, and requires you are registered with NSIDC as a beta tester for cloud-based ICESat-2 data.\n", - "icepyx is working to ensure a smooth transition to working with remote files.\n", - "We'd love your help exploring and testing these features as they become available!" + "* a string path to directory - all files from the directory will be opened\n", + "* a string path to single file - one file will be opened\n", + "* a list of filepaths - all files in the list will be opened\n", + "* a glob string (see [glob](https://docs.python.org/3/library/glob.html)) - any files matching the glob pattern will be opened" ] }, { @@ -208,86 +203,147 @@ { "cell_type": "code", "execution_count": null, - "id": "e683ebf7", + "id": "fac636c2-e0eb-4e08-adaa-8f47623e46a1", "metadata": {}, "outputs": [], "source": [ - "# urlpath = 's3://nsidc-cumulus-prod-protected/ATLAS/ATL03/004/2019/11/30/ATL03_20191130221008_09930503_004_01.h5'" + "# list_of_files = ['/my/data/ATL06/processed_ATL06_20190226005526_09100205_006_02.h5', \n", + "# '/my/other/data/ATL06/processed_ATL06_20191202102922_10160505_006_01.h5']" ] }, { "cell_type": "markdown", - "id": "92743496", + "id": "ba3ebeb0-3091-4712-b0f7-559ddb95ca5a", "metadata": { "user_expressions": [] }, "source": [ - "### Step 2: Create a filename pattern for your data files\n", + "#### Glob Strings\n", + "\n", + "[glob](https://docs.python.org/3/library/glob.html) is a Python library which allows users to list files in their file systems whose paths match a given pattern. Icepyx uses the glob library to give users greater flexibility over their input file lists.\n", + "\n", + "glob works using `*` and `?` as wildcard characters, where `*` matches any number of characters and `?` matches a single character. For example:\n", "\n", - "Files provided by NSIDC typically match the format `\"ATL{product:2}_{datetime:%Y%m%d%H%M%S}_{rgt:4}{cycle:2}{orbitsegment:2}_{version:3}_{revision:2}.h5\"` where the parameters in curly brackets indicate a parameter name (left of the colon) and character length or format (right of the colon).\n", - "Some of this information is used during data opening to help correctly read and label the data within the data structure, particularly when multiple files are opened simultaneously.\n", + "* `/this/path/*.h5`: refers to all `.h5` files in the `/this/path` folder (Example matches: \"/this/path/processed_ATL03_20191130221008_09930503_006_01.h5\" or \"/this/path/myfavoriteicsat-2file.h5\")\n", + "* `/this/path/*ATL07*.h5`: refers to all `.h5` files in the `/this/path` folder that have ATL07 in the filename. (Example matches: \"/this/path/ATL07-02_20221012220720_03391701_005_01.h5\" or \"/this/path/processed_ATL07.h5\")\n", + "* `/this/path/ATL??/*.h5`: refers to all `.h5` files that are in a subfolder of `/this/path` and a subdirectory of `ATL` followed by any 2 characters (Example matches: \"/this/path/ATL03/processed_ATL03_20191130221008_09930503_006_01.h5\", \"/this/path/ATL06/myfile.h5\")\n", "\n", - "By default, icepyx will assume your filenames follow the default format.\n", - "However, you can easily read in other ICESat-2 data files by supplying your own filename pattern.\n", - "For instance, `pattern=\"ATL{product:2}-{datetime:%Y%m%d%H%M%S}-Sample.h5\"`. A few example patterns are provided below." + "See the glob documentation or other online explainer tutorials for more in depth explanation, or advanced glob paths such as character classes and ranges." ] }, { - "cell_type": "code", - "execution_count": null, - "id": "7318abd0", - "metadata": {}, - "outputs": [], + "cell_type": "markdown", + "id": "20286c76-5632-4420-b2c9-a5a6b1952672", + "metadata": { + "user_expressions": [] + }, + "source": [ + "#### Recursive Directory Search" + ] + }, + { + "cell_type": "markdown", + "id": "632bd1ce-2397-4707-a63f-9d5d2fc02fbc", + "metadata": { + "user_expressions": [] + }, + "source": [ + "glob will not by default search all of the subdirectories for matching filepaths, but it has the ability to do so.\n", + "\n", + "If you would like to search recursively, you can achieve this by either:\n", + "1. passing the `recursive` argument into `glob_kwargs` and including `\\**\\` in your filepath\n", + "2. using glob directly to create a list of filepaths\n", + "\n", + "Each of these two methods are shown below." + ] + }, + { + "cell_type": "markdown", + "id": "da0cacd8-9ddc-4c31-86b6-167d850b989e", + "metadata": { + "user_expressions": [] + }, "source": [ - "# pattern = 'ATL06-{datetime:%Y%m%d%H%M%S}-Sample.h5'\n", - "# pattern = 'ATL{product:2}-{datetime:%Y%m%d%H%M%S}-Sample.h5'" + "Method 1: passing the `recursive` argument into `glob_kwargs`" ] }, { "cell_type": "code", "execution_count": null, - "id": "f43e8664", + "id": "e276b876-9ec7-4991-8520-05c97824b896", "metadata": {}, "outputs": [], "source": [ - "# pattern = \"ATL{product:2}_{datetime:%Y%m%d%H%M%S}_{rgt:4}{cycle:2}{orbitsegment:2}_{version:3}_{revision:2}.h5\"" + "ipx.Read('/path/to/**/folder', glob_kwargs={'recursive': True})" + ] + }, + { + "cell_type": "markdown", + "id": "f5a1e85e-fc4a-405f-9710-0cb61b827f2c", + "metadata": { + "user_expressions": [] + }, + "source": [ + "You can use `glob_kwargs` for any additional argument to Python's builtin `glob.glob` that you would like to pass in via icepyx." + ] + }, + { + "cell_type": "markdown", + "id": "76de9539-710c-49f6-9e9e-238849382c33", + "metadata": { + "user_expressions": [] + }, + "source": [ + "Method 2: using glob directly to create a list of filepaths" ] }, { "cell_type": "code", "execution_count": null, - "id": "992a77fb", + "id": "be79b0dd-efcf-4d50-bdb0-8e3ae8e8e38c", "metadata": {}, "outputs": [], "source": [ - "# grid_pattern = \"ATL{product:2}_GL_0311_{res:3}m_{version:3}_{revision:2}.nc\"" + "import glob" ] }, { "cell_type": "code", "execution_count": null, - "id": "6aec1a70", - "metadata": {}, + "id": "5d088571-496d-479a-9fb7-833ed7e98676", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "pattern = \"processed_ATL{product:2}_{datetime:%Y%m%d%H%M%S}_{rgt:4}{cycle:2}{orbitsegment:2}_{version:3}_{revision:2}.h5\"" + "list_of_files = glob.glob('/path/to/**/folder', recursive=True)\n", + "ipx.Read(list_of_files)" ] }, { "cell_type": "markdown", - "id": "4275b04c", + "id": "08df2874-7c54-4670-8f37-9135ea296ff5", "metadata": { "user_expressions": [] }, "source": [ - "### Step 3: Create an icepyx read object\n", + "```{admonition} Read Module Update\n", + "Previously, icepyx required two additional conditions: 1) a `product` argument and 2) that your files either matched the default `filename_pattern` or that the user provided their own `filename_pattern`. These two requirements have been removed. `product` is now read directly from the file metadata (the root group's `short_name` attribute). Flexibility to specify multiple files via the `filename_pattern` has been replaced with the [glob string](https://docs.python.org/3/library/glob.html) feature, and by allowing a list of filepaths as an argument.\n", "\n", - "The `Read` object has two required inputs:\n", - "- `path` = a string with the full file path or full directory path to your hdf5 (.h5) format files.\n", - "- `product` = the data product you're working with, also known as the \"short name\".\n", + "The `product` and `filename_pattern` arguments have been maintained for backwards compatibility, but will be fully removed in icepyx version 1.0.0.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "4275b04c", + "metadata": { + "user_expressions": [] + }, + "source": [ + "### Step 2: Create an icepyx read object\n", "\n", - "The `Read` object also accepts the optional keyword input:\n", - "- `pattern` = a formatted string indicating the filename pattern required for Intake's path_as_pattern argument." + "Using the `data_source` described in Step 1, we can create our Read object." ] }, { @@ -299,7 +355,17 @@ }, "outputs": [], "source": [ - "reader = ipx.Read(data_source=path_root, product=\"ATL06\", filename_pattern=pattern) # or ipx.Read(filepath, \"ATLXX\") if your filenames match the default pattern" + "reader = ipx.Read(data_source=path_root)" + ] + }, + { + "cell_type": "markdown", + "id": "7b2acfdb-75eb-4c64-b583-2ab19326aaee", + "metadata": { + "user_expressions": [] + }, + "source": [ + "The Read object now contains the list of matching files that will eventually be loaded into Python. You can inspect its properties, such as the files that were located or the identified product, directly on the Read object." ] }, { @@ -309,7 +375,17 @@ "metadata": {}, "outputs": [], "source": [ - "reader._filelist" + "reader.filelist" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7455ee3f-f9ab-486e-b4c7-2fa2314d4084", + "metadata": {}, + "outputs": [], + "source": [ + "reader.product" ] }, { @@ -319,7 +395,7 @@ "user_expressions": [] }, "source": [ - "### Step 4: Specify variables to be read in\n", + "### Step 3: Specify variables to be read in\n", "\n", "To load your data into memory or prepare it for analysis, icepyx needs to know which variables you'd like to read in.\n", "If you've used icepyx to download data from NSIDC with variable subsetting (which is the default), then you may already be familiar with the icepyx `Variables` module and how to create and modify lists of variables.\n", @@ -426,7 +502,7 @@ "user_expressions": [] }, "source": [ - "### Step 5: Loading your data\n", + "### Step 4: Loading your data\n", "\n", "Now that you've set up all the options, you're ready to read your ICESat-2 data into memory!" ] @@ -541,9 +617,9 @@ ], "metadata": { "kernelspec": { - "display_name": "general", + "display_name": "icepyx-dev", "language": "python", - "name": "general" + "name": "icepyx-dev" }, "language_info": { "codemirror_mode": { diff --git a/doc/source/user_guide/documentation/classes_dev_uml.svg b/doc/source/user_guide/documentation/classes_dev_uml.svg index 34e13b41c..0cd08c9e9 100644 --- a/doc/source/user_guide/documentation/classes_dev_uml.svg +++ b/doc/source/user_guide/documentation/classes_dev_uml.svg @@ -4,11 +4,11 @@ - + classes_dev_uml - + icepyx.core.auth.AuthenticationError @@ -139,38 +139,38 @@ icepyx.core.icesat2data.Icesat2Data - -Icesat2Data - - -__init__() + +Icesat2Data + + +__init__() icepyx.core.exceptions.NsidcQueryError - -NsidcQueryError - -errmsg -msgtxt : str - -__init__(errmsg, msgtxt) -__str__() + +NsidcQueryError + +errmsg +msgtxt : str + +__init__(errmsg, msgtxt) +__str__() icepyx.core.exceptions.QueryError - -QueryError - - - + +QueryError + + + icepyx.core.exceptions.NsidcQueryError->icepyx.core.exceptions.QueryError - - + + @@ -235,24 +235,24 @@ icepyx.core.read.Read - -Read - -_filelist : NoneType, list -_out_obj : Dataset -_pattern : str -_prod : str -_read_vars -_source_type : str -data_source -vars - -__init__(data_source, product, filename_pattern, catalog, out_obj_type) -_add_vars_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict) -_build_dataset_template(file) -_build_single_file_dataset(file, groups_list) -_check_source_for_pattern(source, filename_pattern) -_combine_nested_vars(is2ds, ds, grp_path, wanted_dict) + +Read + +_filelist : NoneType, list +_out_obj : Dataset +_product : NoneType, str +_read_vars +filelist +product +vars + +__init__(data_source, product, filename_pattern, catalog, glob_kwargs, out_obj_type) +_add_vars_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict) +_build_dataset_template(file) +_build_single_file_dataset(file, groups_list) +_check_source_for_pattern(source, filename_pattern) +_combine_nested_vars(is2ds, ds, grp_path, wanted_dict) +_extract_product(filepath) _read_single_grp(file, grp_path) load() @@ -366,30 +366,30 @@ icepyx.core.variables.Variables->icepyx.core.read.Read - - -_read_vars + + +_read_vars icepyx.core.visualization.Visualize - -Visualize - -bbox : list -cycles : NoneType -date_range : NoneType -product : NoneType, str -tracks : NoneType - -__init__(query_obj, product, spatial_extent, date_range, cycles, tracks) -generate_OA_parameters(): list -grid_bbox(binsize): list -make_request(base_url, payload) -parallel_request_OA(): da.array -query_icesat2_filelist(): tuple -request_OA_data(paras): da.array -viz_elevation(): (hv.DynamicMap, hv.Layout) + +Visualize + +bbox : list +cycles : NoneType +date_range : NoneType +product : NoneType, str +tracks : NoneType + +__init__(query_obj, product, spatial_extent, date_range, cycles, tracks) +generate_OA_parameters(): list +grid_bbox(binsize): list +make_request(base_url, payload) +parallel_request_OA(): da.array +query_icesat2_filelist(): tuple +request_OA_data(paras): da.array +viz_elevation(): (hv.DynamicMap, hv.Layout) diff --git a/doc/source/user_guide/documentation/classes_user_uml.svg b/doc/source/user_guide/documentation/classes_user_uml.svg index 640f76815..a9c116469 100644 --- a/doc/source/user_guide/documentation/classes_user_uml.svg +++ b/doc/source/user_guide/documentation/classes_user_uml.svg @@ -201,13 +201,14 @@ icepyx.core.read.Read - -Read - -data_source -vars - -load() + +Read + +filelist +product +vars + +load() @@ -300,9 +301,9 @@ icepyx.core.variables.Variables->icepyx.core.read.Read - - -_read_vars + + +_read_vars diff --git a/doc/source/user_guide/documentation/read.rst b/doc/source/user_guide/documentation/read.rst index a5beedf4e..68da03b1d 100644 --- a/doc/source/user_guide/documentation/read.rst +++ b/doc/source/user_guide/documentation/read.rst @@ -19,6 +19,8 @@ Attributes .. autosummary:: :toctree: ../../_icepyx/ + Read.filelist + Read.product Read.vars diff --git a/icepyx/core/is2ref.py b/icepyx/core/is2ref.py index a3a0311bb..5faaef110 100644 --- a/icepyx/core/is2ref.py +++ b/icepyx/core/is2ref.py @@ -15,6 +15,7 @@ def _validate_product(product): """ Confirm a valid ICESat-2 product was specified """ + error_msg = "A valid product string was not provided. Check user input, if given, or file metadata." if isinstance(product, str): product = str.upper(product) assert product in [ @@ -40,9 +41,9 @@ def _validate_product(product): "ATL20", "ATL21", "ATL23", - ], "Please enter a valid product" + ], error_msg else: - raise TypeError("Please enter a product string") + raise TypeError(error_msg) return product diff --git a/icepyx/core/read.py b/icepyx/core/read.py index 627395be2..5ef1867f2 100644 --- a/icepyx/core/read.py +++ b/icepyx/core/read.py @@ -1,7 +1,9 @@ import fnmatch +import glob import os import warnings +import h5py import numpy as np import xarray as xr @@ -10,8 +12,6 @@ from icepyx.core.variables import Variables as Variables from icepyx.core.variables import list_of_dict_vals -# from icepyx.core.query import Query - def _make_np_datetime(df, keyword): """ @@ -266,24 +266,28 @@ class Read: Parameters ---------- - data_source : string - A string with a full file path or full directory path to ICESat-2 hdf5 (.h5) format files. - Files within a directory must have a consistent filename pattern that includes the "ATL??" data product name. - Files must all be within a single directory. + data_source : string, List + A string or list which specifies the files to be read. The string can be either: 1) the path of a single file 2) the path to a directory or 3) a [glob string](https://docs.python.org/3/library/glob.html). + The List must be a list of strings, each of which is the path of a single file. product : string ICESat-2 data product ID, also known as "short name" (e.g. ATL03). Available data products can be found at: https://nsidc.org/data/icesat-2/data-sets + **Deprecation warning:** This argument is no longer required and will be deprecated in version 1.0.0. The dataset product is read from the file metadata. - filename_pattern : string, default 'ATL{product:2}_{datetime:%Y%m%d%H%M%S}_{rgt:4}{cycle:2}{orbitsegment:2}_{version:3}_{revision:2}.h5' - String that shows the filename pattern as required for Intake's path_as_pattern argument. + filename_pattern : string, default None + String that shows the filename pattern as previously required for Intake's path_as_pattern argument. The default describes files downloaded directly from NSIDC (subsetted and non-subsetted) for most products (e.g. ATL06). The ATL11 filename pattern from NSIDC is: 'ATL{product:2}_{rgt:4}{orbitsegment:2}_{cycles:4}_{version:3}_{revision:2}.h5'. + **Deprecation warning:** This argument is no longer required and will be deprecated in version 1.0.0. catalog : string, default None Full path to an Intake catalog for reading in data. If you still need to create a catalog, leave as default. - **Deprecation warning:** This argument has been depreciated. Please use the data_source argument to pass in valid data. + **Deprecation warning:** This argument has been deprecated. Please use the data_source argument to pass in valid data. + + glob_kwargs : dict, default {} + Additional arguments to be passed into the [glob.glob()](https://docs.python.org/3/library/glob.html#glob.glob)function out_obj_type : object, default xarray.Dataset The desired format for the data to be read in. @@ -296,6 +300,21 @@ class Read: Examples -------- + Reading a single file + >>> ipx.Read('/path/to/data/processed_ATL06_20190226005526_09100205_006_02.h5') # doctest: +SKIP + + Reading all files in a directory + >>> ipx.Read('/path/to/data/') # doctest: +SKIP + + Reading files that match a particular pattern (here, all .h5 files that start with `processed_ATL06_`). + >>> ipx.Read('/path/to/data/processed_ATL06_*.h5') # doctest: +SKIP + + Reading a specific list of files + >>> list_of_files = [ + ... '/path/to/data/processed_ATL06_20190226005526_09100205_006_02.h5', + ... '/path/to/more/data/processed_ATL06_20191202102922_10160505_006_01.h5', + ... ] + >>> ipx.Read(list_of_files) # doctest: +SKIP """ @@ -306,11 +325,12 @@ def __init__( self, data_source=None, product=None, - filename_pattern="ATL{product:2}_{datetime:%Y%m%d%H%M%S}_{rgt:4}{cycle:2}{orbitsegment:2}_{version:3}_{revision:2}.h5", + filename_pattern=None, catalog=None, + glob_kwargs={}, out_obj_type=None, # xr.Dataset, ): - # Raise error for depreciated argument + # Raise error for deprecated argument if catalog: raise DeprecationError( "The `catalog` argument has been deprecated and intake is no longer supported. " @@ -318,43 +338,93 @@ def __init__( ) if data_source is None: - raise ValueError("Please provide a data source.") - else: - self._source_type = _check_datasource(data_source) - self.data_source = data_source + raise ValueError("data_source is a required arguemnt") - if product is None: - raise ValueError( - "Please provide the ICESat-2 data product of your file(s)." + # Raise warnings for deprecated arguments + if filename_pattern: + warnings.warn( + "The `filename_pattern` argument is deprecated. Instead please provide a " + "string, list, or glob string to the `data_source` argument.", + stacklevel=2, ) - else: - self._prod = is2ref._validate_product(product) - pattern_ck, filelist = Read._check_source_for_pattern( - data_source, filename_pattern - ) - assert pattern_ck - # Note: need to check if this works for subset and non-subset NSIDC files (processed_ prepends the former) - self._pattern = filename_pattern - - # this is a first pass at getting rid of mixed product types and warning the user. - # it takes an approach assuming the product name is in the filename, but needs reworking if we let multiple products be loaded - # one way to handle this would be bring in the product info during the loading step and fill in product there instead of requiring it from the user - filtered_filelist = [file for file in filelist if self._prod in file] - if len(filtered_filelist) == 0: + + if product: + product = is2ref._validate_product(product) warnings.warn( - "Your filenames do not contain a product identifier (e.g. ATL06). " - "You will likely need to manually merge your dataframes." + "The `product` argument is no longer required. If the `data_source` argument given " + "contains files with multiple products the `product` argument will be used " + "to filter that list. In all other cases the product argument is ignored. " + "The recommended approach is to not include a `product` argument and instead " + "provide a `data_source` with files of only a single product type`.", + stacklevel=2, ) + + # Create the filelist from the `data_source` argument + if filename_pattern: + # maintained for backward compatibility + pattern_ck, filelist = Read._check_source_for_pattern( + data_source, filename_pattern + ) + assert pattern_ck self._filelist = filelist - elif len(filtered_filelist) < len(filelist): - warnings.warn( - "Some files matching your filename pattern were removed as they were not the specified product." + elif isinstance(data_source, list): + self._filelist = data_source + elif os.path.isdir(data_source): + data_source = os.path.join(data_source, "*") + self._filelist = glob.glob(data_source, **glob_kwargs) + else: + self._filelist = glob.glob(data_source, **glob_kwargs) + # Remove any directories from the list + self._filelist = [f for f in self._filelist if not os.path.isdir(f)] + + # Create a dictionary of the products as read from the metadata + product_dict = {} + for file_ in self._filelist: + product_dict[file_] = self._extract_product(file_) + + # Raise warnings or errors for multiple products or products not matching the user-specified product + all_products = list(set(product_dict.values())) + if len(all_products) > 1: + if product: + warnings.warn( + f"Multiple products found in list of files: {product_dict}. Files that " + "do not match the user specified product will be removed from processing.\n" + "Filtering files using a `product` argument is deprecated. Please use the " + "`data_source` argument to specify a list of files with the same product.", + stacklevel=2, + ) + self._filelist = [] + for key, value in product_dict.items(): + if value == product: + self._filelist.append(key) + if len(self._filelist) == 0: + raise TypeError( + "No files found in the file list matching the user-specified " + "product type" + ) + # Use the cleaned filelist to assign a product + self._product = product + else: + raise TypeError( + f"Multiple product types were found in the file list: {product_dict}." + "Please provide a valid `data_source` parameter indicating files of a single " + "product" + ) + elif len(all_products) == 0: + raise TypeError( + "No files found matching the specified `data_source`. Check your glob " + "string or file list." ) - self._filelist = filtered_filelist else: - self._filelist = filelist - - # after validation, use the notebook code and code outline to start implementing the rest of the class + # Assign the identified product to the property + self._product = all_products[0] + # Raise a warning if the metadata-located product differs from the user-specified product + if product and self._product != product: + warnings.warn( + f"User specified product {product} does not match the product from the file" + " metadata {self._product}", + stacklevel=2, + ) if out_obj_type is not None: print( @@ -387,14 +457,43 @@ def vars(self): if not hasattr(self, "_read_vars"): self._read_vars = Variables( - "file", path=self._filelist[0], product=self._prod + "file", path=self.filelist[0], product=self.product ) return self._read_vars + @property + def filelist(self): + """ + Return the list of files represented by this Read object. + """ + return self._filelist + + @property + def product(self): + """ + Return the product associated with the Read object. + """ + return self._product + # ---------------------------------------------------------------------- # Methods + @staticmethod + def _extract_product(filepath): + """ + Read the product type from the metadata of the file. Return the product as a string. + """ + with h5py.File(filepath, "r") as f: + try: + product = f.attrs["short_name"].decode() + product = is2ref._validate_product(product) + except KeyError: + raise AttributeError( + f"Unable to extract the product name from file metadata." + ) + return product + @staticmethod def _check_source_for_pattern(source, filename_pattern): """ @@ -651,7 +750,7 @@ def load(self): # However, this led to errors when I tried to combine two identical datasets because the single dimension was equal. # In these situations, xarray recommends manually controlling the merge/concat process yourself. # While unlikely to be a broad issue, I've heard of multiple matching timestamps causing issues for combining multiple IS2 datasets. - for file in self._filelist: + for file in self.filelist: all_dss.append( self._build_single_file_dataset(file, groups_list) ) # wanted_groups, vgrp.keys())) @@ -686,7 +785,7 @@ def _build_dataset_template(self, file): gran_idx=[np.uint64(999999)], source_file=(["gran_idx"], [file]), ), - attrs=dict(data_product=self._prod), + attrs=dict(data_product=self.product), ) return is2ds @@ -734,20 +833,11 @@ def _build_single_file_dataset(self, file, groups_list): ------- Xarray Dataset """ - file_product = self._read_single_grp(file, "/").attrs["identifier_product_type"] - assert ( - file_product == self._prod - ), "Your product specification does not match the product specification within your files." - # I think the below method might NOT read the file into memory as the above might? - # import h5py - # with h5py.File(filepath,'r') as h5pt: - # prod_id = h5pt.attrs["identifier_product_type"] - # DEVNOTE: if and elif does not actually apply wanted variable list, and has not been tested for merging multiple files into one ds # if a gridded product # TODO: all products need to be tested, and quicklook products added or explicitly excluded # Level 3b, gridded (netcdf): ATL14, 15, 16, 17, 18, 19, 20, 21 - if self._prod in [ + if self.product in [ "ATL14", "ATL15", "ATL16", @@ -761,7 +851,7 @@ def _build_single_file_dataset(self, file, groups_list): is2ds = xr.open_dataset(file) # Level 3b, hdf5: ATL11 - elif self._prod in ["ATL11"]: + elif self.product in ["ATL11"]: is2ds = self._build_dataset_template(file) # returns the wanted groups as a single list of full group path strings diff --git a/icepyx/tests/test_is2ref.py b/icepyx/tests/test_is2ref.py index b22709c98..fb8d16cad 100644 --- a/icepyx/tests/test_is2ref.py +++ b/icepyx/tests/test_is2ref.py @@ -8,14 +8,14 @@ def test_num_product(): dsnum = 6 - ermsg = "Please enter a product string" + ermsg = "A valid product string was not provided. Check user input, if given, or file metadata." with pytest.raises(TypeError, match=ermsg): is2ref._validate_product(dsnum) def test_bad_product(): wrngds = "atl-6" - ermsg = "Please enter a valid product" + ermsg = "A valid product string was not provided. Check user input, if given, or file metadata." with pytest.raises(AssertionError, match=ermsg): is2ref._validate_product(wrngds)