From bf2a40a2a4e4d8ff9a772b61928c3edbb913f4d0 Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Fri, 5 Jan 2024 14:36:23 -0500 Subject: [PATCH] Revert "update Read input arguments (#444)" This reverts commit bae2d896ca67dd659ca942a0fa79d64e4cfe3aff. --- .../example_notebooks/IS2_data_read-in.ipynb | 182 +++++----------- .../documentation/classes_dev_uml.svg | 122 +++++------ .../documentation/classes_user_uml.svg | 21 +- doc/source/user_guide/documentation/read.rst | 2 - icepyx/core/is2ref.py | 5 +- icepyx/core/read.py | 200 +++++------------- icepyx/tests/test_is2ref.py | 4 +- 7 files changed, 183 insertions(+), 353 deletions(-) diff --git a/doc/source/example_notebooks/IS2_data_read-in.ipynb b/doc/source/example_notebooks/IS2_data_read-in.ipynb index 9bbac368b..115c63044 100644 --- a/doc/source/example_notebooks/IS2_data_read-in.ipynb +++ b/doc/source/example_notebooks/IS2_data_read-in.ipynb @@ -63,8 +63,9 @@ "metadata": {}, "outputs": [], "source": [ - "path_root = '/full/path/to/your/ATL06_data/'\n", - "reader = ipx.Read(path_root)" + "path_root = '/full/path/to/your/data/'\n", + "pattern = \"processed_ATL{product:2}_{datetime:%Y%m%d%H%M%S}_{rgt:4}{cycle:2}{orbitsegment:2}_{version:3}_{revision:2}.h5\"\n", + "reader = ipx.Read(path_root, \"ATL06\", pattern) # or ipx.Read(filepath, \"ATLXX\") if your filenames match the default pattern" ] }, { @@ -110,9 +111,10 @@ "\n", "Reading in ICESat-2 data with icepyx happens in a few simple steps:\n", "1. Let icepyx know where to find your data (this might be local files or urls to data in cloud storage)\n", - "2. Create an icepyx `Read` object\n", - "3. Make a list of the variables you want to read in (does not apply for gridded products)\n", - "4. Load your data into memory (or read it in lazily, if you're using Dask)\n", + "2. Tell icepyx how to interpret the filename format\n", + "3. Create an icepyx `Read` object\n", + "4. Make a list of the variables you want to read in (does not apply for gridded products)\n", + "5. Load your data into memory (or read it in lazily, if you're using Dask)\n", "\n", "We go through each of these steps in more detail in this notebook." ] @@ -166,18 +168,21 @@ { "cell_type": "markdown", "id": "e8da42c1", - "metadata": { - "user_expressions": [] - }, + "metadata": {}, "source": [ "### Step 1: Set data source path\n", "\n", "Provide a full path to the data to be read in (i.e. opened).\n", "Currently accepted inputs are:\n", - "* a string path to directory - all files from the directory will be opened\n", - "* a string path to single file - one file will be opened\n", - "* a list of filepaths - all files in the list will be opened\n", - "* a glob string (see [glob](https://docs.python.org/3/library/glob.html)) - any files matching the glob pattern will be opened" + "* a directory\n", + "* a single file\n", + "\n", + "All files to be read in *must* have a consistent filename pattern.\n", + "If a directory is supplied as the data source, all files in any subdirectories that match the filename pattern will be included.\n", + "\n", + "S3 bucket data access is currently under development, and requires you are registered with NSIDC as a beta tester for cloud-based ICESat-2 data.\n", + "icepyx is working to ensure a smooth transition to working with remote files.\n", + "We'd love your help exploring and testing these features as they become available!" ] }, { @@ -203,135 +208,69 @@ { "cell_type": "code", "execution_count": null, - "id": "fac636c2-e0eb-4e08-adaa-8f47623e46a1", + "id": "e683ebf7", "metadata": {}, "outputs": [], "source": [ - "# list_of_files = ['/my/data/ATL06/processed_ATL06_20190226005526_09100205_006_02.h5', \n", - "# '/my/other/data/ATL06/processed_ATL06_20191202102922_10160505_006_01.h5']" - ] - }, - { - "cell_type": "markdown", - "id": "ba3ebeb0-3091-4712-b0f7-559ddb95ca5a", - "metadata": { - "user_expressions": [] - }, - "source": [ - "#### Glob Strings\n", - "\n", - "[glob](https://docs.python.org/3/library/glob.html) is a Python library which allows users to list files in their file systems whose paths match a given pattern. Icepyx uses the glob library to give users greater flexibility over their input file lists.\n", - "\n", - "glob works using `*` and `?` as wildcard characters, where `*` matches any number of characters and `?` matches a single character. For example:\n", - "\n", - "* `/this/path/*.h5`: refers to all `.h5` files in the `/this/path` folder (Example matches: \"/this/path/processed_ATL03_20191130221008_09930503_006_01.h5\" or \"/this/path/myfavoriteicsat-2file.h5\")\n", - "* `/this/path/*ATL07*.h5`: refers to all `.h5` files in the `/this/path` folder that have ATL07 in the filename. (Example matches: \"/this/path/ATL07-02_20221012220720_03391701_005_01.h5\" or \"/this/path/processed_ATL07.h5\")\n", - "* `/this/path/ATL??/*.h5`: refers to all `.h5` files that are in a subfolder of `/this/path` and a subdirectory of `ATL` followed by any 2 characters (Example matches: \"/this/path/ATL03/processed_ATL03_20191130221008_09930503_006_01.h5\", \"/this/path/ATL06/myfile.h5\")\n", - "\n", - "See the glob documentation or other online explainer tutorials for more in depth explanation, or advanced glob paths such as character classes and ranges." - ] - }, - { - "cell_type": "markdown", - "id": "20286c76-5632-4420-b2c9-a5a6b1952672", - "metadata": { - "user_expressions": [] - }, - "source": [ - "#### Recursive Directory Search" + "# urlpath = 's3://nsidc-cumulus-prod-protected/ATLAS/ATL03/004/2019/11/30/ATL03_20191130221008_09930503_004_01.h5'" ] }, { "cell_type": "markdown", - "id": "632bd1ce-2397-4707-a63f-9d5d2fc02fbc", + "id": "92743496", "metadata": { "user_expressions": [] }, "source": [ - "glob will not by default search all of the subdirectories for matching filepaths, but it has the ability to do so.\n", + "### Step 2: Create a filename pattern for your data files\n", "\n", - "If you would like to search recursively, you can achieve this by either:\n", - "1. passing the `recursive` argument into `glob_kwargs` and including `\\**\\` in your filepath\n", - "2. using glob directly to create a list of filepaths\n", + "Files provided by NSIDC typically match the format `\"ATL{product:2}_{datetime:%Y%m%d%H%M%S}_{rgt:4}{cycle:2}{orbitsegment:2}_{version:3}_{revision:2}.h5\"` where the parameters in curly brackets indicate a parameter name (left of the colon) and character length or format (right of the colon).\n", + "Some of this information is used during data opening to help correctly read and label the data within the data structure, particularly when multiple files are opened simultaneously.\n", "\n", - "Each of these two methods are shown below." - ] - }, - { - "cell_type": "markdown", - "id": "da0cacd8-9ddc-4c31-86b6-167d850b989e", - "metadata": { - "user_expressions": [] - }, - "source": [ - "Method 1: passing the `recursive` argument into `glob_kwargs`" + "By default, icepyx will assume your filenames follow the default format.\n", + "However, you can easily read in other ICESat-2 data files by supplying your own filename pattern.\n", + "For instance, `pattern=\"ATL{product:2}-{datetime:%Y%m%d%H%M%S}-Sample.h5\"`. A few example patterns are provided below." ] }, { "cell_type": "code", "execution_count": null, - "id": "e276b876-9ec7-4991-8520-05c97824b896", + "id": "7318abd0", "metadata": {}, "outputs": [], "source": [ - "ipx.Read('/path/to/**/folder', glob_kwargs={'recursive': True})" - ] - }, - { - "cell_type": "markdown", - "id": "f5a1e85e-fc4a-405f-9710-0cb61b827f2c", - "metadata": { - "user_expressions": [] - }, - "source": [ - "You can use `glob_kwargs` for any additional argument to Python's builtin `glob.glob` that you would like to pass in via icepyx." - ] - }, - { - "cell_type": "markdown", - "id": "76de9539-710c-49f6-9e9e-238849382c33", - "metadata": { - "user_expressions": [] - }, - "source": [ - "Method 2: using glob directly to create a list of filepaths" + "# pattern = 'ATL06-{datetime:%Y%m%d%H%M%S}-Sample.h5'\n", + "# pattern = 'ATL{product:2}-{datetime:%Y%m%d%H%M%S}-Sample.h5'" ] }, { "cell_type": "code", "execution_count": null, - "id": "be79b0dd-efcf-4d50-bdb0-8e3ae8e8e38c", + "id": "f43e8664", "metadata": {}, "outputs": [], "source": [ - "import glob" + "# pattern = \"ATL{product:2}_{datetime:%Y%m%d%H%M%S}_{rgt:4}{cycle:2}{orbitsegment:2}_{version:3}_{revision:2}.h5\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "5d088571-496d-479a-9fb7-833ed7e98676", - "metadata": { - "tags": [] - }, + "id": "992a77fb", + "metadata": {}, "outputs": [], "source": [ - "list_of_files = glob.glob('/path/to/**/folder', recursive=True)\n", - "ipx.Read(list_of_files)" + "# grid_pattern = \"ATL{product:2}_GL_0311_{res:3}m_{version:3}_{revision:2}.nc\"" ] }, { - "cell_type": "markdown", - "id": "08df2874-7c54-4670-8f37-9135ea296ff5", - "metadata": { - "user_expressions": [] - }, + "cell_type": "code", + "execution_count": null, + "id": "6aec1a70", + "metadata": {}, + "outputs": [], "source": [ - "```{admonition} Read Module Update\n", - "Previously, icepyx required two additional conditions: 1) a `product` argument and 2) that your files either matched the default `filename_pattern` or that the user provided their own `filename_pattern`. These two requirements have been removed. `product` is now read directly from the file metadata (the root group's `short_name` attribute). Flexibility to specify multiple files via the `filename_pattern` has been replaced with the [glob string](https://docs.python.org/3/library/glob.html) feature, and by allowing a list of filepaths as an argument.\n", - "\n", - "The `product` and `filename_pattern` arguments have been maintained for backwards compatibility, but will be fully removed in icepyx version 1.0.0.\n", - "```" + "pattern = \"processed_ATL{product:2}_{datetime:%Y%m%d%H%M%S}_{rgt:4}{cycle:2}{orbitsegment:2}_{version:3}_{revision:2}.h5\"" ] }, { @@ -341,9 +280,14 @@ "user_expressions": [] }, "source": [ - "### Step 2: Create an icepyx read object\n", + "### Step 3: Create an icepyx read object\n", "\n", - "Using the `data_source` described in Step 1, we can create our Read object." + "The `Read` object has two required inputs:\n", + "- `path` = a string with the full file path or full directory path to your hdf5 (.h5) format files.\n", + "- `product` = the data product you're working with, also known as the \"short name\".\n", + "\n", + "The `Read` object also accepts the optional keyword input:\n", + "- `pattern` = a formatted string indicating the filename pattern required for Intake's path_as_pattern argument." ] }, { @@ -355,17 +299,7 @@ }, "outputs": [], "source": [ - "reader = ipx.Read(data_source=path_root)" - ] - }, - { - "cell_type": "markdown", - "id": "7b2acfdb-75eb-4c64-b583-2ab19326aaee", - "metadata": { - "user_expressions": [] - }, - "source": [ - "The Read object now contains the list of matching files that will eventually be loaded into Python. You can inspect its properties, such as the files that were located or the identified product, directly on the Read object." + "reader = ipx.Read(data_source=path_root, product=\"ATL06\", filename_pattern=pattern) # or ipx.Read(filepath, \"ATLXX\") if your filenames match the default pattern" ] }, { @@ -375,17 +309,7 @@ "metadata": {}, "outputs": [], "source": [ - "reader.filelist" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7455ee3f-f9ab-486e-b4c7-2fa2314d4084", - "metadata": {}, - "outputs": [], - "source": [ - "reader.product" + "reader._filelist" ] }, { @@ -395,7 +319,7 @@ "user_expressions": [] }, "source": [ - "### Step 3: Specify variables to be read in\n", + "### Step 4: Specify variables to be read in\n", "\n", "To load your data into memory or prepare it for analysis, icepyx needs to know which variables you'd like to read in.\n", "If you've used icepyx to download data from NSIDC with variable subsetting (which is the default), then you may already be familiar with the icepyx `Variables` module and how to create and modify lists of variables.\n", @@ -502,7 +426,7 @@ "user_expressions": [] }, "source": [ - "### Step 4: Loading your data\n", + "### Step 5: Loading your data\n", "\n", "Now that you've set up all the options, you're ready to read your ICESat-2 data into memory!" ] @@ -617,9 +541,9 @@ ], "metadata": { "kernelspec": { - "display_name": "icepyx-dev", + "display_name": "general", "language": "python", - "name": "icepyx-dev" + "name": "general" }, "language_info": { "codemirror_mode": { diff --git a/doc/source/user_guide/documentation/classes_dev_uml.svg b/doc/source/user_guide/documentation/classes_dev_uml.svg index 0cd08c9e9..34e13b41c 100644 --- a/doc/source/user_guide/documentation/classes_dev_uml.svg +++ b/doc/source/user_guide/documentation/classes_dev_uml.svg @@ -4,11 +4,11 @@ - + classes_dev_uml - + icepyx.core.auth.AuthenticationError @@ -139,38 +139,38 @@ icepyx.core.icesat2data.Icesat2Data - -Icesat2Data - - -__init__() + +Icesat2Data + + +__init__() icepyx.core.exceptions.NsidcQueryError - -NsidcQueryError - -errmsg -msgtxt : str - -__init__(errmsg, msgtxt) -__str__() + +NsidcQueryError + +errmsg +msgtxt : str + +__init__(errmsg, msgtxt) +__str__() icepyx.core.exceptions.QueryError - -QueryError - - - + +QueryError + + + icepyx.core.exceptions.NsidcQueryError->icepyx.core.exceptions.QueryError - - + + @@ -235,24 +235,24 @@ icepyx.core.read.Read - -Read - -_filelist : NoneType, list -_out_obj : Dataset -_product : NoneType, str -_read_vars -filelist -product -vars - -__init__(data_source, product, filename_pattern, catalog, glob_kwargs, out_obj_type) -_add_vars_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict) -_build_dataset_template(file) -_build_single_file_dataset(file, groups_list) -_check_source_for_pattern(source, filename_pattern) -_combine_nested_vars(is2ds, ds, grp_path, wanted_dict) -_extract_product(filepath) + +Read + +_filelist : NoneType, list +_out_obj : Dataset +_pattern : str +_prod : str +_read_vars +_source_type : str +data_source +vars + +__init__(data_source, product, filename_pattern, catalog, out_obj_type) +_add_vars_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict) +_build_dataset_template(file) +_build_single_file_dataset(file, groups_list) +_check_source_for_pattern(source, filename_pattern) +_combine_nested_vars(is2ds, ds, grp_path, wanted_dict) _read_single_grp(file, grp_path) load() @@ -366,30 +366,30 @@ icepyx.core.variables.Variables->icepyx.core.read.Read - - -_read_vars + + +_read_vars icepyx.core.visualization.Visualize - -Visualize - -bbox : list -cycles : NoneType -date_range : NoneType -product : NoneType, str -tracks : NoneType - -__init__(query_obj, product, spatial_extent, date_range, cycles, tracks) -generate_OA_parameters(): list -grid_bbox(binsize): list -make_request(base_url, payload) -parallel_request_OA(): da.array -query_icesat2_filelist(): tuple -request_OA_data(paras): da.array -viz_elevation(): (hv.DynamicMap, hv.Layout) + +Visualize + +bbox : list +cycles : NoneType +date_range : NoneType +product : NoneType, str +tracks : NoneType + +__init__(query_obj, product, spatial_extent, date_range, cycles, tracks) +generate_OA_parameters(): list +grid_bbox(binsize): list +make_request(base_url, payload) +parallel_request_OA(): da.array +query_icesat2_filelist(): tuple +request_OA_data(paras): da.array +viz_elevation(): (hv.DynamicMap, hv.Layout) diff --git a/doc/source/user_guide/documentation/classes_user_uml.svg b/doc/source/user_guide/documentation/classes_user_uml.svg index a9c116469..640f76815 100644 --- a/doc/source/user_guide/documentation/classes_user_uml.svg +++ b/doc/source/user_guide/documentation/classes_user_uml.svg @@ -201,14 +201,13 @@ icepyx.core.read.Read - -Read - -filelist -product -vars - -load() + +Read + +data_source +vars + +load() @@ -301,9 +300,9 @@ icepyx.core.variables.Variables->icepyx.core.read.Read - - -_read_vars + + +_read_vars diff --git a/doc/source/user_guide/documentation/read.rst b/doc/source/user_guide/documentation/read.rst index 68da03b1d..a5beedf4e 100644 --- a/doc/source/user_guide/documentation/read.rst +++ b/doc/source/user_guide/documentation/read.rst @@ -19,8 +19,6 @@ Attributes .. autosummary:: :toctree: ../../_icepyx/ - Read.filelist - Read.product Read.vars diff --git a/icepyx/core/is2ref.py b/icepyx/core/is2ref.py index 5faaef110..a3a0311bb 100644 --- a/icepyx/core/is2ref.py +++ b/icepyx/core/is2ref.py @@ -15,7 +15,6 @@ def _validate_product(product): """ Confirm a valid ICESat-2 product was specified """ - error_msg = "A valid product string was not provided. Check user input, if given, or file metadata." if isinstance(product, str): product = str.upper(product) assert product in [ @@ -41,9 +40,9 @@ def _validate_product(product): "ATL20", "ATL21", "ATL23", - ], error_msg + ], "Please enter a valid product" else: - raise TypeError(error_msg) + raise TypeError("Please enter a product string") return product diff --git a/icepyx/core/read.py b/icepyx/core/read.py index 5ef1867f2..627395be2 100644 --- a/icepyx/core/read.py +++ b/icepyx/core/read.py @@ -1,9 +1,7 @@ import fnmatch -import glob import os import warnings -import h5py import numpy as np import xarray as xr @@ -12,6 +10,8 @@ from icepyx.core.variables import Variables as Variables from icepyx.core.variables import list_of_dict_vals +# from icepyx.core.query import Query + def _make_np_datetime(df, keyword): """ @@ -266,28 +266,24 @@ class Read: Parameters ---------- - data_source : string, List - A string or list which specifies the files to be read. The string can be either: 1) the path of a single file 2) the path to a directory or 3) a [glob string](https://docs.python.org/3/library/glob.html). - The List must be a list of strings, each of which is the path of a single file. + data_source : string + A string with a full file path or full directory path to ICESat-2 hdf5 (.h5) format files. + Files within a directory must have a consistent filename pattern that includes the "ATL??" data product name. + Files must all be within a single directory. product : string ICESat-2 data product ID, also known as "short name" (e.g. ATL03). Available data products can be found at: https://nsidc.org/data/icesat-2/data-sets - **Deprecation warning:** This argument is no longer required and will be deprecated in version 1.0.0. The dataset product is read from the file metadata. - filename_pattern : string, default None - String that shows the filename pattern as previously required for Intake's path_as_pattern argument. + filename_pattern : string, default 'ATL{product:2}_{datetime:%Y%m%d%H%M%S}_{rgt:4}{cycle:2}{orbitsegment:2}_{version:3}_{revision:2}.h5' + String that shows the filename pattern as required for Intake's path_as_pattern argument. The default describes files downloaded directly from NSIDC (subsetted and non-subsetted) for most products (e.g. ATL06). The ATL11 filename pattern from NSIDC is: 'ATL{product:2}_{rgt:4}{orbitsegment:2}_{cycles:4}_{version:3}_{revision:2}.h5'. - **Deprecation warning:** This argument is no longer required and will be deprecated in version 1.0.0. catalog : string, default None Full path to an Intake catalog for reading in data. If you still need to create a catalog, leave as default. - **Deprecation warning:** This argument has been deprecated. Please use the data_source argument to pass in valid data. - - glob_kwargs : dict, default {} - Additional arguments to be passed into the [glob.glob()](https://docs.python.org/3/library/glob.html#glob.glob)function + **Deprecation warning:** This argument has been depreciated. Please use the data_source argument to pass in valid data. out_obj_type : object, default xarray.Dataset The desired format for the data to be read in. @@ -300,21 +296,6 @@ class Read: Examples -------- - Reading a single file - >>> ipx.Read('/path/to/data/processed_ATL06_20190226005526_09100205_006_02.h5') # doctest: +SKIP - - Reading all files in a directory - >>> ipx.Read('/path/to/data/') # doctest: +SKIP - - Reading files that match a particular pattern (here, all .h5 files that start with `processed_ATL06_`). - >>> ipx.Read('/path/to/data/processed_ATL06_*.h5') # doctest: +SKIP - - Reading a specific list of files - >>> list_of_files = [ - ... '/path/to/data/processed_ATL06_20190226005526_09100205_006_02.h5', - ... '/path/to/more/data/processed_ATL06_20191202102922_10160505_006_01.h5', - ... ] - >>> ipx.Read(list_of_files) # doctest: +SKIP """ @@ -325,12 +306,11 @@ def __init__( self, data_source=None, product=None, - filename_pattern=None, + filename_pattern="ATL{product:2}_{datetime:%Y%m%d%H%M%S}_{rgt:4}{cycle:2}{orbitsegment:2}_{version:3}_{revision:2}.h5", catalog=None, - glob_kwargs={}, out_obj_type=None, # xr.Dataset, ): - # Raise error for deprecated argument + # Raise error for depreciated argument if catalog: raise DeprecationError( "The `catalog` argument has been deprecated and intake is no longer supported. " @@ -338,93 +318,43 @@ def __init__( ) if data_source is None: - raise ValueError("data_source is a required arguemnt") + raise ValueError("Please provide a data source.") + else: + self._source_type = _check_datasource(data_source) + self.data_source = data_source - # Raise warnings for deprecated arguments - if filename_pattern: - warnings.warn( - "The `filename_pattern` argument is deprecated. Instead please provide a " - "string, list, or glob string to the `data_source` argument.", - stacklevel=2, + if product is None: + raise ValueError( + "Please provide the ICESat-2 data product of your file(s)." ) - - if product: - product = is2ref._validate_product(product) + else: + self._prod = is2ref._validate_product(product) + pattern_ck, filelist = Read._check_source_for_pattern( + data_source, filename_pattern + ) + assert pattern_ck + # Note: need to check if this works for subset and non-subset NSIDC files (processed_ prepends the former) + self._pattern = filename_pattern + + # this is a first pass at getting rid of mixed product types and warning the user. + # it takes an approach assuming the product name is in the filename, but needs reworking if we let multiple products be loaded + # one way to handle this would be bring in the product info during the loading step and fill in product there instead of requiring it from the user + filtered_filelist = [file for file in filelist if self._prod in file] + if len(filtered_filelist) == 0: warnings.warn( - "The `product` argument is no longer required. If the `data_source` argument given " - "contains files with multiple products the `product` argument will be used " - "to filter that list. In all other cases the product argument is ignored. " - "The recommended approach is to not include a `product` argument and instead " - "provide a `data_source` with files of only a single product type`.", - stacklevel=2, + "Your filenames do not contain a product identifier (e.g. ATL06). " + "You will likely need to manually merge your dataframes." ) - - # Create the filelist from the `data_source` argument - if filename_pattern: - # maintained for backward compatibility - pattern_ck, filelist = Read._check_source_for_pattern( - data_source, filename_pattern - ) - assert pattern_ck self._filelist = filelist - elif isinstance(data_source, list): - self._filelist = data_source - elif os.path.isdir(data_source): - data_source = os.path.join(data_source, "*") - self._filelist = glob.glob(data_source, **glob_kwargs) - else: - self._filelist = glob.glob(data_source, **glob_kwargs) - # Remove any directories from the list - self._filelist = [f for f in self._filelist if not os.path.isdir(f)] - - # Create a dictionary of the products as read from the metadata - product_dict = {} - for file_ in self._filelist: - product_dict[file_] = self._extract_product(file_) - - # Raise warnings or errors for multiple products or products not matching the user-specified product - all_products = list(set(product_dict.values())) - if len(all_products) > 1: - if product: - warnings.warn( - f"Multiple products found in list of files: {product_dict}. Files that " - "do not match the user specified product will be removed from processing.\n" - "Filtering files using a `product` argument is deprecated. Please use the " - "`data_source` argument to specify a list of files with the same product.", - stacklevel=2, - ) - self._filelist = [] - for key, value in product_dict.items(): - if value == product: - self._filelist.append(key) - if len(self._filelist) == 0: - raise TypeError( - "No files found in the file list matching the user-specified " - "product type" - ) - # Use the cleaned filelist to assign a product - self._product = product - else: - raise TypeError( - f"Multiple product types were found in the file list: {product_dict}." - "Please provide a valid `data_source` parameter indicating files of a single " - "product" - ) - elif len(all_products) == 0: - raise TypeError( - "No files found matching the specified `data_source`. Check your glob " - "string or file list." - ) - else: - # Assign the identified product to the property - self._product = all_products[0] - # Raise a warning if the metadata-located product differs from the user-specified product - if product and self._product != product: + elif len(filtered_filelist) < len(filelist): warnings.warn( - f"User specified product {product} does not match the product from the file" - " metadata {self._product}", - stacklevel=2, + "Some files matching your filename pattern were removed as they were not the specified product." ) + self._filelist = filtered_filelist + else: + self._filelist = filelist + + # after validation, use the notebook code and code outline to start implementing the rest of the class if out_obj_type is not None: print( @@ -457,43 +387,14 @@ def vars(self): if not hasattr(self, "_read_vars"): self._read_vars = Variables( - "file", path=self.filelist[0], product=self.product + "file", path=self._filelist[0], product=self._prod ) return self._read_vars - @property - def filelist(self): - """ - Return the list of files represented by this Read object. - """ - return self._filelist - - @property - def product(self): - """ - Return the product associated with the Read object. - """ - return self._product - # ---------------------------------------------------------------------- # Methods - @staticmethod - def _extract_product(filepath): - """ - Read the product type from the metadata of the file. Return the product as a string. - """ - with h5py.File(filepath, "r") as f: - try: - product = f.attrs["short_name"].decode() - product = is2ref._validate_product(product) - except KeyError: - raise AttributeError( - f"Unable to extract the product name from file metadata." - ) - return product - @staticmethod def _check_source_for_pattern(source, filename_pattern): """ @@ -750,7 +651,7 @@ def load(self): # However, this led to errors when I tried to combine two identical datasets because the single dimension was equal. # In these situations, xarray recommends manually controlling the merge/concat process yourself. # While unlikely to be a broad issue, I've heard of multiple matching timestamps causing issues for combining multiple IS2 datasets. - for file in self.filelist: + for file in self._filelist: all_dss.append( self._build_single_file_dataset(file, groups_list) ) # wanted_groups, vgrp.keys())) @@ -785,7 +686,7 @@ def _build_dataset_template(self, file): gran_idx=[np.uint64(999999)], source_file=(["gran_idx"], [file]), ), - attrs=dict(data_product=self.product), + attrs=dict(data_product=self._prod), ) return is2ds @@ -833,11 +734,20 @@ def _build_single_file_dataset(self, file, groups_list): ------- Xarray Dataset """ + file_product = self._read_single_grp(file, "/").attrs["identifier_product_type"] + assert ( + file_product == self._prod + ), "Your product specification does not match the product specification within your files." + # I think the below method might NOT read the file into memory as the above might? + # import h5py + # with h5py.File(filepath,'r') as h5pt: + # prod_id = h5pt.attrs["identifier_product_type"] + # DEVNOTE: if and elif does not actually apply wanted variable list, and has not been tested for merging multiple files into one ds # if a gridded product # TODO: all products need to be tested, and quicklook products added or explicitly excluded # Level 3b, gridded (netcdf): ATL14, 15, 16, 17, 18, 19, 20, 21 - if self.product in [ + if self._prod in [ "ATL14", "ATL15", "ATL16", @@ -851,7 +761,7 @@ def _build_single_file_dataset(self, file, groups_list): is2ds = xr.open_dataset(file) # Level 3b, hdf5: ATL11 - elif self.product in ["ATL11"]: + elif self._prod in ["ATL11"]: is2ds = self._build_dataset_template(file) # returns the wanted groups as a single list of full group path strings diff --git a/icepyx/tests/test_is2ref.py b/icepyx/tests/test_is2ref.py index fb8d16cad..b22709c98 100644 --- a/icepyx/tests/test_is2ref.py +++ b/icepyx/tests/test_is2ref.py @@ -8,14 +8,14 @@ def test_num_product(): dsnum = 6 - ermsg = "A valid product string was not provided. Check user input, if given, or file metadata." + ermsg = "Please enter a product string" with pytest.raises(TypeError, match=ermsg): is2ref._validate_product(dsnum) def test_bad_product(): wrngds = "atl-6" - ermsg = "A valid product string was not provided. Check user input, if given, or file metadata." + ermsg = "Please enter a valid product" with pytest.raises(AssertionError, match=ermsg): is2ref._validate_product(wrngds)