From d355ef5030c80faa50e4f440cddee052597dd53b Mon Sep 17 00:00:00 2001 From: Leif Denby Date: Tue, 12 Nov 2024 21:00:01 +0100 Subject: [PATCH 1/6] bugfix for earlier unstacking dim order fix in datastores --- neural_lam/datastore/base.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/neural_lam/datastore/base.py b/neural_lam/datastore/base.py index 25e7d01..418b3b3 100644 --- a/neural_lam/datastore/base.py +++ b/neural_lam/datastore/base.py @@ -466,19 +466,27 @@ def stack_grid_coords( da_or_ds_stacked = da_or_ds.stack(grid_index=self.CARTESIAN_COORDS) # find the feature dimension, which has named with the format # `{category}_feature` + + # ensure that grid_index is the first dimension, and the feature + # dimension is the second + dim_order = ["grid_index"] + potential_feature_dims = [ - d for d in da_or_ds_stacked.dims if d.endswith("_feature") + d for d in da_or_ds_stacked.dims if d.endswith("feature") ] - if not len(potential_feature_dims) == 1: + n_feature_dims = len(potential_feature_dims) + if n_feature_dims == 0: + pass + elif n_feature_dims == 1: + feature_dim = potential_feature_dims[0] + dim_order.append(feature_dim) + else: raise ValueError( "Expected exactly one feature dimension in the stacked data, " f"got {potential_feature_dims}" ) - feature_dim = potential_feature_dims[0] - # ensure that grid_index is the first dimension, and the feature - # dimension is the second - return da_or_ds_stacked.transpose("grid_index", feature_dim, ...) + return da_or_ds_stacked.transpose(*dim_order, ...) 
@property @functools.lru_cache From 1121d9f12aeb34ab5e1d32c87d008201c9ec0754 Mon Sep 17 00:00:00 2001 From: Leif Denby Date: Wed, 13 Nov 2024 13:36:04 +0100 Subject: [PATCH 2/6] add enforcement of datastores output dimension order --- neural_lam/datastore/base.py | 72 +++++++++++++++------- neural_lam/datastore/mdp.py | 8 +-- neural_lam/datastore/npyfilesmeps/store.py | 8 ++- tests/test_datastores.py | 2 +- 4 files changed, 61 insertions(+), 29 deletions(-) diff --git a/neural_lam/datastore/base.py b/neural_lam/datastore/base.py index 418b3b3..8784ff0 100644 --- a/neural_lam/datastore/base.py +++ b/neural_lam/datastore/base.py @@ -329,6 +329,39 @@ def state_feature_weights_values(self) -> List[float]: """ pass + @functools.lru_cache + def expected_dim_order(self, category: str) -> List[str]: + """ + Return the expected dimension order for the dataarray or dataset + returned by `get_dataarray` for the given category of data. The + dimension order is the order of the dimensions in the dataarray or + dataset, and is used to check that the data is in the expected format. + + This is necessary so that when stacking and unstacking the spatial grid + we can ensure that the dimension order is the same as what is returned + from `get_dataarray`. And also ensures that downstream uses of a + datastore (e.g. WeatherDataset) sees the data in a common structure. + + Parameters + ---------- + category : str + The category of the dataset (state/forcing/static). + + Returns + ------- + List[str] + The expected dimension order for the dataarray or dataset. 
+ + """ + dim_order = ["grid_index", f"{category}_feature"] + if self.is_forecast: + dim_order.extend(["analysis_time", "elapsed_forecast_duration"]) + elif not self.is_forecast: + dim_order.append("time") + if self.is_ensemble: + dim_order.append("ensemble_member") + return dim_order + @dataclasses.dataclass class CartesianGridShape: @@ -464,29 +497,22 @@ def stack_grid_coords( return da_or_ds da_or_ds_stacked = da_or_ds.stack(grid_index=self.CARTESIAN_COORDS) - # find the feature dimension, which has named with the format - # `{category}_feature` - - # ensure that grid_index is the first dimension, and the feature - # dimension is the second - dim_order = ["grid_index"] - - potential_feature_dims = [ - d for d in da_or_ds_stacked.dims if d.endswith("feature") - ] - n_feature_dims = len(potential_feature_dims) - if n_feature_dims == 0: - pass - elif n_feature_dims == 1: - feature_dim = potential_feature_dims[0] - dim_order.append(feature_dim) - else: - raise ValueError( - "Expected exactly one feature dimension in the stacked data, " - f"got {potential_feature_dims}" - ) - - return da_or_ds_stacked.transpose(*dim_order, ...) + + # infer what category of data by finding the dimension named in the + # format `{category}_feature` + category = None + for dim in da_or_ds_stacked.dims: + if dim.endswith("_feature"): + if category is not None: + raise ValueError( + "Multiple dimensions ending with '_feature' found in " + f"dataarray: {da_or_ds_stacked}. Cannot infer category." 
+ ) + category = dim.split("_")[0] + + dim_order = self.expected_dim_order(category=category) + + return da_or_ds_stacked.transpose(dim_order) @property @functools.lru_cache diff --git a/neural_lam/datastore/mdp.py b/neural_lam/datastore/mdp.py index 2e438af..f698ddf 100644 --- a/neural_lam/datastore/mdp.py +++ b/neural_lam/datastore/mdp.py @@ -266,9 +266,7 @@ def get_dataarray(self, category: str, split: str) -> xr.DataArray: # set multi-index for grid-index da_category = da_category.set_index(grid_index=self.CARTESIAN_COORDS) - if "time" not in da_category.dims: - return da_category - else: + if "time" in da_category.dims: t_start = ( self._ds.splits.sel(split_name=split) .sel(split_part="start") @@ -281,7 +279,9 @@ def get_dataarray(self, category: str, split: str) -> xr.DataArray: .load() .item() ) - return da_category.sel(time=slice(t_start, t_end)) + da_category = da_category.sel(time=slice(t_start, t_end)) + + return da_category.transpose(self.expected_dim_order(category=category)) def get_standardization_dataarray(self, category: str) -> xr.Dataset: """ diff --git a/neural_lam/datastore/npyfilesmeps/store.py b/neural_lam/datastore/npyfilesmeps/store.py index ffa70dc..37693ea 100644 --- a/neural_lam/datastore/npyfilesmeps/store.py +++ b/neural_lam/datastore/npyfilesmeps/store.py @@ -299,6 +299,8 @@ def get_dataarray(self, category: str, split: str) -> DataArray: f"Expected features {expected_features}, got {actual_features}" ) + da = da.transpose(self.expected_dim_order(category=category)) + return da def _get_single_timeseries_dataarray( @@ -346,7 +348,11 @@ def _get_single_timeseries_dataarray( None, ), "Unknown dataset split" else: - assert split in ("train", "val", "test"), "Unknown dataset split" + assert split in ( + "train", + "val", + "test", + ), f"Unknown dataset split {split} for features {features}" if member is not None and features != self.get_vars_names( category="state" diff --git a/tests/test_datastores.py b/tests/test_datastores.py index 
c28418c..096efcb 100644 --- a/tests/test_datastores.py +++ b/tests/test_datastores.py @@ -319,7 +319,7 @@ def test_stacking_grid_coords(datastore_name, category): if not isinstance(datastore, BaseRegularGridDatastore): pytest.skip("Datastore does not implement `BaseCartesianDatastore`") - da_static = datastore.get_dataarray(category=category, split=None) + da_static = datastore.get_dataarray(category=category, split="train") da_static_unstacked = datastore.unstack_grid_coords(da_static).load() da_static_test = datastore.stack_grid_coords(da_static_unstacked) From 9afaf6e9a73c2cc814064814b62e58532653d27b Mon Sep 17 00:00:00 2001 From: Leif Denby Date: Wed, 13 Nov 2024 14:14:24 +0100 Subject: [PATCH 3/6] fix bugs introduced with dimension order during stack/unstack --- neural_lam/datastore/base.py | 36 +++++++++++++++------- neural_lam/datastore/mdp.py | 3 +- neural_lam/datastore/npyfilesmeps/store.py | 13 ++++---- tests/test_config.py | 2 +- 4 files changed, 34 insertions(+), 20 deletions(-) diff --git a/neural_lam/datastore/base.py b/neural_lam/datastore/base.py index 8784ff0..f24d7b3 100644 --- a/neural_lam/datastore/base.py +++ b/neural_lam/datastore/base.py @@ -330,7 +330,7 @@ def state_feature_weights_values(self) -> List[float]: pass @functools.lru_cache - def expected_dim_order(self, category: str) -> List[str]: + def expected_dim_order(self, category: str = None) -> List[str]: """ Return the expected dimension order for the dataarray or dataset returned by `get_dataarray` for the given category of data. The dimension order is the order of the dimensions in the dataarray or dataset, and is used to check that the data is in the expected format. This is necessary so that when stacking and unstacking the spatial grid we can ensure that the dimension order is the same as what is returned from `get_dataarray`. And also ensures that downstream uses of a datastore (e.g. WeatherDataset) sees the data in a common structure. + If the category is None, then it is assumed that the data only + represents a 1D scalar field varying with grid-index. 
+ Parameters ---------- category : str @@ -353,13 +356,24 @@ def expected_dim_order(self, category: str) -> List[str]: The expected dimension order for the dataarray or dataset. """ - dim_order = ["grid_index", f"{category}_feature"] - if self.is_forecast: - dim_order.extend(["analysis_time", "elapsed_forecast_duration"]) - elif not self.is_forecast: - dim_order.append("time") - if self.is_ensemble: - dim_order.append("ensemble_member") + dim_order = ["grid_index"] + + if category is not None: + dim_order.append(f"{category}_feature") + + if category != "static": + # static data does not vary in time + if self.is_forecast: + dim_order.extend( + ["analysis_time", "elapsed_forecast_duration"] + ) + elif not self.is_forecast: + dim_order.append("time") + + if self.is_ensemble and category == "state": + # XXX: for now we only assume ensemble data for state variables + dim_order.append("ensemble_member") + return dim_order @@ -498,8 +512,8 @@ def stack_grid_coords( da_or_ds_stacked = da_or_ds.stack(grid_index=self.CARTESIAN_COORDS) - # infer what category of data by finding the dimension named in the - # format `{category}_feature` + # infer what category of data the array represents by finding the + # dimension named in the format `{category}_feature` category = None for dim in da_or_ds_stacked.dims: if dim.endswith("_feature"): @@ -512,7 +526,7 @@ def stack_grid_coords( dim_order = self.expected_dim_order(category=category) - return da_or_ds_stacked.transpose(dim_order) + return da_or_ds_stacked.transpose(*dim_order) @property @functools.lru_cache diff --git a/neural_lam/datastore/mdp.py b/neural_lam/datastore/mdp.py index f698ddf..df50771 100644 --- a/neural_lam/datastore/mdp.py +++ b/neural_lam/datastore/mdp.py @@ -281,7 +281,8 @@ def get_dataarray(self, category: str, split: str) -> xr.DataArray: ) da_category = da_category.sel(time=slice(t_start, t_end)) - return da_category.transpose(self.expected_dim_order(category=category)) + dim_order = 
self.expected_dim_order(category=category) + return da_category.transpose(*dim_order) def get_standardization_dataarray(self, category: str) -> xr.Dataset: """ diff --git a/neural_lam/datastore/npyfilesmeps/store.py b/neural_lam/datastore/npyfilesmeps/store.py index 37693ea..5fd2d63 100644 --- a/neural_lam/datastore/npyfilesmeps/store.py +++ b/neural_lam/datastore/npyfilesmeps/store.py @@ -282,15 +282,16 @@ def get_dataarray(self, category: str, split: str) -> DataArray: features=features, split=split ) das.append(da) - da = xr.concat(das, dim="feature").transpose( - "grid_index", "feature" - ) + da = xr.concat(das, dim="feature") else: raise NotImplementedError(category) da = da.rename(dict(feature=f"{category}_feature")) + # stack the [x, y] dimensions into a `grid_index` dimension + da = self.stack_grid_coords(da) + # check that we have the right features actual_features = da[f"{category}_feature"].values.tolist() expected_features = self.get_vars_names(category=category) @@ -299,7 +300,8 @@ def get_dataarray(self, category: str, split: str) -> DataArray: f"Expected features {expected_features}, got {actual_features}" ) - da = da.transpose(self.expected_dim_order(category=category)) + dim_order = self.expected_dim_order(category=category) + da = da.transpose(*dim_order) return da @@ -501,9 +503,6 @@ def _get_single_timeseries_dataarray( da = xr.DataArray(arr_all, dims=dims, coords=coords) - # stack the [x, y] dimensions into a `grid_index` dimension - da = self.stack_grid_coords(da) - return da def _get_analysis_times(self, split) -> List[np.datetime64]: diff --git a/tests/test_config.py b/tests/test_config.py index 4bb7c1c..1ff40bc 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -46,7 +46,7 @@ def test_config_serialization(state_weighting_config): training: state_feature_weighting: __config_class__: ManualStateFeatureWeighting - values: + weights: u100m: 1.0 v100m: 1.0 """ From 3df627f269f3c34b2958a56c549eb7b382f32183 Mon Sep 17 00:00:00 2001 
From: Leif Denby Date: Wed, 13 Nov 2024 16:09:04 +0100 Subject: [PATCH 4/6] update meps test to point to new dataset on aws --- tests/conftest.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3dfac91..658f4c0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,11 +23,8 @@ # Initializing variables for the s3 client S3_BUCKET_NAME = "mllam-testdata" -# S3_ENDPOINT_URL = "https://object-store.os-api.cci1.ecmwf.int" -S3_ENDPOINT_URL = "http://localhost:8000" -# S3_FILE_PATH = "neural-lam/npy/meps_example_reduced.v0.1.0.zip" -# TODO: I will upload this to AWS S3 once I have located the credentials... -S3_FILE_PATH = "meps_example_reduced.v0.2.0.zip" +S3_ENDPOINT_URL = "https://object-store.os-api.cci1.ecmwf.int" +S3_FILE_PATH = "neural-lam/npy/meps_example_reduced.v0.2.0.zip" S3_FULL_PATH = "/".join([S3_ENDPOINT_URL, S3_BUCKET_NAME, S3_FILE_PATH]) TEST_DATA_KNOWN_HASH = ( "7ff2e07e04cfcd77631115f800c9d49188bb2a7c2a2777da3cea219f926d0c86" From 89fac82420cbb894bbd00719b09055b1043c8b23 Mon Sep 17 00:00:00 2001 From: Leif Denby Date: Wed, 13 Nov 2024 16:09:31 +0100 Subject: [PATCH 5/6] remove unused print statement --- tests/test_datastores.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_datastores.py b/tests/test_datastores.py index 096efcb..4a4b110 100644 --- a/tests/test_datastores.py +++ b/tests/test_datastores.py @@ -339,7 +339,6 @@ def test_dataarray_shapes(datastore_name): unstacked_tensor = torch.tensor( datastore.unstack_grid_coords(static_da).to_numpy(), dtype=torch.float32 ).squeeze() - print(static_da) reshaped_tensor = ( torch.tensor(static_da.to_numpy(), dtype=torch.float32) From a95eb5a66f3d6fbf9afc992beea24fd8a307f1d3 Mon Sep 17 00:00:00 2001 From: Leif Denby Date: Wed, 13 Nov 2024 16:16:34 +0100 Subject: [PATCH 6/6] fix config-path arg bug in CLIs --- neural_lam/create_graph.py | 4 +++- neural_lam/train_model.py | 4 +++- 2 files changed, 6 insertions(+), 2 
deletions(-) diff --git a/neural_lam/create_graph.py b/neural_lam/create_graph.py index 854818b..ef979be 100644 --- a/neural_lam/create_graph.py +++ b/neural_lam/create_graph.py @@ -590,7 +590,9 @@ def cli(input_args=None): ) args = parser.parse_args(input_args) - assert args.config is not None, "Specify your config with --config_path" + assert ( + args.config_path is not None + ), "Specify your config with --config_path" # Load neural-lam configuration and datastore to use _, datastore = load_config_and_datastore(config_path=args.config_path) diff --git a/neural_lam/train_model.py b/neural_lam/train_model.py index 8f400b3..183c230 100644 --- a/neural_lam/train_model.py +++ b/neural_lam/train_model.py @@ -209,7 +209,9 @@ def main(input_args=None): } # Asserts for arguments - assert args.config is not None, "Specify your config with --config_path" + assert ( + args.config_path is not None + ), "Specify your config with --config_path" assert args.model in MODELS, f"Unknown model: {args.model}" assert args.eval in ( None,