From d355ef5030c80faa50e4f440cddee052597dd53b Mon Sep 17 00:00:00 2001 From: Leif Denby Date: Tue, 12 Nov 2024 21:00:01 +0100 Subject: [PATCH 1/6] bugfix for earlier unstacking dim order fix in datastores --- neural_lam/datastore/base.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/neural_lam/datastore/base.py b/neural_lam/datastore/base.py index 25e7d01..418b3b3 100644 --- a/neural_lam/datastore/base.py +++ b/neural_lam/datastore/base.py @@ -466,19 +466,27 @@ def stack_grid_coords( da_or_ds_stacked = da_or_ds.stack(grid_index=self.CARTESIAN_COORDS) # find the feature dimension, which has named with the format # `{category}_feature` + + # ensure that grid_index is the first dimension, and the feature + # dimension is the second + dim_order = ["grid_index"] + potential_feature_dims = [ - d for d in da_or_ds_stacked.dims if d.endswith("_feature") + d for d in da_or_ds_stacked.dims if d.endswith("feature") ] - if not len(potential_feature_dims) == 1: + n_feature_dims = len(potential_feature_dims) + if n_feature_dims == 0: + pass + elif n_feature_dims == 1: + feature_dim = potential_feature_dims[0] + dim_order.append(feature_dim) + else: raise ValueError( "Expected exactly one feature dimension in the stacked data, " f"got {potential_feature_dims}" ) - feature_dim = potential_feature_dims[0] - # ensure that grid_index is the first dimension, and the feature - # dimension is the second - return da_or_ds_stacked.transpose("grid_index", feature_dim, ...) + return da_or_ds_stacked.transpose(*dim_order, ...) 
@property @functools.lru_cache From 1121d9f12aeb34ab5e1d32c87d008201c9ec0754 Mon Sep 17 00:00:00 2001 From: Leif Denby Date: Wed, 13 Nov 2024 13:36:04 +0100 Subject: [PATCH 2/6] add enforcement of datastores output dimension order --- neural_lam/datastore/base.py | 72 +++++++++++++++------- neural_lam/datastore/mdp.py | 8 +-- neural_lam/datastore/npyfilesmeps/store.py | 8 ++- tests/test_datastores.py | 2 +- 4 files changed, 61 insertions(+), 29 deletions(-) diff --git a/neural_lam/datastore/base.py b/neural_lam/datastore/base.py index 418b3b3..8784ff0 100644 --- a/neural_lam/datastore/base.py +++ b/neural_lam/datastore/base.py @@ -329,6 +329,39 @@ def state_feature_weights_values(self) -> List[float]: """ pass + @functools.lru_cache + def expected_dim_order(self, category: str) -> List[str]: + """ + Return the expected dimension order for the dataarray or dataset + returned by `get_dataarray` for the given category of data. The + dimension order is the order of the dimensions in the dataarray or + dataset, and is used to check that the data is in the expected format. + + This is necessary so that when stacking and unstacking the spatial grid + we can ensure that the dimension order is the same as what is returned + from `get_dataarray`. And also ensures that downstream uses of a + datastore (e.g. WeatherDataset) sees the data in a common structure. + + Parameters + ---------- + category : str + The category of the dataset (state/forcing/static). + + Returns + ------- + List[str] + The expected dimension order for the dataarray or dataset. 
+ + """ + dim_order = ["grid_index", f"{category}_feature"] + if self.is_forecast: + dim_order.extend(["analysis_time", "elapsed_forecast_duration"]) + elif not self.is_forecast: + dim_order.append("time") + if self.is_ensemble: + dim_order.append("ensemble_member") + return dim_order + @dataclasses.dataclass class CartesianGridShape: @@ -464,29 +497,22 @@ def stack_grid_coords( return da_or_ds da_or_ds_stacked = da_or_ds.stack(grid_index=self.CARTESIAN_COORDS) - # find the feature dimension, which has named with the format - # `{category}_feature` - - # ensure that grid_index is the first dimension, and the feature - # dimension is the second - dim_order = ["grid_index"] - - potential_feature_dims = [ - d for d in da_or_ds_stacked.dims if d.endswith("feature") - ] - n_feature_dims = len(potential_feature_dims) - if n_feature_dims == 0: - pass - elif n_feature_dims == 1: - feature_dim = potential_feature_dims[0] - dim_order.append(feature_dim) - else: - raise ValueError( - "Expected exactly one feature dimension in the stacked data, " - f"got {potential_feature_dims}" - ) - - return da_or_ds_stacked.transpose(*dim_order, ...) + + # infer what category of data by finding the dimension named in the + # format `{category}_feature` + category = None + for dim in da_or_ds_stacked.dims: + if dim.endswith("_feature"): + if category is not None: + raise ValueError( + "Multiple dimensions ending with '_feature' found in " + f"dataarray: {da_or_ds_stacked}. Cannot infer category." 
+ ) + category = dim.split("_")[0] + + dim_order = self.expected_dim_order(category=category) + + return da_or_ds_stacked.transpose(dim_order) @property @functools.lru_cache diff --git a/neural_lam/datastore/mdp.py b/neural_lam/datastore/mdp.py index 2e438af..f698ddf 100644 --- a/neural_lam/datastore/mdp.py +++ b/neural_lam/datastore/mdp.py @@ -266,9 +266,7 @@ def get_dataarray(self, category: str, split: str) -> xr.DataArray: # set multi-index for grid-index da_category = da_category.set_index(grid_index=self.CARTESIAN_COORDS) - if "time" not in da_category.dims: - return da_category - else: + if "time" in da_category.dims: t_start = ( self._ds.splits.sel(split_name=split) .sel(split_part="start") @@ -281,7 +279,9 @@ def get_dataarray(self, category: str, split: str) -> xr.DataArray: .load() .item() ) - return da_category.sel(time=slice(t_start, t_end)) + da_category = da_category.sel(time=slice(t_start, t_end)) + + return da_category.transpose(self.expected_dim_order(category=category)) def get_standardization_dataarray(self, category: str) -> xr.Dataset: """ diff --git a/neural_lam/datastore/npyfilesmeps/store.py b/neural_lam/datastore/npyfilesmeps/store.py index ffa70dc..37693ea 100644 --- a/neural_lam/datastore/npyfilesmeps/store.py +++ b/neural_lam/datastore/npyfilesmeps/store.py @@ -299,6 +299,8 @@ def get_dataarray(self, category: str, split: str) -> DataArray: f"Expected features {expected_features}, got {actual_features}" ) + da = da.transpose(self.expected_dim_order(category=category)) + return da def _get_single_timeseries_dataarray( @@ -346,7 +348,11 @@ def _get_single_timeseries_dataarray( None, ), "Unknown dataset split" else: - assert split in ("train", "val", "test"), "Unknown dataset split" + assert split in ( + "train", + "val", + "test", + ), f"Unknown dataset split {split} for features {features}" if member is not None and features != self.get_vars_names( category="state" diff --git a/tests/test_datastores.py b/tests/test_datastores.py index 
c28418c..096efcb 100644 --- a/tests/test_datastores.py +++ b/tests/test_datastores.py @@ -319,7 +319,7 @@ def test_stacking_grid_coords(datastore_name, category): if not isinstance(datastore, BaseRegularGridDatastore): pytest.skip("Datastore does not implement `BaseCartesianDatastore`") - da_static = datastore.get_dataarray(category=category, split=None) + da_static = datastore.get_dataarray(category=category, split="train") da_static_unstacked = datastore.unstack_grid_coords(da_static).load() da_static_test = datastore.stack_grid_coords(da_static_unstacked) From 9afaf6e9a73c2cc814064814b62e58532653d27b Mon Sep 17 00:00:00 2001 From: Leif Denby Date: Wed, 13 Nov 2024 14:14:24 +0100 Subject: [PATCH 3/6] fix bugs introduced with dimension order during stack/unstack --- neural_lam/datastore/base.py | 36 +++++++++++++++------- neural_lam/datastore/mdp.py | 3 +- neural_lam/datastore/npyfilesmeps/store.py | 13 ++++---- tests/test_config.py | 2 +- 4 files changed, 34 insertions(+), 20 deletions(-) diff --git a/neural_lam/datastore/base.py b/neural_lam/datastore/base.py index 8784ff0..f24d7b3 100644 --- a/neural_lam/datastore/base.py +++ b/neural_lam/datastore/base.py @@ -330,7 +330,7 @@ def state_feature_weights_values(self) -> List[float]: pass @functools.lru_cache - def expected_dim_order(self, category: str) -> List[str]: + def expected_dim_order(self, category: str = None) -> List[str]: """ Return the expected dimension order for the dataarray or dataset returned by `get_dataarray` for the given category of data. The dimension order is the order of the dimensions in the dataarray or dataset, and is used to check that the data is in the expected format. This is necessary so that when stacking and unstacking the spatial grid we can ensure that the dimension order is the same as what is returned from `get_dataarray`. And also ensures that downstream uses of a datastore (e.g. WeatherDataset) sees the data in a common structure. + If the category is None, then it is assumed that the data only + represents a 1D scalar field varying with grid-index. 
+ Parameters ---------- category : str @@ -353,13 +356,24 @@ def expected_dim_order(self, category: str) -> List[str]: The expected dimension order for the dataarray or dataset. """ - dim_order = ["grid_index", f"{category}_feature"] - if self.is_forecast: - dim_order.extend(["analysis_time", "elapsed_forecast_duration"]) - elif not self.is_forecast: - dim_order.append("time") - if self.is_ensemble: - dim_order.append("ensemble_member") + dim_order = ["grid_index"] + + if category is not None: + dim_order.append(f"{category}_feature") + + if category != "static": + # static data does not vary in time + if self.is_forecast: + dim_order.extend( + ["analysis_time", "elapsed_forecast_duration"] + ) + elif not self.is_forecast: + dim_order.append("time") + + if self.is_ensemble and category == "state": + # XXX: for now we only assume ensemble data for state variables + dim_order.append("ensemble_member") + return dim_order @@ -498,8 +512,8 @@ def stack_grid_coords( da_or_ds_stacked = da_or_ds.stack(grid_index=self.CARTESIAN_COORDS) - # infer what category of data by finding the dimension named in the - # format `{category}_feature` + # infer what category of data the array represents by finding the + # dimension named in the format `{category}_feature` category = None for dim in da_or_ds_stacked.dims: if dim.endswith("_feature"): @@ -512,7 +526,7 @@ def stack_grid_coords( dim_order = self.expected_dim_order(category=category) - return da_or_ds_stacked.transpose(dim_order) + return da_or_ds_stacked.transpose(*dim_order) @property @functools.lru_cache diff --git a/neural_lam/datastore/mdp.py b/neural_lam/datastore/mdp.py index f698ddf..df50771 100644 --- a/neural_lam/datastore/mdp.py +++ b/neural_lam/datastore/mdp.py @@ -281,7 +281,8 @@ def get_dataarray(self, category: str, split: str) -> xr.DataArray: ) da_category = da_category.sel(time=slice(t_start, t_end)) - return da_category.transpose(self.expected_dim_order(category=category)) + dim_order = 
self.expected_dim_order(category=category) + return da_category.transpose(*dim_order) def get_standardization_dataarray(self, category: str) -> xr.Dataset: """ diff --git a/neural_lam/datastore/npyfilesmeps/store.py b/neural_lam/datastore/npyfilesmeps/store.py index 37693ea..5fd2d63 100644 --- a/neural_lam/datastore/npyfilesmeps/store.py +++ b/neural_lam/datastore/npyfilesmeps/store.py @@ -282,15 +282,16 @@ def get_dataarray(self, category: str, split: str) -> DataArray: features=features, split=split ) das.append(da) - da = xr.concat(das, dim="feature").transpose( - "grid_index", "feature" - ) + da = xr.concat(das, dim="feature") else: raise NotImplementedError(category) da = da.rename(dict(feature=f"{category}_feature")) + # stack the [x, y] dimensions into a `grid_index` dimension + da = self.stack_grid_coords(da) + # check that we have the right features actual_features = da[f"{category}_feature"].values.tolist() expected_features = self.get_vars_names(category=category) @@ -299,7 +300,8 @@ def get_dataarray(self, category: str, split: str) -> DataArray: f"Expected features {expected_features}, got {actual_features}" ) - da = da.transpose(self.expected_dim_order(category=category)) + dim_order = self.expected_dim_order(category=category) + da = da.transpose(*dim_order) return da @@ -501,9 +503,6 @@ def _get_single_timeseries_dataarray( da = xr.DataArray(arr_all, dims=dims, coords=coords) - # stack the [x, y] dimensions into a `grid_index` dimension - da = self.stack_grid_coords(da) - return da def _get_analysis_times(self, split) -> List[np.datetime64]: diff --git a/tests/test_config.py b/tests/test_config.py index 4bb7c1c..1ff40bc 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -46,7 +46,7 @@ def test_config_serialization(state_weighting_config): training: state_feature_weighting: __config_class__: ManualStateFeatureWeighting - values: + weights: u100m: 1.0 v100m: 1.0 """ From 3df627f269f3c34b2958a56c549eb7b382f32183 Mon Sep 17 00:00:00 2001 
From: Leif Denby Date: Wed, 13 Nov 2024 16:09:04 +0100 Subject: [PATCH 4/6] update meps test to point to new dataset on aws --- tests/conftest.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3dfac91..658f4c0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,11 +23,8 @@ # Initializing variables for the s3 client S3_BUCKET_NAME = "mllam-testdata" -# S3_ENDPOINT_URL = "https://object-store.os-api.cci1.ecmwf.int" -S3_ENDPOINT_URL = "http://localhost:8000" -# S3_FILE_PATH = "neural-lam/npy/meps_example_reduced.v0.1.0.zip" -# TODO: I will upload this to AWS S3 once I have located the credentials... -S3_FILE_PATH = "meps_example_reduced.v0.2.0.zip" +S3_ENDPOINT_URL = "https://object-store.os-api.cci1.ecmwf.int" +S3_FILE_PATH = "neural-lam/npy/meps_example_reduced.v0.2.0.zip" S3_FULL_PATH = "/".join([S3_ENDPOINT_URL, S3_BUCKET_NAME, S3_FILE_PATH]) TEST_DATA_KNOWN_HASH = ( "7ff2e07e04cfcd77631115f800c9d49188bb2a7c2a2777da3cea219f926d0c86" From 89fac82420cbb894bbd00719b09055b1043c8b23 Mon Sep 17 00:00:00 2001 From: Leif Denby Date: Wed, 13 Nov 2024 16:09:31 +0100 Subject: [PATCH 5/6] remove unused print statement --- tests/test_datastores.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_datastores.py b/tests/test_datastores.py index 096efcb..4a4b110 100644 --- a/tests/test_datastores.py +++ b/tests/test_datastores.py @@ -339,7 +339,6 @@ def test_dataarray_shapes(datastore_name): unstacked_tensor = torch.tensor( datastore.unstack_grid_coords(static_da).to_numpy(), dtype=torch.float32 ).squeeze() - print(static_da) reshaped_tensor = ( torch.tensor(static_da.to_numpy(), dtype=torch.float32) From a95eb5a66f3d6fbf9afc992beea24fd8a307f1d3 Mon Sep 17 00:00:00 2001 From: Leif Denby Date: Wed, 13 Nov 2024 16:16:34 +0100 Subject: [PATCH 6/6] fix config-path arg bug in CLIs --- neural_lam/create_graph.py | 4 +++- neural_lam/train_model.py | 4 +++- 2 files changed, 6 insertions(+), 2 
deletions(-) diff --git a/neural_lam/create_graph.py b/neural_lam/create_graph.py index 854818b..ef979be 100644 --- a/neural_lam/create_graph.py +++ b/neural_lam/create_graph.py @@ -590,7 +590,9 @@ def cli(input_args=None): ) args = parser.parse_args(input_args) - assert args.config is not None, "Specify your config with --config_path" + assert ( + args.config_path is not None + ), "Specify your config with --config_path" # Load neural-lam configuration and datastore to use _, datastore = load_config_and_datastore(config_path=args.config_path) diff --git a/neural_lam/train_model.py b/neural_lam/train_model.py index 8f400b3..183c230 100644 --- a/neural_lam/train_model.py +++ b/neural_lam/train_model.py @@ -209,7 +209,9 @@ def main(input_args=None): } # Asserts for arguments - assert args.config is not None, "Specify your config with --config_path" + assert ( + args.config_path is not None + ), "Specify your config with --config_path" assert args.model in MODELS, f"Unknown model: {args.model}" assert args.eval in ( None,