
Separate interior state and boundary forcing to only predict state #84

Closed · wants to merge 92 commits. Changes shown below are from 1 commit.

Commits:
5df1bff
add datastore_boundary to neural_lam
sadamov Nov 18, 2024
46590ef
complete integration of boundary in weatherDataset
sadamov Nov 18, 2024
b990f49
Add test to check timestep length and spacing
sadamov Nov 18, 2024
3fd1d6b
setting default mdp boundary to 0 gridcells
sadamov Nov 18, 2024
1f2499c
implement time-based slicing
sadamov Nov 18, 2024
1af1481
remove all interior_mask and boundary_mask
sadamov Nov 19, 2024
d545cb7
added gcsfs dependency for era5 weatherbench download
sadamov Nov 19, 2024
5c1a7d7
added new era5 datastore config for boundary
sadamov Nov 19, 2024
30e4f05
removed left-over boundary-mask references
sadamov Nov 19, 2024
6a8c593
make check for existing category in datastore more flexible (for boun…
sadamov Nov 19, 2024
17c920d
implement xarray based (mostly) time slicing and windowing
sadamov Nov 20, 2024
7919995
cleanup analysis based time-slicing
sadamov Nov 21, 2024
9bafcee
implement datastore_boundary in existing tests
sadamov Nov 19, 2024
ce06bbc
allow for grid shape retrieval from forcing data
sadamov Nov 21, 2024
884b5c6
rearrange time slicing, boundary first
sadamov Nov 21, 2024
5904cbe
identified issue, cleanup next
leifdenby Nov 25, 2024
efe0302
use xarray plot only
leifdenby Nov 26, 2024
a489c2e
don't reraise
leifdenby Nov 26, 2024
242d08b
remove debug plot
leifdenby Nov 26, 2024
c1f706c
remove extent calc used in diagnosing issue
leifdenby Nov 26, 2024
cf8e3e4
add type annotation
leifdenby Nov 29, 2024
85160ce
ensure tensor copy to cpu mem before data-array creation
leifdenby Nov 29, 2024
52c4528
apply time-indexing to support ar_steps_val > 1
leifdenby Nov 29, 2024
b96d8eb
renaming test datastores
sadamov Nov 30, 2024
72da25f
adding num_past/future_boundary_step args
sadamov Nov 30, 2024
244f1cc
using combined config file
sadamov Nov 30, 2024
a9cc36e
proper handling of state/forcing/boundary in dataset
sadamov Nov 30, 2024
dcc0b46
datastore_boundars=None introduced
sadamov Nov 30, 2024
a3b3bde
bug fix for file retrieval per member
sadamov Nov 30, 2024
3ffc413
rename datastore for tests
sadamov Nov 30, 2024
85aad66
aligned time with danra for easier boundary testing
sadamov Nov 30, 2024
64f057f
Fixed test for temporal embedding
sadamov Nov 30, 2024
6205dbd
pin dataclass-wizard <0.31.0 to avoid bug in dataclass-wizard
leifdenby Dec 2, 2024
551cd26
allow boundary as input to ar_model.common_step
sadamov Dec 2, 2024
fc95350
linting
sadamov Dec 2, 2024
01fa807
improved docstrings and added some assertions
sadamov Dec 2, 2024
5a749f3
update mdp dependency
sadamov Dec 2, 2024
45ba607
remove boundary datastore from tests that don't need it
sadamov Dec 2, 2024
f36f360
fix scope of _get_slice_time
sadamov Dec 2, 2024
105108e
fix scope of _get_time_step
sadamov Dec 2, 2024
d760145
Merge branch 'feat/boundary_dataloader' of https://github.com/sadamov…
sadamov Dec 2, 2024
ae0cf76
added information about optional boundary datastore
sadamov Dec 2, 2024
9af27e0
add datastore_boundary to neural_lam
sadamov Nov 18, 2024
c25fb30
complete integration of boundary in weatherDataset
sadamov Nov 18, 2024
505ceeb
Add test to check timestep length and spacing
sadamov Nov 18, 2024
e733066
setting default mdp boundary to 0 gridcells
sadamov Nov 18, 2024
d8349a4
implement time-based slicing
sadamov Nov 18, 2024
fd791bf
remove all interior_mask and boundary_mask
sadamov Nov 19, 2024
ae82cdb
added gcsfs dependency for era5 weatherbench download
sadamov Nov 19, 2024
34a6cc7
added new era5 datastore config for boundary
sadamov Nov 19, 2024
2dc67a0
removed left-over boundary-mask references
sadamov Nov 19, 2024
9f8628e
make check for existing category in datastore more flexible (for boun…
sadamov Nov 19, 2024
388c79d
implement xarray based (mostly) time slicing and windowing
sadamov Nov 20, 2024
2529969
cleanup analysis based time-slicing
sadamov Nov 21, 2024
179a035
implement datastore_boundary in existing tests
sadamov Nov 19, 2024
2daeb16
allow for grid shape retrieval from forcing data
sadamov Nov 21, 2024
cbcdcae
rearrange time slicing, boundary first
sadamov Nov 21, 2024
e6ace27
renaming test datastores
sadamov Nov 30, 2024
42818f0
adding num_past/future_boundary_step args
sadamov Nov 30, 2024
0103b6e
using combined config file
sadamov Nov 30, 2024
0896344
proper handling of state/forcing/boundary in dataset
sadamov Nov 30, 2024
355423c
datastore_boundars=None introduced
sadamov Nov 30, 2024
121d460
bug fix for file retrieval per member
sadamov Nov 30, 2024
7e82eef
rename datastore for tests
sadamov Nov 30, 2024
320d7c4
aligned time with danra for easier boundary testing
sadamov Nov 30, 2024
f18dcc2
Fixed test for temporal embedding
sadamov Nov 30, 2024
e6327d8
allow boundary as input to ar_model.common_step
sadamov Dec 2, 2024
1374a19
linting
sadamov Dec 2, 2024
779f3e9
improved docstrings and added some assertions
sadamov Dec 2, 2024
f126ec2
remove boundary datastore from tests that don't need it
sadamov Dec 2, 2024
4b656da
fix scope of _get_time_step
sadamov Dec 2, 2024
75db4b8
added information about optional boundary datastore
sadamov Dec 2, 2024
58b4af6
Merge branch 'feat/boundary_dataloader' of https://github.com/sadamov…
sadamov Dec 2, 2024
4c17545
moved gcsfs to dev group
sadamov Dec 3, 2024
a700350
linting
sadamov Dec 3, 2024
315aa0f
Propagate separation of state and boundary change through training loop
joeloskarsson Oct 28, 2024
1967221
Start building graphs with wmg
joeloskarsson Nov 4, 2024
cb74e3f
Change forward pass to concat according to enforced node ordering
joeloskarsson Nov 11, 2024
9715ed8
wip to make tests pass
joeloskarsson Nov 11, 2024
336fba9
Fix edge index manipulation to make training work again
joeloskarsson Nov 12, 2024
ce3ea6d
Work on fixing plotting functionality
joeloskarsson Nov 12, 2024
a520505
Linting
joeloskarsson Nov 13, 2024
793e6c0
Add optional separate grid embedder for boundary
joeloskarsson Nov 13, 2024
3515460
Make new graph creation script main and only one
joeloskarsson Nov 13, 2024
05d91f1
Fix some typos and forgot code
joeloskarsson Nov 13, 2024
3eba43c
Correct handling of node indices for m2g when using decode_mask
joeloskarsson Nov 27, 2024
f1b7359
Linting and bugfixes
joeloskarsson Nov 28, 2024
fa6c9e3
Make graph creation and plotting work with datastores
joeloskarsson Dec 2, 2024
4d85384
Fix graph loading and boundary mask
joeloskarsson Dec 2, 2024
9edfec3
Fix boundary masking bug for static features
joeloskarsson Dec 2, 2024
6e1c53c
Add flag making boundary forcing optional in models
joeloskarsson Dec 3, 2024
4bcaa4b
Linting
joeloskarsson Dec 3, 2024
remove all interior_mask and boundary_mask
sadamov committed Nov 21, 2024
commit 1af1481e6884f89ccf39befa37e0d61ed16bbcc3
17 changes: 0 additions & 17 deletions neural_lam/datastore/base.py
Original file line number Diff line number Diff line change
@@ -228,23 +228,6 @@ def get_dataarray(
"""
pass

@cached_property
@abc.abstractmethod
def boundary_mask(self) -> xr.DataArray:
"""
Return the boundary mask for the dataset, with spatial dimensions
stacked. Where the value is 1, the grid point is a boundary point, and
where the value is 0, the grid point is not a boundary point.

Returns
-------
xr.DataArray
The boundary mask for the dataset, with dimensions
`('grid_index',)`.

"""
pass

@abc.abstractmethod
def get_xy(self, category: str) -> np.ndarray:
"""
34 changes: 0 additions & 34 deletions neural_lam/datastore/mdp.py
@@ -318,40 +318,6 @@ def get_standardization_dataarray(self, category: str) -> xr.Dataset:
ds_stats = self._ds[stats_variables.keys()].rename(stats_variables)
return ds_stats

@cached_property
def boundary_mask(self) -> xr.DataArray:
"""
Produce a 0/1 mask for the boundary points of the dataset, these will
sit at the edges of the domain (in x/y extent) and will be used to mask
out the boundary points from the loss function and to overwrite the
boundary points from the prediction. For now this is created when the
mask is requested, but in the future this could be saved to the zarr
file.

Returns
-------
xr.DataArray
A 0/1 mask for the boundary points of the dataset, where 1 is a
boundary point and 0 is not.

"""
if self._n_boundary_points > 0:
ds_unstacked = self.unstack_grid_coords(da_or_ds=self._ds)
da_state_variable = (
ds_unstacked["state"].isel(time=0).isel(state_feature=0)
)
da_domain_allzero = xr.zeros_like(da_state_variable)
ds_unstacked["boundary_mask"] = da_domain_allzero.isel(
x=slice(self._n_boundary_points, -self._n_boundary_points),
y=slice(self._n_boundary_points, -self._n_boundary_points),
)
ds_unstacked["boundary_mask"] = ds_unstacked.boundary_mask.fillna(
1
).astype(int)
return self.stack_grid_coords(da_or_ds=ds_unstacked.boundary_mask)
else:
return None

@property
def coords_projection(self) -> ccrs.Projection:
"""
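The removed MDP `boundary_mask` builds the mask by slicing away the interior of the domain. A minimal standalone NumPy sketch of that slicing logic (hypothetical, xarray-free; the original also returned `None` when `n_boundary` was 0):

```python
import numpy as np

def make_boundary_mask(ny: int, nx: int, n_boundary: int) -> np.ndarray:
    """Return a 0/1 mask that is 1 on the outer n_boundary rows/columns.

    Standalone sketch of the slicing pattern the removed MDP
    boundary_mask property implemented with xarray. Assumes n_boundary >= 1.
    """
    assert n_boundary >= 1
    mask = np.ones((ny, nx), dtype=int)
    # Zero out the interior, leaving a ring of width n_boundary set to 1
    mask[n_boundary:-n_boundary, n_boundary:-n_boundary] = 0
    return mask

# 5x5 grid with a 1-cell ring: 16 boundary cells, 9 interior cells
mask = make_boundary_mask(5, 5, 1)
```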
28 changes: 0 additions & 28 deletions neural_lam/datastore/npyfilesmeps/store.py
@@ -668,34 +668,6 @@ def grid_shape_state(self) -> CartesianGridShape:
ny, nx = self.config.grid_shape_state
return CartesianGridShape(x=nx, y=ny)

@cached_property
def boundary_mask(self) -> xr.DataArray:
"""The boundary mask for the dataset. This is a binary mask that is 1
where the grid cell is on the boundary of the domain, and 0 otherwise.

Returns
-------
xr.DataArray
The boundary mask for the dataset, with dimensions `[grid_index]`.

"""
xy = self.get_xy(category="state", stacked=False)
xs = xy[:, :, 0]
ys = xy[:, :, 1]
# Check if x-coordinates are constant along columns
assert np.allclose(xs, xs[:, [0]]), "x-coordinates are not constant"
# Check if y-coordinates are constant along rows
assert np.allclose(ys, ys[[0], :]), "y-coordinates are not constant"
# Extract unique x and y coordinates
x = xs[:, 0] # Unique x-coordinates (changes along the first axis)
y = ys[0, :] # Unique y-coordinates (changes along the second axis)
values = np.load(self.root_path / "static" / "border_mask.npy")
da_mask = xr.DataArray(
values, dims=["y", "x"], coords=dict(x=x, y=y), name="boundary_mask"
)
da_mask_stacked_xy = self.stack_grid_coords(da_mask).astype(int)
return da_mask_stacked_xy

def get_standardization_dataarray(self, category: str) -> xr.Dataset:
"""Return the standardization dataarray for the given category. This
should contain a `{category}_mean` and `{category}_std` variable for
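Before wrapping `border_mask.npy` in a DataArray, the removed npyfilesmeps property verified the grid is regular and recovered 1-D axes from the 2-D coordinate arrays. A standalone sketch of those checks (hypothetical helper, not the datastore API):

```python
import numpy as np

def split_regular_grid_axes(xy):
    """Recover 1-D x/y axes from an (Nx, Ny, 2) coordinate array.

    Mirrors the regularity assertions the removed boundary_mask ran:
    x must be constant along columns, y constant along rows.
    """
    xs, ys = xy[:, :, 0], xy[:, :, 1]
    assert np.allclose(xs, xs[:, [0]]), "x-coordinates are not constant"
    assert np.allclose(ys, ys[[0], :]), "y-coordinates are not constant"
    return xs[:, 0], ys[0, :]

x_axis = np.array([0.0, 1.0, 2.0])
y_axis = np.array([10.0, 20.0])
# meshgrid with indexing="ij" matches the layout checked above
xy = np.stack(np.meshgrid(x_axis, y_axis, indexing="ij"), axis=-1)
x_rec, y_rec = split_regular_grid_axes(xy)
```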
53 changes: 10 additions & 43 deletions neural_lam/models/ar_model.py
@@ -42,7 +42,6 @@ def __init__(
da_state_stats = datastore.get_standardization_dataarray(
category="state"
)
da_boundary_mask = datastore.boundary_mask
num_past_forcing_steps = args.num_past_forcing_steps
num_future_forcing_steps = args.num_future_forcing_steps

@@ -115,18 +114,6 @@ def __init__(
# Instantiate loss function
self.loss = metrics.get_metric(args.loss)

boundary_mask = torch.tensor(
da_boundary_mask.values, dtype=torch.float32
).unsqueeze(
1
) # add feature dim

self.register_buffer("boundary_mask", boundary_mask, persistent=False)
# Pre-compute interior mask for use in loss function
self.register_buffer(
"interior_mask", 1.0 - self.boundary_mask, persistent=False
) # (num_grid_nodes, 1), 1 for non-border

self.val_metrics = {
"mse": [],
}
@@ -153,13 +140,6 @@ def configure_optimizers(self):
)
return opt

@property
def interior_mask_bool(self):
"""
Get the interior mask as a boolean (N,) mask.
"""
return self.interior_mask[:, 0].to(torch.bool)

@staticmethod
def expand_to_batch(x, batch_size):
"""
@@ -191,27 +171,20 @@ def unroll_prediction(self, init_states, forcing_features, true_states):

for i in range(pred_steps):
forcing = forcing_features[:, i]
border_state = true_states[:, i]

pred_state, pred_std = self.predict_step(
prev_state, prev_prev_state, forcing
)
# state: (B, num_grid_nodes, d_f) pred_std: (B, num_grid_nodes,
# d_f) or None

# Overwrite border with true state
new_state = (
self.boundary_mask * border_state
+ self.interior_mask * pred_state
)

prediction_list.append(new_state)
prediction_list.append(pred_state)
if self.output_std:
pred_std_list.append(pred_std)

# Update conditioning states
prev_prev_state = prev_state
prev_state = new_state
prev_state = pred_state

prediction = torch.stack(
prediction_list, dim=1
@@ -249,12 +222,14 @@ def training_step(self, batch):
"""
prediction, target, pred_std, _ = self.common_step(batch)

# Compute loss
# Compute loss - mean over unrolled times and batch
batch_loss = torch.mean(
self.loss(
prediction, target, pred_std, mask=self.interior_mask_bool
prediction,
target,
pred_std,
)
) # mean over unrolled times and batch
)

log_dict = {"train_loss": batch_loss}
self.log_dict(
@@ -287,9 +262,7 @@ def validation_step(self, batch, batch_idx):
prediction, target, pred_std, _ = self.common_step(batch)

time_step_loss = torch.mean(
self.loss(
prediction, target, pred_std, mask=self.interior_mask_bool
),
self.loss(prediction, target, pred_std),
dim=0,
) # (time_steps-1)
mean_loss = torch.mean(time_step_loss)
@@ -314,7 +287,6 @@ def validation_step(self, batch, batch_idx):
prediction,
target,
pred_std,
mask=self.interior_mask_bool,
sum_vars=False,
) # (B, pred_steps, d_f)
self.val_metrics["mse"].append(entry_mses)
@@ -341,9 +313,7 @@ def test_step(self, batch, batch_idx):
# pred_steps, num_grid_nodes, d_f) or (d_f,)

time_step_loss = torch.mean(
self.loss(
prediction, target, pred_std, mask=self.interior_mask_bool
),
self.loss(prediction, target, pred_std),
dim=0,
) # (time_steps-1,)
mean_loss = torch.mean(time_step_loss)
@@ -372,16 +342,13 @@ def test_step(self, batch, batch_idx):
prediction,
target,
pred_std,
mask=self.interior_mask_bool,
sum_vars=False,
) # (B, pred_steps, d_f)
self.test_metrics[metric_name].append(batch_metric_vals)

if self.output_std:
# Store output std. per variable, spatially averaged
mean_pred_std = torch.mean(
pred_std[..., self.interior_mask_bool, :], dim=-2
) # (B, pred_steps, d_f)
mean_pred_std = torch.mean(pred_std, dim=-2) # (B, pred_steps, d_f)
self.test_metrics["output_std"].append(mean_pred_std)

# Save per-sample spatial loss for specific times
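The core change in `unroll_prediction`: each rollout step used to blend the prediction with the true state on boundary nodes, whereas after this PR the raw prediction is carried forward. A minimal NumPy sketch of the removed blend (shapes are illustrative, not the model's actual tensors):

```python
import numpy as np

def blend_with_boundary(pred_state, border_state, boundary_mask):
    """Old unroll behaviour: overwrite boundary nodes with the true state.

    pred_state/border_state: (num_grid_nodes, d_f); boundary_mask:
    (num_grid_nodes, 1) with 1 on boundary nodes. The PR removes this
    blend so the model only predicts (and conditions on) interior state.
    """
    return boundary_mask * border_state + (1.0 - boundary_mask) * pred_state

pred = np.ones((4, 1))                         # model prediction
true = np.zeros((4, 1))                        # true state, boundary source
mask = np.array([[1.0], [0.0], [0.0], [1.0]])  # nodes 0 and 3 are boundary
new_state = blend_with_boundary(pred, true, mask)
# boundary nodes take the true value (0), interior keeps the prediction (1)
```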
16 changes: 0 additions & 16 deletions neural_lam/vis.py
@@ -86,13 +86,6 @@ def plot_prediction(

extent = datastore.get_xy_extent("state")

# Set up masking of border region
da_mask = datastore.unstack_grid_coords(datastore.boundary_mask)
mask_reshaped = da_mask.values
pixel_alpha = (
mask_reshaped.clamp(0.7, 1).cpu().numpy()
) # Faded border region

fig, axes = plt.subplots(
1,
2,
@@ -112,7 +105,6 @@ def plot_prediction(
data_grid,
origin="lower",
extent=extent,
alpha=pixel_alpha,
vmin=vmin,
vmax=vmax,
cmap="plasma",
@@ -147,13 +139,6 @@ def plot_spatial_error(

extent = datastore.get_xy_extent("state")

# Set up masking of border region
da_mask = datastore.unstack_grid_coords(datastore.boundary_mask)
mask_reshaped = da_mask.values
pixel_alpha = (
mask_reshaped.clamp(0.7, 1).cpu().numpy()
) # Faded border region

fig, ax = plt.subplots(
figsize=(5, 4.8),
subplot_kw={"projection": datastore.coords_projection},
@@ -170,7 +155,6 @@ def plot_spatial_error(
error_grid,
origin="lower",
extent=extent,
alpha=pixel_alpha,
vmin=vmin,
vmax=vmax,
cmap="OrRd",
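The deleted plotting code derived a per-pixel alpha from the boundary mask to fade one region of the field. Note it called the torch-style `.clamp(0.7, 1).cpu()` on `da_mask.values`, which is a NumPy array; a NumPy sketch of the intended operation uses `np.clip` (hypothetical standalone helper):

```python
import numpy as np

def border_alpha(mask_2d, min_alpha=0.7):
    """Per-pixel alpha in [min_alpha, 1] derived from a 0/1 mask.

    Sketch of the removed pixel_alpha computation: clipping maps
    mask==0 pixels to min_alpha (faded) and leaves mask==1 pixels
    fully opaque.
    """
    return np.clip(mask_2d.astype(float), min_alpha, 1.0)

mask = np.array([[1, 0], [0, 1]])
alpha = border_alpha(mask)
# mask==1 pixels stay at 1.0; mask==0 pixels fade to 0.7
```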
22 changes: 0 additions & 22 deletions tests/dummy_datastore.py
@@ -148,12 +148,6 @@ def __init__(
times = [self.T0 + dt * i for i in range(n_timesteps)]
self.ds.coords["time"] = times

# Add boundary mask
self.ds["boundary_mask"] = xr.DataArray(
np.random.choice([0, 1], size=(n_points_1d, n_points_1d)),
dims=["x", "y"],
)

# Stack the spatial dimensions into grid_index
self.ds = self.ds.stack(grid_index=self.CARTESIAN_COORDS)

@@ -342,22 +336,6 @@ def get_dataarray(
dim_order = self.expected_dim_order(category=category)
return self.ds[category].transpose(*dim_order)

@cached_property
def boundary_mask(self) -> xr.DataArray:
"""
Return the boundary mask for the dataset, with spatial dimensions
stacked. Where the value is 1, the grid point is a boundary point, and
where the value is 0, the grid point is not a boundary point.

Returns
-------
xr.DataArray
The boundary mask for the dataset, with dimensions
`('grid_index',)`.

"""
return self.ds["boundary_mask"]

def get_xy(self, category: str, stacked: bool) -> ndarray:
"""Return the x, y coordinates of the dataset.

21 changes: 0 additions & 21 deletions tests/test_datastores.py
@@ -18,8 +18,6 @@
dataarray for the given category.
- `get_dataarray` (method): Return the processed data (as a single
`xr.DataArray`) for the given category and test/train/val-split.
- `boundary_mask` (property): Return the boundary mask for the dataset,
with spatial dimensions stacked.
- `config` (property): Return the configuration of the datastore.

In addition BaseRegularGridDatastore must have the following methods and
@@ -213,25 +211,6 @@ def test_get_dataarray(datastore_name):
assert n_features["train"] == n_features["val"] == n_features["test"]


@pytest.mark.parametrize("datastore_name", DATASTORES.keys())
def test_boundary_mask(datastore_name):
"""Check that the `datastore.boundary_mask` property is implemented and
that the returned object is an xarray DataArray with the correct shape."""
datastore = init_datastore_example(datastore_name)
da_mask = datastore.boundary_mask

assert isinstance(da_mask, xr.DataArray)
assert set(da_mask.dims) == {"grid_index"}
assert da_mask.dtype == "int"
assert set(da_mask.values) == {0, 1}
assert da_mask.sum() > 0
assert da_mask.sum() < da_mask.size

if isinstance(datastore, BaseRegularGridDatastore):
grid_shape = datastore.grid_shape_state
assert datastore.boundary_mask.size == grid_shape.x * grid_shape.y


@pytest.mark.parametrize("datastore_name", DATASTORES.keys())
def test_get_xy_extent(datastore_name):
"""Check that the `datastore.get_xy_extent` method is implemented and that