From 569b87f7ac8955d08fa4f67ca5c59dc35909550f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Thu, 29 Aug 2024 09:41:54 -0400 Subject: [PATCH 1/3] fix standardize method and 'transform' -> 'normalization' --- xclim/sdba/_adjustment.py | 40 +++++++++++++++++++++------------------ xclim/sdba/adjustment.py | 28 +++++++++++++-------------- xclim/sdba/utils.py | 14 +++++++------- 3 files changed, 43 insertions(+), 39 deletions(-) diff --git a/xclim/sdba/_adjustment.py b/xclim/sdba/_adjustment.py index 8bc9dcd62..ee15e22b2 100644 --- a/xclim/sdba/_adjustment.py +++ b/xclim/sdba/_adjustment.py @@ -953,7 +953,7 @@ def _otc_adjust( bin_origin: dict | float | np.ndarray | None = None, num_iter_max: int | None = 100_000_000, jitter_inside_bins: bool = True, - transform: str | None = "max_distance", + normalization: str | None = "max_distance", ): """Optimal Transport Correction of the bias of X with respect to Y. @@ -972,8 +972,9 @@ def _otc_adjust( jitter_inside_bins : bool If `False`, output points are located at the center of their bin. If `True`, a random location is picked uniformly inside their bin. Default is `True`. - transform : {None, 'standardize', 'max_distance', 'max_value'} - Per-variable transformation applied before the distances are calculated. + normalization : {None, 'standardize', 'max_distance', 'max_value'} + Per-variable transformation applied before the distances are calculated + in the optimal transport. Returns ------- @@ -1013,7 +1014,7 @@ def _otc_adjust( gridY, muY, _ = u.histogram(Y, bin_width, bin_origin) # Compute the optimal transportation plan - plan = u.optimal_transport(gridX, gridY, muX, muY, num_iter_max, transform) + plan = u.optimal_transport(gridX, gridY, muX, muY, num_iter_max, normalization) gridX = np.floor((gridX - bin_origin) / bin_width) gridY = np.floor((gridY - bin_origin) / bin_width) @@ -1051,7 +1052,7 @@ def otc_adjust( num_iter_max: int | None = 100_000_000, jitter_inside_bins: bool = True, adapt_freq_thresh: dict | None = None, - transform: str | None = "max_distance", + normalization: str | None = "max_distance", ): """Optimal Transport Correction of the bias of `hist` with respect to `ref`. @@ -1076,8 +1077,9 @@ def otc_adjust( If `True`, a random location is picked uniformly inside their bin. Default is `True`. adapt_freq_thresh : dict, optional Threshold for frequency adaptation per variable. - transform : {None, 'standardize', 'max_distance', 'max_value'} - Per-variable transformation applied before the distances are calculated. + normalization : {None, 'standardize', 'max_distance', 'max_value'} + Per-variable transformation applied before the distances are calculated + in the optimal transport. Returns ------- @@ -1121,7 +1123,7 @@ def otc_adjust( bin_origin=bin_origin, num_iter_max=num_iter_max, jitter_inside_bins=jitter_inside_bins, - transform=transform, + normalization=normalization, ), input_core_dims=[["dim_hist", pts_dim], ["dim_ref", pts_dim]], output_core_dims=[["dim_hist", pts_dim]], @@ -1149,7 +1151,7 @@ def _dotc_adjust( cov_factor: str | None = "std", jitter_inside_bins: bool = True, kind: dict | None = None, - transform: str | None = "max_distance", + normalization: str | None = "max_distance", ): """Dynamical Optimal Transport Correction of the bias of X with respect to Y. @@ -1175,8 +1177,9 @@ def _dotc_adjust( kind : dict, optional Keys are variable names and values are adjustment kinds, either additive or multiplicative. Unspecified dimensions are treated as "+". - transform : {None, 'standardize', 'max_distance', 'max_value'} - Per-variable transformation applied before the distances are calculated. + normalization : {None, 'standardize', 'max_distance', 'max_value'} + Per-variable transformation applied before the distances are calculated + in the optimal transport. Returns ------- @@ -1212,7 +1215,7 @@ def _dotc_adjust( bin_origin=bin_origin, num_iter_max=num_iter_max, jitter_inside_bins=False, - transform=transform, + normalization=normalization, ) # Map hist to sim @@ -1223,7 +1226,7 @@ def _dotc_adjust( bin_origin=bin_origin, num_iter_max=num_iter_max, jitter_inside_bins=False, - transform=transform, + normalization=normalization, ) # Temporal evolution @@ -1260,7 +1263,7 @@ def _dotc_adjust( bin_origin=bin_origin, num_iter_max=num_iter_max, jitter_inside_bins=jitter_inside_bins, - transform=transform, + normalization=normalization, ) return Z1 @@ -1278,7 +1281,7 @@ def dotc_adjust( jitter_inside_bins: bool = True, kind: dict | None = None, adapt_freq_thresh: dict | None = None, - transform: str | None = "max_distance", + normalization: str | None = "max_distance", ): """Dynamical Optimal Transport Correction of the bias of X with respect to Y. @@ -1309,8 +1312,9 @@ def dotc_adjust( Unspecified dimensions are treated as "+". adapt_freq_thresh : dict, optional Threshold for frequency adaptation per variable. - transform : {None, 'standardize', 'max_distance', 'max_value'} - Per-variable transformation applied before the distances are calculated. + normalization : {None, 'standardize', 'max_distance', 'max_value'} + Per-variable transformation applied before the distances are calculated + in the optimal transport. Returns ------- @@ -1368,7 +1372,7 @@ def dotc_adjust( cov_factor=cov_factor, jitter_inside_bins=jitter_inside_bins, kind=kind, - transform=transform, + normalization=normalization, ), input_core_dims=[ ["dim_sim", pts_dim], diff --git a/xclim/sdba/adjustment.py b/xclim/sdba/adjustment.py index e8c4190b9..d70bd56e3 100644 --- a/xclim/sdba/adjustment.py +++ b/xclim/sdba/adjustment.py @@ -1306,7 +1306,7 @@ class OTC(Adjust): See :py:class:`xclim.sdba.processing.adapt_freq` for details. Frequency adaptation is not applied to missing variables if is dict. Applied to all variables if is string. - transform : {None, 'standardize', 'max_distance', 'max_value'} + normalization : {None, 'standardize', 'max_distance', 'max_value'} Per-variable transformation applied before the distances are calculated. Default is "max_distance". See notes for details. @@ -1337,12 +1337,12 @@ class OTC(Adjust): with probability :math:`P_{ij}`. A transformation of bin positions can be applied before computing the distances :math:`C_{ij}` to make variables on different scales more evenly taken into consideration by the optimization step. Available transformations are - - `transform = 'standardize'` : + - `normalization = 'standardize'` : .. math:: i_v' = \frac{i_v - mean(i_v)}{std(i_v)} \quad\quad\quad j_v' = \frac{j_v - mean(j_v)}{std(j_v)} - - `transform = 'max_distance'` : + - `normalization = 'max_distance'` : .. math:: i_v' = \frac{i_v}{max \{|i_v - j_v|\}} \quad\quad\quad j_v' = \frac{j_v}{max \{|i_v - j_v|\}} @@ -1352,7 +1352,7 @@ class OTC(Adjust): max \{|i_v' - j_v'|\} = max \{|i_w' - j_w'|\} = 1 - - `transform = 'max_value'` : + - `normalization = 'max_value'` : .. math:: i_v' = \frac{i_v}{max\{i_v\}} \quad\quad\quad j_v' = \frac{j_v}{max\{j_v\}} @@ -1388,7 +1388,7 @@ def _adjust( num_iter_max: int | None = 100_000_000, jitter_inside_bins: bool = True, adapt_freq_thresh: dict | str | None = None, - transform: str | None = "max_distance", + normalization: str | None = "max_distance", group: str | Grouper = "time", pts_dim: str = "multivar", **kwargs, @@ -1398,7 +1398,7 @@ def _adjust( "POT is required for OTC and dOTC. Please install with `pip install POT`." ) - if transform not in [None, "standardize", "max_distance", "max_value"]: + if normalization not in [None, "standardize", "max_distance", "max_value"]: raise ValueError( "`transform` should be in [None, 'standardize', 'max_distance', 'max_value']." ) @@ -1423,7 +1423,7 @@ def _adjust( num_iter_max=num_iter_max, jitter_inside_bins=jitter_inside_bins, adapt_freq_thresh=adapt_freq_thresh, - transform=transform, + normalization=normalization, group=group, pts_dim=pts_dim, ).scen @@ -1479,9 +1479,9 @@ class dOTC(Adjust): See :py:class:`xclim.sdba.processing.adapt_freq` for details. Frequency adaptation is not applied to missing variables if is dict. Applied to all variables if is string. - transform : {None, 'standardize', 'max_distance', 'max_value'} - Per-variable transformation applied before the distances are calculated. - Default is "max_distance". + normalization : {None, 'standardize', 'max_distance', 'max_value'} + Per-variable transformation applied before the distances are calculated + in the optimal transport. Default is "max_distance". See :py:class:`~xclim.sdba.adjustment.OTC` for details. group : Union[str, Grouper] The grouping information. See :py:class:`xclim.sdba.base.Grouper` for details. @@ -1546,7 +1546,7 @@ def _adjust( jitter_inside_bins: bool = True, kind: dict | str | None = None, adapt_freq_thresh: dict | str | None = None, - transform: str | None = "max_distance", + normalization: str | None = "max_distance", group: str | Grouper = "time", pts_dim: str = "multivar", ) -> xr.DataArray: @@ -1565,9 +1565,9 @@ def _adjust( if cov_factor not in [None, "std", "cholesky"]: raise ValueError("`cov_factor` should be in [None, 'std', 'cholesky'].") - if transform not in [None, "standardize", "max_distance", "max_value"]: + if normalization not in [None, "standardize", "max_distance", "max_value"]: raise ValueError( - "`transform` should be in [None, 'standardize', 'max_distance', 'max_value']." + "`normalization` should be in [None, 'standardize', 'max_distance', 'max_value']." ) if isinstance(adapt_freq_thresh, str): @@ -1588,7 +1588,7 @@ def _adjust( jitter_inside_bins=jitter_inside_bins, kind=kind, adapt_freq_thresh=adapt_freq_thresh, - transform=transform, + normalization=normalization, group=group, pts_dim=pts_dim, ).scen diff --git a/xclim/sdba/utils.py b/xclim/sdba/utils.py index 1e805a6cb..fd23a1f6e 100644 --- a/xclim/sdba/utils.py +++ b/xclim/sdba/utils.py @@ -1018,7 +1018,7 @@ def histogram(data, bin_width, bin_origin): return grid, mu, idx_bin -def optimal_transport(gridX, gridY, muX, muY, numItermax, transform): +def optimal_transport(gridX, gridY, muX, muY, num_iter_max, normalization): """Computes the optimal transportation plan on (transformations of) X and Y. References @@ -1027,18 +1027,18 @@ def optimal_transport(gridX, gridY, muX, muY, numItermax, transform): """ from ot import emd - if transform == "standardize": - gridX = (gridX - gridX.mean()) / gridX.std() - gridY = (gridY - gridY.mean()) / gridY.std() + if normalization == "standardize": + gridX = (gridX - gridX.mean(axis=0)) / gridX.std(axis=0) + gridY = (gridY - gridY.mean(axis=0)) / gridY.std(axis=0) - elif transform == "max_distance": + elif normalization == "max_distance": max1 = np.abs(gridX.max(axis=0) - gridY.min(axis=0)) max2 = np.abs(gridY.max(axis=0) - gridX.min(axis=0)) max_dist = np.maximum(max1, max2) gridX = gridX / max_dist gridY = gridY / max_dist - elif transform == "max_value": + elif normalization == "max_value": max_value = np.maximum(gridX.max(axis=0), gridY.max(axis=0)) gridX = gridX / max_value gridY = gridY / max_value @@ -1047,7 +1047,7 @@ def optimal_transport(gridX, gridY, muX, muY, numItermax, transform): C = distance.cdist(gridX, gridY, "sqeuclidean") # Compute the optimal transportation plan - gamma = emd(muX, muY, C, numItermax=numItermax) + gamma = emd(muX, muY, C, numItermax=num_iter_max) plan = (gamma.T / gamma.sum(axis=1)).T return plan From 6b3c4c0b3a66e543fc305e039ceed4e86af3a8b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Thu, 29 Aug 2024 10:01:00 -0400 Subject: [PATCH 2/3] update CHANGELOG --- CHANGELOG.rst | 8 ++++++-- tests/test_sdba/test_adjustment.py | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b5930a36f..71969a524 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,11 +4,15 @@ Changelog v0.53.0 -------------------- -Contributors to this version: Adrien Lamarche (:user:`LamAdr`). +Contributors to this version: Adrien Lamarche (:user:`LamAdr`), Éric Dupuis (:user:`coxipi`). Bug fixes ^^^^^^^^^ -* Fixed a small inefficiency in ``_otc_adjust`` (:pull:`1890`). +* Fixed a small inefficiency in ``_otc_adjust``, and the `standardize` method of `OTC/dOTC` is now applied on individual variable (:pull:`1890`, :pull:`1896`). + +Breaking changes +^^^^^^^^^^^^^^^^ +* `transform` argument of `OTC/dOTC` classes (and child functions) is changed to `normalization`, and `numIterMax` is changed to `num_iter_max` in `utils.optimal_transport` (:pull:`1896`). v0.52.0 (2024-08-08) -------------------- diff --git a/tests/test_sdba/test_adjustment.py b/tests/test_sdba/test_adjustment.py index 6bdf292da..23289103f 100644 --- a/tests/test_sdba/test_adjustment.py +++ b/tests/test_sdba/test_adjustment.py @@ -898,9 +898,11 @@ def test_shape(self, random, series): assert not np.isin(hist.x[hist.x.isnull()].time.values, scen.time.values).any() +# TODO: Add tests for normalization methods class TestdOTC: @pytest.mark.parametrize("use_dask", [True, False]) @pytest.mark.parametrize("cov_factor", ["std", "cholesky"]) + # FIXME: Should this comparison not fail if `standardization` != `None`? def test_compare_sbck(self, random, series, use_dask, cov_factor): pytest.importorskip("ot") pytest.importorskip("SBCK", minversion="0.4.0") From 86be79a90f5f579c00098ba288b02c913c719fd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric=20Dupuis?= Date: Wed, 4 Sep 2024 13:41:39 -0400 Subject: [PATCH 3/3] notations --- docs/notebooks/sdba.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/notebooks/sdba.ipynb b/docs/notebooks/sdba.ipynb index 72c7df816..037c58247 100644 --- a/docs/notebooks/sdba.ipynb +++ b/docs/notebooks/sdba.ipynb @@ -619,7 +619,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This last plot shows the correlation between input and output per variable. Here we see a relatively strong correlation for all variables, meaning they are all taken into account when finding the optimal transport mappings. This is because we're using the (by default) `transform = 'max_distance'` argument. Were the data not transformed, the distances along the precipitation dimension would be very small relative to the temperature distances. Precipitation values would then be spread around at very low cost and have virtually no effect on the result. See this in action with `transform = None`.\n", + "This last plot shows the correlation between input and output per variable. Here we see a relatively strong correlation for all variables, meaning they are all taken into account when finding the optimal transport mappings. This is because we're using the (by default) `normalization = 'max_distance'` argument. Were the data not normalized, the distances along the precipitation dimension would be very small relative to the temperature distances. Precipitation values would then be spread around at very low cost and have virtually no effect on the result. See this in action with `normalization = None`.\n", "\n", "The chunks we see in the tasmax data are artefacts of the `bin_width`." ] @@ -808,7 +808,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.5" + "version": "3.12.4" }, "toc": { "base_numbering": 1,