updates to ecahm2020 notebook

pangeo-data · May 14, 2020 · e99484f · e99484f
1 parent 331658b
commit e99484f
Show file tree

Hide file tree

Showing 10 changed files with 482 additions and 695 deletions.
diff --git a/.binder/environment.yml b/.binder/environment.yml
@@ -0,0 +1,14 @@
+name: skdownscale
+channels:
+  - conda-forge
+dependencies:
+  - pandas
+  - xarray
+  - seaborn
+  - probscale
+  - matplotlib
+  - scipy
+  - numpy
+  - sklearn
+  - dask
+  - distributed
diff --git a/.isort.cfg b/.isort.cfg
@@ -1,2 +1,2 @@
 [settings]
-known_third_party = click,numpy,pandas,pkg_resources,pytest,scipy,setuptools,sklearn,xarray,xsd
+known_third_party = click,matplotlib,numpy,pandas,pkg_resources,probscale,pytest,scipy,seaborn,setuptools,sklearn,xarray,xsd
diff --git a/data/downscale_test_data.zarr.zip b/data/downscale_test_data.zarr.zip
diff --git a/examples/2020ECAHM-scikit-downscale.ipynb b/examples/2020ECAHM-scikit-downscale.ipynb
diff --git a/examples/utils.py b/examples/utils.py
@@ -0,0 +1,85 @@
+import matplotlib.pyplot as plt
+import pandas as pd
+import probscale
+import seaborn as sns
+import xarray as xr
+
+
+def get_sample_data(kind):
+
+    if kind == 'training':
+        data = xr.open_zarr('../data/downscale_test_data.zarr.zip', group=kind)
+        # extract 1 point of training data for precipitation and temperature
+        df = (
+            data.isel(point=0)
+            .to_dataframe()[['T2max', 'PREC_TOT']]
+            .rename(columns={'T2max': 'tmax', 'PREC_TOT': 'pcp'})
+        )
+        df['tmax'] -= 273.13
+        df['pcp'] *= 24
+        return df.resample('1d').first()
+    elif kind == 'targets':
+        data = xr.open_zarr('../data/downscale_test_data.zarr.zip', group=kind)
+        # extract 1 point of training data for precipitation and temperature
+        return (
+            data.isel(point=0)
+            .to_dataframe()[['Tmax', 'Prec']]
+            .rename(columns={'Tmax': 'tmax', 'Prec': 'pcp'})
+        )
+    elif kind == 'wind-hist':
+        return (
+            xr.open_dataset(
+                '../data/uas/uas.hist.CanESM2.CRCM5-UQAM.day.NAM-44i.raw.Colorado.19801990.nc'
+            )['uas']
+            .sel(lat=40.25, lon=-109.2, method='nearest')
+            .squeeze()
+            .to_dataframe()[['uas']]
+        )
+    elif kind == 'wind-obs':
+        return (
+            xr.open_dataset('../data/uas/uas.gridMET.NAM-44i.Colorado.19801990.nc')['uas']
+            .sel(lat=40.25, lon=-109.2, method='nearest')
+            .squeeze()
+            .to_dataframe()[['uas']]
+        )
+    elif kind == 'wind-rcp':
+        return (
+            xr.open_dataset(
+                '../data/uas/uas.rcp85.CanESM2.CRCM5-UQAM.day.NAM-44i.raw.Colorado.19902000.nc'
+            )['uas']
+            .sel(lat=40.25, lon=-109.2, method='nearest')
+            .squeeze()
+            .to_dataframe()[['uas']]
+        )
+    else:
+        raise ValueError(kind)
+
+    return df
+
+
+def prob_plots(x, y, y_hat, shape=(2, 2), figsize=(8, 8)):
+
+    fig, axes = plt.subplots(*shape, sharex=True, sharey=True, figsize=figsize)
+
+    scatter_kws = dict(label='', marker=None, linestyle='-')
+    common_opts = dict(plottype='qq', problabel='', datalabel='')
+
+    for ax, (label, series) in zip(axes.flat, y_hat.items()):
+
+        scatter_kws['label'] = 'original'
+        fig = probscale.probplot(x, ax=ax, scatter_kws=scatter_kws, **common_opts)
+
+        scatter_kws['label'] = 'target'
+        fig = probscale.probplot(y, ax=ax, scatter_kws=scatter_kws, **common_opts)
+
+        scatter_kws['label'] = 'corrected'
+        fig = probscale.probplot(series, ax=ax, scatter_kws=scatter_kws, **common_opts)
+        ax.set_title(label)
+        ax.legend()
+
+    [ax.set_xlabel('Standard Normal Quantiles') for ax in axes[-1]]
+    [ax.set_ylabel('Temperature [C]') for ax in axes[:, 0]]
+    [fig.delaxes(ax) for ax in axes.flat[len(y_hat.keys()) :]]
+    fig.tight_layout()
+
+    return fig
diff --git a/skdownscale/pointwise_models/base.py b/skdownscale/pointwise_models/base.py
@@ -0,0 +1,6 @@
+from sklearn.base import RegressorMixin
+from sklearn.linear_model.base import LinearModel
+
+
+class AbstractDownscaler(LinearModel, RegressorMixin):
+    pass
diff --git a/skdownscale/pointwise_models/bcsd.py b/skdownscale/pointwise_models/bcsd.py
@@ -6,25 +6,27 @@
 from sklearn.linear_model.base import LinearModel
 from sklearn.utils.validation import check_is_fitted
 
+from .base import AbstractDownscaler
 from .utils import QuantileMapper, ensure_samples_features
 
 
 def MONTH_GROUPER(x):
     return x.month
 
 
-class BcsdBase(LinearModel, RegressorMixin):
+class BcsdBase(AbstractDownscaler):
     """ Base class for BCSD model.
     """
 
-    _fit_attributes = ["y_climo_", "quantile_mappers_"]
+    _fit_attributes = ['y_climo_', 'quantile_mappers_']
 
-    def __init__(self, time_grouper=MONTH_GROUPER, **qm_kwargs):
+    def __init__(self, time_grouper=MONTH_GROUPER, return_anoms=True, **qm_kwargs):
         if isinstance(time_grouper, str):
             self.time_grouper = pd.Grouper(freq=time_grouper)
         else:
             self.time_grouper = time_grouper
 
+        self.return_anoms = return_anoms
         self.qm_kwargs = qm_kwargs
 
     def _qm_fit_by_group(self, groups):
@@ -88,7 +90,7 @@ def fit(self, X, y):
         # calculate the climatologies
         self.y_climo_ = y_groups.mean()
         if self.y_climo_.values.min() <= 0:
-            raise ValueError("Invalid value in target climatology")
+            raise ValueError('Invalid value in target climatology')
 
         # fit the quantile mappers
         self._qm_fit_by_group(y_groups)
@@ -116,7 +118,10 @@ def predict(self, X):
         Xqm = self._qm_transform_by_group(X.groupby(self.time_grouper))
 
         # calculate the anomalies as a ratio of the training data
-        return self._calc_ratio_anoms(Xqm, self.y_climo_)
+        if self.return_anoms:
+            return self._calc_ratio_anoms(Xqm, self.y_climo_)
+        else:
+            return Xqm
 
     def _calc_ratio_anoms(self, obj, climatology):
         dfs = []
@@ -190,7 +195,10 @@ def rolling_func(x):
         # restore the shift
         X_qm_with_shift = X_shift + Xqm
         # calculate the anomalies
-        return self._remove_climatology(X_qm_with_shift, self.y_climo_)
+        if self.return_anoms:
+            return self._remove_climatology(X_qm_with_shift, self.y_climo_)
+        else:
+            return X_qm_with_shift
 
     def _remove_climatology(self, obj, climatology):
         dfs = []

diff --git a/skdownscale/pointwise_models/gard.py b/skdownscale/pointwise_models/gard.py
@@ -1,15 +1,14 @@
 import numpy as np
 from scipy.spatial import cKDTree
-from sklearn.base import RegressorMixin
 from sklearn.linear_model import LinearRegression
-from sklearn.linear_model.base import LinearModel
 from sklearn.utils.validation import check_is_fitted
 
+from .base import AbstractDownscaler
 from .utils import ensure_samples_features
 
 
-class AnalogBase(LinearModel, RegressorMixin):
-    _fit_attributes = ["kdtree_", "y_"]
+class AnalogBase(AbstractDownscaler):
+    _fit_attributes = ['kdtree_', 'y_']
 
     def fit(self, X, y):
         """ Fit Analog model using a KDTree
@@ -124,7 +123,7 @@ class PureAnalog(AnalogBase):
     def __init__(
         self,
         n_analogs=200,
-        kind="best_analog",
+        kind='best_analog',
         thresh=None,
         stats=True,
         kdtree_kwargs={},
@@ -135,9 +134,9 @@ def __init__(
         self.kdtree_kwargs = kdtree_kwargs
         self.query_kwargs = query_kwargs
 
-        if kind == "best_analog" or n_analogs == 1:
+        if kind == 'best_analog' or n_analogs == 1:
             self.n_analogs = 1
-            self.kind = "best_analog"
+            self.kind = 'best_analog'
         else:
             self.n_analogs = n_analogs
             self.kind = kind
@@ -169,16 +168,16 @@ def predict(self, X):
             analog_mask = analogs > self.thresh
             masked_analogs = analogs[analog_mask]
 
-        if self.kind == "best_analog":
+        if self.kind == 'best_analog':
             predicted = analogs
 
-        elif self.kind == "sample_analogs":
+        elif self.kind == 'sample_analogs':
             # get 1 random index to sample from the analogs
             rand_inds = np.random.randint(low=0, high=self.n_analogs, size=len(X))
             # select the analog now
             predicted = select_analogs(analogs, rand_inds)
 
-        elif self.kind == "weight_analogs":
+        elif self.kind == 'weight_analogs':
             # take weighted average
             # work around for zero distances (perfect matches)
             tiny = 1e-20
@@ -188,14 +187,14 @@ def predict(self, X):
             else:
                 predicted = np.average(analogs.squeeze(), weights=weights, axis=1)
 
-        elif self.kind == "mean_analogs":
+        elif self.kind == 'mean_analogs':
             if self.thresh is not None:
                 predicted = masked_analogs.mean(axis=1)
             else:
                 predicted = analogs.mean(axis=1)
 
         else:
-            raise ValueError("got unexpected kind %s" % self.kind)
+            raise ValueError('got unexpected kind %s' % self.kind)
 
         if self.thresh is not None:
             # for mean/weight cases, this fills nans when all analogs
@@ -205,11 +204,11 @@ def predict(self, X):
         if self.stats:
             # calculate the standard deviation of the anlogs
             if self.thresh is None:
-                self.stats_["error"] = analogs.std(axis=1)
+                self.stats_['error'] = analogs.std(axis=1)
             else:
-                self.stats_["error"] = analogs.where(analog_mask).std(axis=1)
+                self.stats_['error'] = analogs.where(analog_mask).std(axis=1)
                 # calculate the probability of precip
-                self.stats_["pop"] = np.where(analog_mask, 1, 0).mean(axis=1)
+                self.stats_['pop'] = np.where(analog_mask, 1, 0).mean(axis=1)
 
         return predicted
 

diff --git a/skdownscale/pointwise_models/zscore.py b/skdownscale/pointwise_models/zscore.py
@@ -1,12 +1,12 @@
 import numpy as np
 import pandas as pd
 import xarray as xr
-from sklearn.base import RegressorMixin
-from sklearn.linear_model.base import LinearModel
 from sklearn.utils.validation import check_is_fitted
 
+from .base import AbstractDownscaler
 
-class ZScoreRegressor(LinearModel, RegressorMixin):
+
+class ZScoreRegressor(AbstractDownscaler):
     """ Z Score Regressor bias correction model wrapper
 
     Apply a scikit-learn model (e.g. Pipeline) point-by-point. The pipeline
@@ -19,7 +19,7 @@ class ZScoreRegressor(LinearModel, RegressorMixin):
         days.
     """
 
-    _fit_attributes = ["shift_", "scale_"]
+    _fit_attributes = ['shift_', 'scale_']
 
     def __init__(self, window_width=31):
 
@@ -47,15 +47,13 @@ def fit(self, X, y):
 
         X_mean, X_std = _calc_stats(X.squeeze(), self.window_width)
         y_mean, y_std = _calc_stats(y.squeeze(), self.window_width)
-        stats_dict = {
-            "X_mean": X_mean,
-            "X_std": X_std,
-            "y_mean": y_mean,
-            "y_std": y_std,
+        self.stats_dict_ = {
+            'X_mean': X_mean,
+            'X_std': X_std,
+            'y_mean': y_mean,
+            'y_std': y_std,
         }
 
-        self.stats_dict_ = stats_dict
-
         shift, scale = _get_params(X_mean, X_std, y_mean, y_std)
 
         self.shift_ = shift
@@ -90,16 +88,16 @@ def predict(self, X):
             fut_mean, fut_std, shift_expanded, scale_expanded
         )
 
-        stats_dict = {
-            "meani": fut_mean,
-            "stdi": fut_std,
-            "meanf": fut_mean_corrected,
-            "stdf": fut_std_corrected,
+        self.fut_stats_dict_ = {
+            'meani': fut_mean,
+            'stdi': fut_std,
+            'meanf': fut_mean_corrected,
+            'stdf': fut_std_corrected,
         }
 
         fut_corrected = (fut_zscore * fut_std_corrected) + fut_mean_corrected
 
-        return fut_corrected.to_frame(name), stats_dict
+        return fut_corrected.to_frame(name)
 
 
 def _reshape(da, window_width):
@@ -123,19 +121,19 @@ def _reshape(da, window_width):
 
     assert da.ndim == 1
 
-    if "time" not in da.coords and "index" in da.coords:
-        da = da.rename({"index": "time"})
-    assert "time" in da.coords
+    if 'time' not in da.coords and 'index' in da.coords:
+        da = da.rename({'index': 'time'})
+    assert 'time' in da.coords
 
     def split(g):
-        return g.rename({"time": "day"}).assign_coords(day=g.time.dt.dayofyear.values)
+        return g.rename({'time': 'day'}).assign_coords(day=g.time.dt.dayofyear.values)
 
-    da_split = da.groupby("time.year").map(split)
+    da_split = da.groupby('time.year').map(split)
 
     early_jans = da_split.isel(day=slice(None, window_width // 2))
     late_decs = da_split.isel(day=slice(-window_width // 2, None))
 
-    da_rsh = xr.concat([late_decs, da_split, early_jans], dim="day")
+    da_rsh = xr.concat([late_decs, da_split, early_jans], dim='day')
     return da_rsh
 
 
@@ -162,11 +160,11 @@ def _calc_stats(series, window_width):
     da = series.to_xarray()
     da_rsh = _reshape(da, window_width)
 
-    ds_rolled = da_rsh.rolling(day=window_width, center=True).construct("win_day")
+    ds_rolled = da_rsh.rolling(day=window_width, center=True).construct('win_day')
 
     n = window_width // 2 + 1
-    ds_mean = ds_rolled.mean(dim=["year", "win_day"]).isel(day=slice(n, -n))
-    ds_std = ds_rolled.std(dim=["year", "win_day"]).isel(day=slice(n, -n))
+    ds_mean = ds_rolled.mean(dim=['year', 'win_day']).isel(day=slice(n, -n))
+    ds_std = ds_rolled.std(dim=['year', 'win_day']).isel(day=slice(n, -n))
 
     mean = ds_mean.to_series()
     std = ds_std.to_series()