Skip to content

Commit

Permalink
updates to ecahm2020 notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
Joseph Hamman committed May 14, 2020
1 parent 331658b commit e99484f
Show file tree
Hide file tree
Showing 10 changed files with 482 additions and 695 deletions.
14 changes: 14 additions & 0 deletions .binder/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
name: skdownscale
channels:
- conda-forge
dependencies:
- pandas
- xarray
- seaborn
- probscale
- matplotlib
- scipy
- numpy
- sklearn
- dask
- distributed
2 changes: 1 addition & 1 deletion .isort.cfg
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[settings]
known_third_party = click,numpy,pandas,pkg_resources,pytest,scipy,setuptools,sklearn,xarray,xsd
known_third_party = click,matplotlib,numpy,pandas,pkg_resources,probscale,pytest,scipy,seaborn,setuptools,sklearn,xarray,xsd
Binary file modified data/downscale_test_data.zarr.zip
Binary file not shown.
925 changes: 301 additions & 624 deletions examples/2020ECAHM-scikit-downscale.ipynb

Large diffs are not rendered by default.

85 changes: 85 additions & 0 deletions examples/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import matplotlib.pyplot as plt
import pandas as pd
import probscale
import seaborn as sns
import xarray as xr


def get_sample_data(kind):

if kind == 'training':
data = xr.open_zarr('../data/downscale_test_data.zarr.zip', group=kind)
# extract 1 point of training data for precipitation and temperature
df = (
data.isel(point=0)
.to_dataframe()[['T2max', 'PREC_TOT']]
.rename(columns={'T2max': 'tmax', 'PREC_TOT': 'pcp'})
)
df['tmax'] -= 273.13
df['pcp'] *= 24
return df.resample('1d').first()
elif kind == 'targets':
data = xr.open_zarr('../data/downscale_test_data.zarr.zip', group=kind)
# extract 1 point of training data for precipitation and temperature
return (
data.isel(point=0)
.to_dataframe()[['Tmax', 'Prec']]
.rename(columns={'Tmax': 'tmax', 'Prec': 'pcp'})
)
elif kind == 'wind-hist':
return (
xr.open_dataset(
'../data/uas/uas.hist.CanESM2.CRCM5-UQAM.day.NAM-44i.raw.Colorado.19801990.nc'
)['uas']
.sel(lat=40.25, lon=-109.2, method='nearest')
.squeeze()
.to_dataframe()[['uas']]
)
elif kind == 'wind-obs':
return (
xr.open_dataset('../data/uas/uas.gridMET.NAM-44i.Colorado.19801990.nc')['uas']
.sel(lat=40.25, lon=-109.2, method='nearest')
.squeeze()
.to_dataframe()[['uas']]
)
elif kind == 'wind-rcp':
return (
xr.open_dataset(
'../data/uas/uas.rcp85.CanESM2.CRCM5-UQAM.day.NAM-44i.raw.Colorado.19902000.nc'
)['uas']
.sel(lat=40.25, lon=-109.2, method='nearest')
.squeeze()
.to_dataframe()[['uas']]
)
else:
raise ValueError(kind)

return df


def prob_plots(x, y, y_hat, shape=(2, 2), figsize=(8, 8)):

fig, axes = plt.subplots(*shape, sharex=True, sharey=True, figsize=figsize)

scatter_kws = dict(label='', marker=None, linestyle='-')
common_opts = dict(plottype='qq', problabel='', datalabel='')

for ax, (label, series) in zip(axes.flat, y_hat.items()):

scatter_kws['label'] = 'original'
fig = probscale.probplot(x, ax=ax, scatter_kws=scatter_kws, **common_opts)

scatter_kws['label'] = 'target'
fig = probscale.probplot(y, ax=ax, scatter_kws=scatter_kws, **common_opts)

scatter_kws['label'] = 'corrected'
fig = probscale.probplot(series, ax=ax, scatter_kws=scatter_kws, **common_opts)
ax.set_title(label)
ax.legend()

[ax.set_xlabel('Standard Normal Quantiles') for ax in axes[-1]]
[ax.set_ylabel('Temperature [C]') for ax in axes[:, 0]]
[fig.delaxes(ax) for ax in axes.flat[len(y_hat.keys()) :]]
fig.tight_layout()

return fig
6 changes: 6 additions & 0 deletions skdownscale/pointwise_models/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from sklearn.base import RegressorMixin
from sklearn.linear_model.base import LinearModel


class AbstractDownscaler(LinearModel, RegressorMixin):
pass
20 changes: 14 additions & 6 deletions skdownscale/pointwise_models/bcsd.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,27 @@
from sklearn.linear_model.base import LinearModel
from sklearn.utils.validation import check_is_fitted

from .base import AbstractDownscaler
from .utils import QuantileMapper, ensure_samples_features


def MONTH_GROUPER(x):
return x.month


class BcsdBase(LinearModel, RegressorMixin):
class BcsdBase(AbstractDownscaler):
""" Base class for BCSD model.
"""

_fit_attributes = ["y_climo_", "quantile_mappers_"]
_fit_attributes = ['y_climo_', 'quantile_mappers_']

def __init__(self, time_grouper=MONTH_GROUPER, **qm_kwargs):
def __init__(self, time_grouper=MONTH_GROUPER, return_anoms=True, **qm_kwargs):
if isinstance(time_grouper, str):
self.time_grouper = pd.Grouper(freq=time_grouper)
else:
self.time_grouper = time_grouper

self.return_anoms = return_anoms
self.qm_kwargs = qm_kwargs

def _qm_fit_by_group(self, groups):
Expand Down Expand Up @@ -88,7 +90,7 @@ def fit(self, X, y):
# calculate the climatologies
self.y_climo_ = y_groups.mean()
if self.y_climo_.values.min() <= 0:
raise ValueError("Invalid value in target climatology")
raise ValueError('Invalid value in target climatology')

# fit the quantile mappers
self._qm_fit_by_group(y_groups)
Expand Down Expand Up @@ -116,7 +118,10 @@ def predict(self, X):
Xqm = self._qm_transform_by_group(X.groupby(self.time_grouper))

# calculate the anomalies as a ratio of the training data
return self._calc_ratio_anoms(Xqm, self.y_climo_)
if self.return_anoms:
return self._calc_ratio_anoms(Xqm, self.y_climo_)
else:
return Xqm

def _calc_ratio_anoms(self, obj, climatology):
dfs = []
Expand Down Expand Up @@ -190,7 +195,10 @@ def rolling_func(x):
# restore the shift
X_qm_with_shift = X_shift + Xqm
# calculate the anomalies
return self._remove_climatology(X_qm_with_shift, self.y_climo_)
if self.return_anoms:
return self._remove_climatology(X_qm_with_shift, self.y_climo_)
else:
return X_qm_with_shift

def _remove_climatology(self, obj, climatology):
dfs = []
Expand Down
29 changes: 14 additions & 15 deletions skdownscale/pointwise_models/gard.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
import numpy as np
from scipy.spatial import cKDTree
from sklearn.base import RegressorMixin
from sklearn.linear_model import LinearRegression
from sklearn.linear_model.base import LinearModel
from sklearn.utils.validation import check_is_fitted

from .base import AbstractDownscaler
from .utils import ensure_samples_features


class AnalogBase(LinearModel, RegressorMixin):
_fit_attributes = ["kdtree_", "y_"]
class AnalogBase(AbstractDownscaler):
_fit_attributes = ['kdtree_', 'y_']

def fit(self, X, y):
""" Fit Analog model using a KDTree
Expand Down Expand Up @@ -124,7 +123,7 @@ class PureAnalog(AnalogBase):
def __init__(
self,
n_analogs=200,
kind="best_analog",
kind='best_analog',
thresh=None,
stats=True,
kdtree_kwargs={},
Expand All @@ -135,9 +134,9 @@ def __init__(
self.kdtree_kwargs = kdtree_kwargs
self.query_kwargs = query_kwargs

if kind == "best_analog" or n_analogs == 1:
if kind == 'best_analog' or n_analogs == 1:
self.n_analogs = 1
self.kind = "best_analog"
self.kind = 'best_analog'
else:
self.n_analogs = n_analogs
self.kind = kind
Expand Down Expand Up @@ -169,16 +168,16 @@ def predict(self, X):
analog_mask = analogs > self.thresh
masked_analogs = analogs[analog_mask]

if self.kind == "best_analog":
if self.kind == 'best_analog':
predicted = analogs

elif self.kind == "sample_analogs":
elif self.kind == 'sample_analogs':
# get 1 random index to sample from the analogs
rand_inds = np.random.randint(low=0, high=self.n_analogs, size=len(X))
# select the analog now
predicted = select_analogs(analogs, rand_inds)

elif self.kind == "weight_analogs":
elif self.kind == 'weight_analogs':
# take weighted average
# work around for zero distances (perfect matches)
tiny = 1e-20
Expand All @@ -188,14 +187,14 @@ def predict(self, X):
else:
predicted = np.average(analogs.squeeze(), weights=weights, axis=1)

elif self.kind == "mean_analogs":
elif self.kind == 'mean_analogs':
if self.thresh is not None:
predicted = masked_analogs.mean(axis=1)
else:
predicted = analogs.mean(axis=1)

else:
raise ValueError("got unexpected kind %s" % self.kind)
raise ValueError('got unexpected kind %s' % self.kind)

if self.thresh is not None:
# for mean/weight cases, this fills nans when all analogs
Expand All @@ -205,11 +204,11 @@ def predict(self, X):
if self.stats:
# calculate the standard deviation of the anlogs
if self.thresh is None:
self.stats_["error"] = analogs.std(axis=1)
self.stats_['error'] = analogs.std(axis=1)
else:
self.stats_["error"] = analogs.where(analog_mask).std(axis=1)
self.stats_['error'] = analogs.where(analog_mask).std(axis=1)
# calculate the probability of precip
self.stats_["pop"] = np.where(analog_mask, 1, 0).mean(axis=1)
self.stats_['pop'] = np.where(analog_mask, 1, 0).mean(axis=1)

return predicted

Expand Down
50 changes: 24 additions & 26 deletions skdownscale/pointwise_models/zscore.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import numpy as np
import pandas as pd
import xarray as xr
from sklearn.base import RegressorMixin
from sklearn.linear_model.base import LinearModel
from sklearn.utils.validation import check_is_fitted

from .base import AbstractDownscaler

class ZScoreRegressor(LinearModel, RegressorMixin):

class ZScoreRegressor(AbstractDownscaler):
""" Z Score Regressor bias correction model wrapper
Apply a scikit-learn model (e.g. Pipeline) point-by-point. The pipeline
Expand All @@ -19,7 +19,7 @@ class ZScoreRegressor(LinearModel, RegressorMixin):
days.
"""

_fit_attributes = ["shift_", "scale_"]
_fit_attributes = ['shift_', 'scale_']

def __init__(self, window_width=31):

Expand Down Expand Up @@ -47,15 +47,13 @@ def fit(self, X, y):

X_mean, X_std = _calc_stats(X.squeeze(), self.window_width)
y_mean, y_std = _calc_stats(y.squeeze(), self.window_width)
stats_dict = {
"X_mean": X_mean,
"X_std": X_std,
"y_mean": y_mean,
"y_std": y_std,
self.stats_dict_ = {
'X_mean': X_mean,
'X_std': X_std,
'y_mean': y_mean,
'y_std': y_std,
}

self.stats_dict_ = stats_dict

shift, scale = _get_params(X_mean, X_std, y_mean, y_std)

self.shift_ = shift
Expand Down Expand Up @@ -90,16 +88,16 @@ def predict(self, X):
fut_mean, fut_std, shift_expanded, scale_expanded
)

stats_dict = {
"meani": fut_mean,
"stdi": fut_std,
"meanf": fut_mean_corrected,
"stdf": fut_std_corrected,
self.fut_stats_dict_ = {
'meani': fut_mean,
'stdi': fut_std,
'meanf': fut_mean_corrected,
'stdf': fut_std_corrected,
}

fut_corrected = (fut_zscore * fut_std_corrected) + fut_mean_corrected

return fut_corrected.to_frame(name), stats_dict
return fut_corrected.to_frame(name)


def _reshape(da, window_width):
Expand All @@ -123,19 +121,19 @@ def _reshape(da, window_width):

assert da.ndim == 1

if "time" not in da.coords and "index" in da.coords:
da = da.rename({"index": "time"})
assert "time" in da.coords
if 'time' not in da.coords and 'index' in da.coords:
da = da.rename({'index': 'time'})
assert 'time' in da.coords

def split(g):
return g.rename({"time": "day"}).assign_coords(day=g.time.dt.dayofyear.values)
return g.rename({'time': 'day'}).assign_coords(day=g.time.dt.dayofyear.values)

da_split = da.groupby("time.year").map(split)
da_split = da.groupby('time.year').map(split)

early_jans = da_split.isel(day=slice(None, window_width // 2))
late_decs = da_split.isel(day=slice(-window_width // 2, None))

da_rsh = xr.concat([late_decs, da_split, early_jans], dim="day")
da_rsh = xr.concat([late_decs, da_split, early_jans], dim='day')
return da_rsh


Expand All @@ -162,11 +160,11 @@ def _calc_stats(series, window_width):
da = series.to_xarray()
da_rsh = _reshape(da, window_width)

ds_rolled = da_rsh.rolling(day=window_width, center=True).construct("win_day")
ds_rolled = da_rsh.rolling(day=window_width, center=True).construct('win_day')

n = window_width // 2 + 1
ds_mean = ds_rolled.mean(dim=["year", "win_day"]).isel(day=slice(n, -n))
ds_std = ds_rolled.std(dim=["year", "win_day"]).isel(day=slice(n, -n))
ds_mean = ds_rolled.mean(dim=['year', 'win_day']).isel(day=slice(n, -n))
ds_std = ds_rolled.std(dim=['year', 'win_day']).isel(day=slice(n, -n))

mean = ds_mean.to_series()
std = ds_std.to_series()
Expand Down
Loading

0 comments on commit e99484f

Please sign in to comment.