-
Notifications
You must be signed in to change notification settings - Fork 44
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Metrics for temporal subgroups (#266)
* Update environment * First implementation of flexible set metrics * Fix keyword for metrics calculation when reference dataset must be included * Update tests * Update CHANGELOG.rst * Update CHANGELOG.rst * Update env * Remove unnecessary checks for data availability * Undo * Fix Test * Make bootstrapping settings better accessible when using the validation framework * Renamed GenericDatetime to YearlessDatetime and moved to grouping module * Update notebook to include subset metrics and reader adapters * Update tests * Change yearless date name * Fix tests
- Loading branch information
Showing
6 changed files
with
980 additions
and
187 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,22 +26,23 @@ | |
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, | ||
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
# Author: Christoph Paulik [email protected] | ||
# Creation date: 2014-06-30 | ||
|
||
|
||
""" | ||
Module provides grouping functions that can be used together with pandas | ||
to create a few unusual time groupings, e.g. decadal products where | ||
there are three products per month with timestamps on the 10th 20th and last | ||
of the month | ||
""" | ||
from dataclasses import dataclass | ||
from typing import Optional, Union, Tuple, List | ||
|
||
import pandas as pd | ||
import numpy as np | ||
from datetime import date | ||
from datetime import date, datetime | ||
import calendar | ||
|
||
from cadati.conv_doy import doy | ||
|
||
|
||
def group_by_day_bin(df, bins=[1, 11, 21, 32], start=False, | ||
dtindex=None): | ||
|
@@ -153,3 +154,239 @@ def grouped_dates_between(start_date, end_date, bins=[1, 11, 21, 32], start=Fals | |
tstamps = grp.sum().index.to_pydatetime().tolist() | ||
|
||
return tstamps | ||
|
||
|
||
@dataclass
class YearlessDatetime:
    """
    Container class to store Datetime information without a year. This is
    used to group data when the year is not relevant (e.g. seasonal analysis).
    Only down to second. Used by
    :class:`pytesmo.validation_framework.metric_calculators_adapters.TsDistributor`

    Instances are ordered chronologically within a year, i.e. by
    (month, day, hour, minute, second).

    Parameters
    ----------
    month : int
        Month of the year (1-12).
    day : int, optional (default: 1)
        Day of the month.
    hour : int, optional (default: 0)
    minute : int, optional (default: 0)
    second : int, optional (default: 0)
    """
    month: int

    day: int = 1
    hour: int = 0
    minute: int = 0
    second: int = 0

    def _sort_key(self) -> Tuple[int, ...]:
        # Position within a year. Comparing these tuples orders two
        # instances exactly like comparing the corresponding datetimes of
        # one common (leap) year, without having to build them.
        return (self.month, self.day, self.hour, self.minute, self.second)

    def __ge__(self, other: 'YearlessDatetime'):
        if not isinstance(other, YearlessDatetime):
            # let Python try the reflected operation / raise TypeError
            return NotImplemented
        return self._sort_key() >= other._sort_key()

    def __le__(self, other: 'YearlessDatetime'):
        if not isinstance(other, YearlessDatetime):
            return NotImplemented
        return self._sort_key() <= other._sort_key()

    def __lt__(self, other: 'YearlessDatetime'):
        if not isinstance(other, YearlessDatetime):
            return NotImplemented
        return self._sort_key() < other._sort_key()

    def __gt__(self, other: 'YearlessDatetime'):
        if not isinstance(other, YearlessDatetime):
            return NotImplemented
        return self._sort_key() > other._sort_key()

    def __repr__(self):
        return f"****-{self.month:02}-{self.day:02}" \
               f"T{self.hour:02}:{self.minute:02}:{self.second:02}"

    @property
    def doy(self) -> int:
        """
        Get day of year for this date. Assume leap year!
        i.e.: 1=Jan.1st, 366=Dec.31st, 60=Feb.29th.
        """
        # Inside the method body ``doy`` refers to the module level helper
        # function, not to this property.
        return doy(self.month, self.day, year=None)

    @classmethod
    def from_datetime(cls, dt: datetime):
        """
        Omit year from passed datetime to create generic datetime.

        Parameters
        ----------
        dt : datetime
            Datetime of which month, day, hour, minute and second are kept.

        Returns
        -------
        yearless : YearlessDatetime
        """
        return cls(dt.month, dt.day, dt.hour, dt.minute, dt.second)

    def to_datetime(self, years: Optional[Union[Tuple[int, ...], int]]) \
            -> Union[datetime, List, None]:
        """
        Convert generic datetime to datetime with year.
        Feb 29th for non-leap-years will return None

        Parameters
        ----------
        years : int or tuple of int
            Year(s) to combine with this yearless datetime.

        Returns
        -------
        dt : datetime or list of datetime or None
            - None if no datetime could be created (Feb 29th was requested
              for non-leap years only)
            - a single datetime if exactly one could be created
            - otherwise a list of datetimes, in the order of `years`

        Raises
        ------
        ValueError
            If the stored fields do not form a valid date/time.
        """
        # Checked on month/day directly so that only a real Feb 29th is
        # skipped; any other impossible date is reported by ``datetime``.
        is_feb29 = (self.month == 2) and (self.day == 29)

        resolved = []
        for year in np.atleast_1d(years):
            if is_feb29 and not calendar.isleap(year):
                continue
            resolved.append(
                datetime(year, self.month, self.day, self.hour,
                         self.minute, self.second))

        if not resolved:
            return None
        if len(resolved) == 1:
            return resolved[0]
        return resolved
|
||
|
||
class TsDistributor:
    """
    Select time stamps of a datetime index based on individual dates, date
    ranges, yearless dates and yearless date ranges. A time stamp is selected
    if it matches at least one of the configured components.
    """

    def __init__(self,
                 dates=None,
                 date_ranges=None,
                 yearless_dates=None,
                 yearless_date_ranges=None):
        """
        Build a data distributor from individual dates, date ranges, generic
        dates (without specific year) and generic date ranges.

        Components:
            - individual datetime objects for distinct dates
            - generic datetime objects for dates without a specific year
            - date range / datetime tuple
                i.e. ALL datetimes between the 2 passed dates (start, end)
            - generic date range / generic datetime tuple
                i.e. ALL datetimes between 2 generic dates (for any year)

        Parameters
        ----------
        dates : Tuple[datetime, ...] or Tuple[str, ...] or pd.DatetimeIndex
            Individual dates (that also have a year assigned).
        date_ranges: Tuple[Tuple[datetime, datetime], ...]
            A list of date ranges, consisting of a start and end date for each
            range. If the start date is later than the end date, the two
            are swapped.
        yearless_dates: Tuple[YearlessDatetime,...] or Tuple[datetime...]
            A list of generic dates (that apply to any year).
            Can be passed as a list of
             - YearlessDatetime objects
                e.g. YearlessDatetime(5,31,12,1,10), ie. May 31st 12:01:10
             - pydatetime objects (years will be ignored)
        yearless_date_ranges: [Tuple[YearlessDatetime, YearlessDatetime], ...]
            A list of generic date ranges (that apply to any year).
            If the end lies before the start within a year (e.g. Dec 1st to
            Feb 28th), the range wraps around the turn of the year.
            pydatetime objects are accepted as well (years will be ignored).
        """

        self.dates = dates
        self.date_ranges = date_ranges
        self.yearless_dates = yearless_dates
        self.yearless_date_ranges = yearless_date_ranges

    def __repr__(self):
        s = []
        for var in ['dates', 'date_ranges', 'yearless_dates',
                    'yearless_date_ranges']:
            val = getattr(self, var)
            s.append(f"#{var}={len(val) if val is not None else 0}")

        return f"{self.__class__.__name__}({', '.join(s)})"

    @staticmethod
    def _clock_key(yd):
        # Position of a yearless (or regular) datetime within a year.
        return (yd.month, yd.day, yd.hour, yd.minute, yd.second)

    @staticmethod
    def _resolve(yd, year):
        # Datetime of `yd` in `year`; None for Feb 29th in a non-leap year.
        # Only month..second attributes are read, so YearlessDatetime and
        # datetime objects are handled alike (the year of the latter is
        # ignored).
        if yd.month == 2 and yd.day == 29 and not calendar.isleap(year):
            return None
        return datetime(int(year), yd.month, yd.day, yd.hour, yd.minute,
                        yd.second)

    def select(self,
               df: Union[pd.DataFrame, pd.Series, pd.DatetimeIndex],
               set_nan=False):
        """
        Select rows from data frame or series with matching date time indices.

        Parameters
        ----------
        df: pd.DataFrame or pd.Series or pd.DatetimeIndex
            Must have (or be) a date time index, which is then filtered
            based on the dates.
        set_nan: bool, optional (default: False)
            Instead of dropping rows that are not selected, set their values to
            nan. For a DataFrame / Series this modifies the passed object
            in place (and returns it). For a DatetimeIndex a new index with
            NaT at the not selected positions is returned.

        Returns
        -------
        df: pd.DataFrame or pd.Series or pd.DatetimeIndex
            The filtered input data

        Raises
        ------
        ValueError
            If the index of `df` is not a DatetimeIndex.
        """
        if isinstance(df, pd.DatetimeIndex):
            idx = df
        else:
            idx = df.index

        if not isinstance(idx, pd.DatetimeIndex):
            raise ValueError(f"Expected a DatetimeIndex, "
                             f"but got {type(df.index)}.")

        # plain boolean array: positional, no index alignment required
        keep = self.filter(idx).to_numpy()

        if not set_nan:
            return df[keep]

        if isinstance(df, pd.DatetimeIndex):
            # an Index is immutable, item assignment would raise
            return df.where(keep)

        df[~keep] = np.nan
        return df

    def filter(self, idx: pd.DatetimeIndex):
        """
        Create a boolean mask for a datetime index that marks all time
        stamps matched by at least one component of this distributor.

        Parameters
        ----------
        idx: pd.DatetimeIndex
            Datetime index to filter

        Returns
        -------
        mask: pd.Series
            Boolean series with (a copy of) `idx` as index. True where the
            time stamp is selected.
        """
        selected = np.zeros(len(idx), dtype=bool)
        idx_years = np.asarray(idx.year)
        years = np.unique(idx_years)

        if self.dates is not None:
            selected |= idx.isin(pd.DatetimeIndex(self.dates))

        if self.date_ranges is not None:
            for drange in self.date_ranges:
                start, end = drange[0], drange[1]
                if start > end:
                    start, end = end, start
                selected |= (idx >= start) & (idx <= end)

        if self.yearless_dates is not None:
            # Expand each yearless date to every year present in the index.
            candidates = []
            for d in self.yearless_dates:
                for y in years:
                    dt = self._resolve(d, y)
                    if dt is not None:
                        candidates.append(dt)
            selected |= idx.isin(pd.DatetimeIndex(candidates))

        if self.yearless_date_ranges is not None:
            for gdrange in self.yearless_date_ranges:
                gstart, gend = gdrange[0], gdrange[1]
                # Decided on the yearless values, independent of the leap
                # year substitutions applied below.
                wraps = self._clock_key(gend) < self._clock_key(gstart)

                for y in years:
                    start_dt = self._resolve(gstart, y)
                    if start_dt is None:
                        # Feb 29th does not exist: start on Mar 1st
                        start_dt = datetime(int(y), 3, 1)

                    end_dt = self._resolve(gend, y)
                    if end_dt is None:
                        # Feb 29th does not exist: end with Feb 28th
                        end_dt = datetime(int(y), 2, 28, 23, 59, 59)

                    after_start = idx >= start_dt
                    before_end = idx <= end_dt

                    if wraps:
                        # Range crosses the turn of the year. Within year
                        # `y` this covers the time up to the end AND the
                        # time from the start on, so every year of the
                        # index (including the first) is handled
                        # completely on its own.
                        hit = after_start | before_end
                    else:
                        hit = after_start & before_end

                    selected |= (idx_years == y) & hit

        return pd.Series(selected, index=idx.copy())
Oops, something went wrong.