From 3617649f97a4d43be53a38ef7462ca1bc5b5086e Mon Sep 17 00:00:00 2001 From: Wolfgang Preimesberger Date: Tue, 22 Aug 2023 10:02:01 +0200 Subject: [PATCH] Metrics for temporal subgroups (#266) * Update environment * First implementation of flexible set metrics * Fix keyword for metrics calculation when reference dataset must be included * Update tests * Update CHANGELOG.rst * Update CHANGELOG.rst * Update env * Remove unnecessary checks for data availability * Undo * Fix Test * Make bootstrapping settings better accessible when using the validation framework * Renamed GenericDatetime to YearlessDatetime and moved to grouping module * Update notebook to include subset metrics and reader adapters * Update tests * Change yearless date name * Fix tests --- docs/examples/validation_framework.ipynb | 546 +++++++++++++++--- src/pytesmo/time_series/grouping.py | 245 +++++++- src/pytesmo/validation_framework/adapters.py | 58 +- .../metric_calculators_adapters.py | 180 +++--- .../validation_framework/validation.py | 7 +- .../test_metric_calculators_adapters.py | 131 +++++ 6 files changed, 980 insertions(+), 187 deletions(-) create mode 100644 tests/test_validation_framework/test_metric_calculators_adapters.py diff --git a/docs/examples/validation_framework.ipynb b/docs/examples/validation_framework.ipynb index aebc5657..5b41390b 100644 --- a/docs/examples/validation_framework.ipynb +++ b/docs/examples/validation_framework.ipynb @@ -74,12 +74,15 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", + "import numpy as np\n", "from pathlib import Path\n", - "\n", + "from pprint import pprint\n", "import pytesmo.validation_framework.metric_calculators as metrics_calculators\n", "from datetime import datetime\n", "import warnings\n", "from ascat.read_native.cdr import AscatGriddedNcTs\n", + "\n", "from ismn.interface import ISMN_Interface # install ismn: 'pip install ismn'\n", "from pytesmo.validation_framework.validation import Validation\n", "from 
pytesmo.validation_framework.results_manager import netcdf_results_manager\n", @@ -104,13 +107,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Data is stored in: /tmp/tmprnczvqbv\n" + "Data is stored in: /tmp/tmppeisd__e\n" ] } ], "source": [ "from tempfile import mkdtemp\n", - "output_folder = mkdtemp()\n", + "output_folder = Path(mkdtemp())\n", "print('Data is stored in:', output_folder)" ] }, @@ -146,6 +149,7 @@ " grid_filename=ascat_grid_fname,\n", " static_layer_path=static_layer_path\n", " )\n", + "\n", "ascat_reader.read_bulk = True" ] }, @@ -165,7 +169,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing metadata for all ismn stations into folder /home/samuel/projects/QA4SM/pytesmo/tests/test-data/ismn/multinetwork/header_values.\n", + "Processing metadata for all ismn stations into folder /home/wpreimes/shares/home/code/pytesmo/tests/test-data/ismn/multinetwork/header_values.\n", "This may take a few minutes, but is only done once...\n", "Hint: Use `parallel=True` to speed up metadata generation for large datasets\n" ] @@ -174,7 +178,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Files Processed: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 226.95it/s]" + "Files Processed: 100%|██████████| 8/8 [00:00<00:00, 65.44it/s]" ] }, { @@ -182,8 +186,8 @@ "output_type": "stream", "text": [ "Metadata generation finished after 0 Seconds.\n", - "Metadata and Log stored in /tmp/tmpyw8mctbb\n", - "Found existing ismn metadata in /tmp/tmpyw8mctbb/header_values.csv.\n" + "Metadata and Log stored in /tmp/tmp_72gspsz\n", + "Found existing ismn metadata in /tmp/tmp_72gspsz/header_values.csv.\n" ] }, { @@ -212,11 +216,6 @@ "**DO NOT CHANGE** the name ***jobs*** because it will be searched during the parallel processing!" 
] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, { "cell_type": "code", "execution_count": 5, @@ -332,26 +331,26 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/samuel/projects/QA4SM/pytesmo/src/pytesmo/validation_framework/validation.py:141: UserWarning: You are using the default temporal matcher. If you are using one of the newer metric calculators (PairwiseIntercomparisonMetrics, TripleCollocationMetrics) you should probably use `make_combined_temporal_matcher` instead. Have a look at the documentation of the metric calculators for more info.\n", + "/home/wpreimes/shares/home/code/pytesmo/src/pytesmo/validation_framework/validation.py:144: UserWarning: You are using the default temporal matcher. If you are using one of the newer metric calculators (PairwiseIntercomparisonMetrics, TripleCollocationMetrics) you should probably use `make_combined_temporal_matcher` instead. Have a look at the documentation of the metric calculators for more info.\n", " warnings.warn(\n" ] } ], "source": [ "period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]\n", - "basic_metrics = metrics_calculators.BasicMetrics(other_name='k1')\n", + "basic_metrics = metrics_calculators.PairwiseIntercomparisonMetrics()\n", "\n", "process = Validation(\n", " datasets, 'ISMN',\n", " temporal_ref='ASCAT',\n", - " scaling='cdf_match',\n", + " scaling='mean_std',\n", " scaling_ref='ASCAT',\n", " metrics_calculators={(2, 2): basic_metrics.calc_metrics},\n", " period=period)" @@ -375,7 +374,7 @@ "together and then combinations of two input datasets are given to one metric calculator while all three datasets\n", "are given to another metric calculator. 
This could look like this:\n", "\n", - "```python\n", + "```\n", "{ (3 ,2): metric_calc,\n", " (3, 3): triple_collocation}\n", "```\n", @@ -386,44 +385,52 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{(('ASCAT', 'sm'), ('ISMN', 'soil_moisture')): {'BIAS': array([0.18759985], dtype=float32),\n", - " 'R': array([0.47605154], dtype=float32),\n", - " 'RMSD': array([11.537956], dtype=float32),\n", + "{(('ASCAT', 'sm'), ('ISMN', 'soil_moisture')): {'BIAS': array([1.4210855e-14], dtype=float32),\n", + " 'BIAS_ci_lower': array([-1.124606], dtype=float32),\n", + " 'BIAS_ci_upper': array([1.124606], dtype=float32),\n", + " 'R': array([0.5342869], dtype=float32),\n", + " 'RMSD': array([10.789426], dtype=float32),\n", + " 'RSS': array([41558.98], dtype=float32),\n", + " 'R_ci_lower': array([0.45576647], dtype=float32),\n", + " 'R_ci_upper': array([0.60455596], dtype=float32),\n", " 'gpi': array([0], dtype=int32),\n", " 'lat': array([33.8833]),\n", " 'lon': array([102.1333]),\n", + " 'mse': array([116.41171], dtype=float32),\n", + " 'mse_bias': array([2.019484e-28], dtype=float32),\n", + " 'mse_corr': array([116.41171], dtype=float32),\n", + " 'mse_var': array([3.1554436e-30], dtype=float32),\n", " 'n_obs': array([357], dtype=int32),\n", - " 'p_R': array([1.3616014e-21], dtype=float32),\n", + " 'p_R': array([9.664754e-28], dtype=float32),\n", " 'p_rho': array([2.471651e-28], dtype=float32),\n", - " 'p_tau': array([nan], dtype=float32),\n", + " 'p_tau': array([2.1020434e-26], dtype=float32),\n", " 'rho': array([0.53934574], dtype=float32),\n", - " 'tau': array([nan], dtype=float32)}}\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/samuel/projects/QA4SM/pytesmo/src/pytesmo/cdf_matching.py:174: UserWarning: The bins have been resized\n", - " warnings.warn(\"The bins have been resized\")\n" + " 'rho_ci_lower': 
array([0.45559874], dtype=float32),\n", + " 'rho_ci_upper': array([0.61362934], dtype=float32),\n", + " 'status': array([0], dtype=int32),\n", + " 'tau': array([0.3907124], dtype=float32),\n", + " 'tau_ci_lower': array([0.35201582], dtype=float32),\n", + " 'tau_ci_upper': array([0.42807564], dtype=float32),\n", + " 'urmsd': array([10.789426], dtype=float32),\n", + " 'urmsd_ci_lower': array([10.065898], dtype=float32),\n", + " 'urmsd_ci_upper': array([11.661137], dtype=float32)}}\n" ] } ], "source": [ - "save_path = output_folder\n", + "save_path = output_folder / 'ascat_ismn'\n", "\n", - "import pprint\n", "for job in jobs:\n", - " \n", + "\n", " results = process.calc(*job)\n", - " pprint.pprint(results)\n", + " pprint(results)\n", " netcdf_results_manager(results, save_path)" ] }, @@ -444,7 +451,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -457,15 +464,32 @@ "_row_size [0]\n", "time []\n", "gpi [0]\n", - "n_obs [357]\n", - "R [0.47605154]\n", - "p_R [1.3616014e-21]\n", + "status [0]\n", + "R [0.5342869]\n", + "p_R [9.664754e-28]\n", + "BIAS [1.4210855e-14]\n", + "RMSD [10.789426]\n", + "mse [116.41171]\n", + "RSS [41558.98]\n", + "mse_corr [116.41171]\n", + "mse_bias [2.019484e-28]\n", + "urmsd [10.789426]\n", + "mse_var [3.1554436e-30]\n", "rho [0.53934574]\n", "p_rho [2.471651e-28]\n", - "RMSD [11.537956]\n", - "BIAS [0.18759985]\n", - "tau [nan]\n", - "p_tau [nan]\n" + "tau [0.3907124]\n", + "p_tau [2.1020434e-26]\n", + "BIAS_ci_lower [-1.124606]\n", + "BIAS_ci_upper [1.124606]\n", + "urmsd_ci_lower [10.065898]\n", + "urmsd_ci_upper [11.661137]\n", + "R_ci_lower [0.45576647]\n", + "R_ci_upper [0.60455596]\n", + "rho_ci_lower [0.45559874]\n", + "rho_ci_upper [0.61362934]\n", + "tau_ci_lower [0.35201582]\n", + "tau_ci_upper [0.42807564]\n", + "n_obs [357]\n" ] } ], @@ -478,6 +502,174 @@ " print(var, ds.variables[var][:])" ] }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + 
"name": "#%% md\n" + } + }, + "source": [ + "## Metric Calculator Adapters\n", + "\n", + "Metric calculators compute a set of comparison metrics based on the passed input. Usually the input are two or more collocated time series. However, to validate soil moisture it is often desired to assess the quality for certain temporal subsets independently. E.g. if a varying level of agreement between two datasets is expected for two different seasons (e.g. wet/dry season, summer/winter) or if the varying level of agreement should be assessed (e.g. on a seasonal/monthly basis). In this case pytesmo provides adapters to split up the input time series before computing validation metrics. These adapters work with any (properly implemented) metric calculator. You can use one of the predefined adapters in `pytesmo.validation_framework.metric_calculators_adapters`. Currently, there are 2 options:\n", + "\n", + "- `SubsetsMetricsAdapter`: This adapter lets you define arbitrary temporal subsets of your time series data before metrics computation. You can compute metrics for a certain set of datetimes or datetime ranges.\n", + " - `MonthsMetricsAdapter`: This is a version of a SubsetsMetricsAdapter that allows splitting up the data based on their month. You can e.g. compute validation metrics for each month of a year individually, or for each season (i.e. 3 * 4 months), or any other combination of months.\n", + "\n", + "Let's look at an example. Here we use the base `SubsetsMetricsAdapter`. We compute the same metrics as before, but apply the metrics calculator to a set of predefined temporal subgroups. 
Note that groups in this example were chosen to demonstrate the feature range of the `TsDistributor` rather than to produce meaningful results.\n", + "\n", + "We create a SubsetsMetricsAdapter that takes the original metrics calculator and a list of named, temporal subsets (dict).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import os\n", + "from pytesmo.validation_framework.metric_calculators_adapters import SubsetsMetricsAdapter, TsDistributor\n", + "from pytesmo.time_series.grouping import YearlessDatetime\n", + "from datetime import datetime\n", + "import shutil\n", + "\n", + "subsets = {\n", + " # The first subset does only include values in August 2010 and 2011 + July 31st of both years\n", + " 'August2010/11': TsDistributor(dates=[datetime(year=2009, month=7, day=31), datetime(2010, 7, 31)],\n", + " date_ranges=[(datetime(2010, 7, 1, 0, 0), datetime(2010, 7, 31)),\n", + " (datetime(2011, 7, 1), datetime(2011, 7, 31))]),\n", + " \n", + " # The second subset includes all values from June to September of ANY YEAR, but not August\n", + " 'JJS': TsDistributor(yearless_date_ranges=[\n", + " (YearlessDatetime(month=6, day=1, hour=0, minute=0), YearlessDatetime(7, 31)),\n", + " (YearlessDatetime(9, 1), YearlessDatetime(9, 30))\n", + " ]),\n", + " \n", + " # The third group includes all values from Feb 28th to the end of April of ANY YEAR\n", + " 'MarchApril': TsDistributor(yearless_date_ranges=[(YearlessDatetime(2, 28), YearlessDatetime(4, 30))]),\n", + "}\n", + "\n", + "adapted_intercomparison_metrics = SubsetsMetricsAdapter(\n", + " metrics_calculators.PairwiseIntercomparisonMetrics(calc_kendall=False, calc_spearman=False),\n", + " subsets,\n", + " group_results='join')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "The adapted metrics calculator can now be used as before. 
The results will contain the chosen group names accordingly, as defined in the adapter. The results manager will write them to a netcdf file. Note that this only works as `group_results='join'` was selected, otherwise the output format would be different from what the results manager expects." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{(('ASCAT', 'sm'), ('ISMN', 'soil_moisture')): {'August2010/11|BIAS': array([0.02893662], dtype=float32),\n", + " 'August2010/11|BIAS_ci_lower': array([0.01474692], dtype=float32),\n", + " 'August2010/11|BIAS_ci_upper': array([0.04312633], dtype=float32),\n", + " 'August2010/11|R': array([0.7176565], dtype=float32),\n", + " 'August2010/11|RMSD': array([0.04613708], dtype=float32),\n", + " 'August2010/11|RSS': array([0.05960163], dtype=float32),\n", + " 'August2010/11|R_ci_lower': array([0.47057065], dtype=float32),\n", + " 'August2010/11|R_ci_upper': array([0.8603755], dtype=float32),\n", + " 'August2010/11|mse': array([0.00212863], dtype=float32),\n", + " 'August2010/11|mse_bias': array([0.00083733], dtype=float32),\n", + " 'August2010/11|mse_corr': array([0.00128012], dtype=float32),\n", + " 'August2010/11|mse_var': array([1.1178111e-05], dtype=float32),\n", + " 'August2010/11|n_obs': array([28], dtype=int32),\n", + " 'August2010/11|p_R': array([1.718043e-05], dtype=float32),\n", + " 'August2010/11|status': array([0], dtype=int32),\n", + " 'August2010/11|urmsd': array([0.03593468], dtype=float32),\n", + " 'August2010/11|urmsd_ci_lower': array([0.02893201], dtype=float32),\n", + " 'August2010/11|urmsd_ci_upper': array([0.04980955], dtype=float32),\n", + " 'JJS|BIAS': array([0.02439872], dtype=float32),\n", + " 'JJS|BIAS_ci_lower': array([0.01300029], dtype=float32),\n", + " 'JJS|BIAS_ci_upper': array([0.03579714], dtype=float32),\n", + " 'JJS|R': array([0.43224052],
dtype=float32),\n", + " 'JJS|RMSD': array([0.08166534], dtype=float32),\n", + " 'JJS|RSS': array([1.2204688], dtype=float32),\n", + " 'JJS|R_ci_lower': array([0.3063946], dtype=float32),\n", + " 'JJS|R_ci_upper': array([0.54323655], dtype=float32),\n", + " 'JJS|mse': array([0.00666923], dtype=float32),\n", + " 'JJS|mse_bias': array([0.0005953], dtype=float32),\n", + " 'JJS|mse_corr': array([0.00590448], dtype=float32),\n", + " 'JJS|mse_var': array([0.00016945], dtype=float32),\n", + " 'JJS|n_obs': array([183], dtype=int32),\n", + " 'JJS|p_R': array([9.96128e-10], dtype=float32),\n", + " 'JJS|status': array([0], dtype=int32),\n", + " 'JJS|urmsd': array([0.07793543], dtype=float32),\n", + " 'JJS|urmsd_ci_lower': array([0.07087912], dtype=float32),\n", + " 'JJS|urmsd_ci_upper': array([0.0870943], dtype=float32),\n", + " 'MarchApril|BIAS': array([-0.0636023], dtype=float32),\n", + " 'MarchApril|BIAS_ci_lower': array([-0.07745089], dtype=float32),\n", + " 'MarchApril|BIAS_ci_upper': array([-0.0497537], dtype=float32),\n", + " 'MarchApril|R': array([0.8920552], dtype=float32),\n", + " 'MarchApril|RMSD': array([0.07466082], dtype=float32),\n", + " 'MarchApril|RSS': array([0.1895241], dtype=float32),\n", + " 'MarchApril|R_ci_lower': array([0.7931545], dtype=float32),\n", + " 'MarchApril|R_ci_upper': array([0.94511515], dtype=float32),\n", + " 'MarchApril|mse': array([0.00557424], dtype=float32),\n", + " 'MarchApril|mse_bias': array([0.00404525], dtype=float32),\n", + " 'MarchApril|mse_corr': array([0.00138048], dtype=float32),\n", + " 'MarchApril|mse_var': array([0.0001485], dtype=float32),\n", + " 'MarchApril|n_obs': array([34], dtype=int32),\n", + " 'MarchApril|p_R': array([1.4273317e-12], dtype=float32),\n", + " 'MarchApril|status': array([0], dtype=int32),\n", + " 'MarchApril|urmsd': array([0.03910225], dtype=float32),\n", + " 'MarchApril|urmsd_ci_lower': array([0.03201326], dtype=float32),\n", + " 'MarchApril|urmsd_ci_upper': array([0.05224345], dtype=float32),\n", + 
" 'gpi': array([0], dtype=int32),\n", + " 'lat': array([33.8833]),\n", + " 'lon': array([102.1333])}}\n", + "Results were stored in /tmp/tmppeisd__e/ascat_ismn_adapted\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from pytesmo.validation_framework.temporal_matchers import make_combined_temporal_matcher\n", + "process = Validation(\n", + " datasets, 'ISMN',\n", + " temporal_ref='ASCAT',\n", + " scaling='mean_std',\n", + " scaling_ref='ISMN',\n", + " metrics_calculators={(2, 2): adapted_intercomparison_metrics.calc_metrics},\n", + " temporal_matcher=make_combined_temporal_matcher(pd.Timedelta(6, \"H\")),\n", + " period=period)\n", + "\n", + "save_path = output_folder / 'ascat_ismn_adapted'\n", + "\n", + "if os.path.exists(save_path):\n", + " shutil.rmtree(save_path)\n", + "\n", + "for job in jobs:\n", + " results = process.calc(*job)\n", + " pprint(results)\n", + " netcdf_results_manager(results, save_path)\n", + "\n", + "print(f\"Results were stored in {save_path}\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -489,7 +681,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -521,27 +713,41 @@ "\n", "setup_code = \"my_validation.py\"\n", "start_validation(setup_code)\n", - "```\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Time Series Adapters\n", "\n", - "## Masking datasets\n", + "Data readers extract time series from the candidate and reference data sets used in a validation run. However, often it is desired to change the data in some way after reading and before using them in the validation framework. Potential preprocessing steps include masking/filtering, conversion to anomalies and the combining multiple available variables in a dataset.\n", + "\n", + "## Masking / filtering\n", "\n", - "Masking datasets are datasets that return a pandas DataFrame with boolean values. 
`True` means that the observation\n", - " should be masked, `False` means it should be kept. All masking datasets are temporally matched in pairs to the\n", - "temporal reference dataset. Only observations for which all masking datasets have a value of `False` are kept for\n", - "further validation.\n", + "Filters are used to select certain observations in a time series after reading to include/exclude from the validation run. Satellite and in situ observations often come with quality flags for their measurements. Filters are for example used to choose - based on these flags - which observations should be used and which ones should be discarded before computing validation metrics.\n", "\n", - "The masking datasets have the same format as the dataset dictionary and can be specified in the Validation class\n", - "with the `masking_datasets` keyword.\n", + "There are 2 ways of masking input datasets.\n", + "\n", + "1) Directly upon loading a time series from a dataset, by removing any unwanted observations\n", + " For this the `SelfMaskingAdapter` and `AdvancedMaskingAdapter` are used.\n", + "2) Indirectly after reading all datasets by using one or multiple independent masking datasets.\n", + " For this the `MaskingAdapter` is used. The masking datasets have the same format as the dataset dictionary and can be specified in the Validation class with the `masking_datasets` keyword.\n", "\n", "### Masking adapter\n", "\n", - "To easily transform an existing dataset into a masking dataset `pytesmo` offers a adapter class that calls the\n", + "To easily transform an existing dataset into a masking dataset `pytesmo` offers an adapter class that calls the\n", "reading method of an existing dataset and creates a masking dataset based on an operator, a given threshold, and (optionally) a column name." 
] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -554,23 +760,23 @@ "2008-07-01 01:00:00 False\n", "2008-07-01 02:00:00 False\n", "2008-07-01 03:00:00 False\n", - "2008-07-01 04:00:00 False\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_19692/3230815451.py:3: DeprecationWarning: `MaskingAdapter` is deprecated, use `SelfMaskingAdapter` or `AdvancedMaskingAdapter` instead.\n", - " ds_mask = MaskingAdapter(ismn_reader, '<', 0.2, 'soil_moisture')\n" + "2008-07-01 04:00:00 False\n", + "... ...\n", + "2010-07-31 19:00:00 False\n", + "2010-07-31 20:00:00 False\n", + "2010-07-31 21:00:00 False\n", + "2010-07-31 22:00:00 False\n", + "2010-07-31 23:00:00 False\n", + "\n", + "[15927 rows x 1 columns]\n" ] } ], "source": [ "from pytesmo.validation_framework.adapters import MaskingAdapter\n", "\n", - "ds_mask = MaskingAdapter(ismn_reader, '<', 0.2, 'soil_moisture')\n", - "print(ds_mask.read(ids[0]).head())" + "ds_mask = MaskingAdapter(ismn_reader, '<', 0.2, 'soil_moisture')\n", + "pprint(ds_mask.read(ids[0]))" ] }, { @@ -578,12 +784,12 @@ "metadata": {}, "source": [ "### Self-masking adapter\n", - "`pytesmo` also has a class that masks a dataset \"on-the-fly\", based on one of the columns it contains and an operator and a threshold. In contrast to the masking adapter mentioned above, the output of the self-masking adapter is the masked data, not the the mask. The self-masking adapter wraps a data reader, which must have a `read_ts` or `read` method. Alternatively, to use a method with a name other than `read`/`read_ts`, use the `read_name` keyword which is available for each Adapter - it is still required that the method returns a pandas DataFrame! Calling the reading method will return the masked data - more precisely a DataFrame with only rows where the masking condition is true." 
+ "`pytesmo` also has a class that masks a dataset \"on-the-fly\", based on one of the columns it contains and an operator and a threshold. In contrast to the masking adapter mentioned above, the output of the self-masking adapter is the masked data, not the mask. The self-masking adapter wraps a data reader, which must have a `read_ts` or `read` method. Alternatively, to use a method with a name other than `read`/`read_ts`, use the `read_name` keyword which is available for each Adapter - it is still required that the method returns a pandas DataFrame! Calling the reading method will return the masked data - more precisely a DataFrame with only rows where the masking condition is true." ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -596,16 +802,212 @@ "2008-11-29 10:00:00 0.19 D01,D03 M\n", "2008-11-29 11:00:00 0.19 D01,D03 M\n", "2008-11-30 03:00:00 0.19 D01,D03 M\n", - "2008-11-30 04:00:00 0.19 D01,D03 M\n" + "2008-11-30 04:00:00 0.19 D01,D03 M\n", + "... ... ... ...\n", + "2010-03-16 08:00:00 0.18 D01 M\n", + "2010-03-16 09:00:00 0.18 D01 M\n", + "2010-03-16 10:00:00 0.18 D01 M\n", + "2010-03-16 11:00:00 0.18 D01 M\n", + "2010-03-16 12:00:00 0.19 D01 M\n", + "\n", + "[2956 rows x 3 columns]\n" ] } ], "source": [ "from pytesmo.validation_framework.adapters import SelfMaskingAdapter\n", "\n", - "ds_mask = SelfMaskingAdapter(ismn_reader, '<', 0.2, 'soil_moisture')\n", - "print(ds_mask.read(ids[0]).head())" + "ds_mask = SelfMaskingAdapter(ismn_reader, '<', 0.2, 'soil_moisture', read_name='read')\n", + "pprint(ds_mask.read(ids[0]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "While the `SelfMaskingAdapter` works with only one filtering rule, there is also the `AdvancedMaskingAdapter`. This one can take a list of conditions based on which a time series is masked."
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " sm snow_prob proc_flag orbit_dir\n", + "2007-02-12 14:47:48.999982 16.0 0 0 b'A'\n", + "2007-02-20 13:42:25.999977 35.0 0 0 b'A'\n", + "2007-03-01 13:56:14.999987 37.0 0 0 b'A'\n", + "2007-03-06 13:52:51.000007 35.0 0 0 b'A'\n", + "2007-03-08 14:51:29.999990 36.0 0 0 b'A'\n", + "... ... ... ... ...\n", + "2014-06-20 13:54:58.000002 80.0 0 0 b'A'\n", + "2014-06-22 14:53:39.000015 77.0 0 0 b'A'\n", + "2014-06-25 13:51:37.000006 83.0 0 0 b'A'\n", + "2014-06-27 14:50:18.000019 81.0 0 0 b'A'\n", + "2014-06-30 13:48:17.999999 83.0 0 0 b'A'\n", + "\n", + "[656 rows x 4 columns]\n" + ] + } + ], + "source": [ + "from pytesmo.validation_framework.adapters import AdvancedMaskingAdapter\n", + "\n", + "\n", + "ds_mask = AdvancedMaskingAdapter(ascat_reader,\n", + " filter_list=[('snow_prob', np.less_equal, 10),\n", + " ('sm', '>', 0),\n", + " ('sm', '<', 100),\n", + " ('proc_flag', '==', 0),\n", + " ('orbit_dir', '==', b'A')],\n", + " read_name='read')\n", + "pprint(ds_mask.read(1814367)[['sm', 'snow_prob', 'proc_flag', 'orbit_dir']])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## AnomalyAdapter and AnomalyClimAdapter\n", + "\n", + "These 2 adapters are used to transform absolute values into anomalies on-the-fly after reading. You can select one or multiple columns for which the anomalies are computed. Additional kwargs (like `timespan`) are passed to the underlying anomaly and climatology function. For more details there is a separate tutorial available." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " soil_moisture soil_moisture_flag soil_moisture_orig_flag\n", + "date_time \n", + "2008-07-01 00:00:00 0.140861 C03 M\n", + "2008-07-01 01:00:00 0.140861 C03 M\n", + "2008-07-01 02:00:00 0.140861 C03 M\n", + "2008-07-01 03:00:00 0.140861 C03 M\n", + "2008-07-01 04:00:00 0.140861 C03 M\n", + "... ... ... ...\n", + "2010-07-31 19:00:00 -0.151816 U M\n", + "2010-07-31 20:00:00 -0.151816 U M\n", + "2010-07-31 21:00:00 -0.151816 U M\n", + "2010-07-31 22:00:00 -0.151816 U M\n", + "2010-07-31 23:00:00 -0.151816 U M\n", + "\n", + "[15927 rows x 3 columns]\n" + ] + }, + { + "data": { + "text/plain": "" + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": "
", + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from pytesmo.validation_framework.adapters import AnomalyClimAdapter\n", + "ds_mask = AnomalyClimAdapter(ismn_reader, columns=['soil_moisture'], \n", + " timespan=(datetime(1991,1,1), datetime(2020,12,31)))\n", + "anom = ds_mask.read(ids[0])\n", + "pprint(anom)\n", + "anom.plot(title='Anomaly (wrt. 1991-2020 avg.)', ylabel='SM m3m-3', figsize=(7,2))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## ColumnCombineAdapter\n", + "\n", + "This adapter is used to combine multiple columns of a dataset after reading the time series. It takes any function that can be applied accross multiple columns and will add a new column of the chosen name to the data frame.\n", + "E.g. in the following example we combine the `proc_flag` and `snow_prob` and `frozen_prob` column into a new column `good` that is 'True' when all the chosen columns are 0 and otherwise 'False'. Then we apply a `SelfMaskingAdapter` that uses the newly added column to filter the dataset. This is just for demonstration, we could also just use the `AdvancedMaskingAdapter` for this example. A more common use case for the `ColumnCombineAdapter` would be to compute the average when multiple soil moisture fields are available in a dataset.\n", + "\n", + "This example also shows that you can stack multiple adapters together. If they depend on each other, it is important to notice that the innermost adapter will be called first!" 
] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " sm proc_flag snow_prob frozen_prob good\n", + "2007-06-03 03:33:38.999992 84.0 0 0 0 True\n", + "2007-06-03 14:51:50.999997 82.0 0 0 0 True\n", + "2007-06-06 02:31:33.000005 76.0 0 0 0 True\n", + "2007-06-06 13:49:46.999999 75.0 0 0 0 True\n", + "2007-06-08 03:30:13.000023 83.0 0 0 0 True\n", + "... ... ... ... ... ...\n", + "2014-06-25 13:51:37.000006 83.0 0 0 0 True\n", + "2014-06-27 03:32:04.999979 79.0 0 0 0 True\n", + "2014-06-27 14:50:18.000019 81.0 0 0 0 True\n", + "2014-06-30 02:30:05.000000 77.0 0 0 0 True\n", + "2014-06-30 13:48:17.999999 83.0 0 0 0 True\n", + "\n", + "[713 rows x 5 columns]\n" + ] + } + ], + "source": [ + "from pytesmo.validation_framework.adapters import ColumnCombineAdapter, SelfMaskingAdapter\n", + "\n", + "def select_good(x):\n", + " return (x['proc_flag'] == 0) and (x['snow_prob'] == 0) and (x['frozen_prob'] == 0)\n", + "\n", + "ds_mask = SelfMaskingAdapter(ColumnCombineAdapter(ascat_reader, func=select_good, new_name='good'), \n", + " '==', True, 'good')\n", + "pprint(ds_mask.read(1814367)[['sm', 'proc_flag', 'snow_prob', 'frozen_prob', 'good']])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -624,7 +1026,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.9" + "version": "3.9.16" } }, "nbformat": 4, diff --git a/src/pytesmo/time_series/grouping.py b/src/pytesmo/time_series/grouping.py index 1e905f6c..180a8919 100644 --- a/src/pytesmo/time_series/grouping.py +++ b/src/pytesmo/time_series/grouping.py @@ -26,9 +26,6 @@ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# Author: Christoph Paulik christoph.paulik@geo.tuwien.ac.at -# Creation date: 2014-06-30 - """ Module provides grouping functions that can be used together with pandas @@ -36,12 +33,16 @@ there are three products per month with timestamps on the 10th 20th and last of the month """ +from dataclasses import dataclass +from typing import Optional, Union, Tuple, List import pandas as pd import numpy as np -from datetime import date +from datetime import date, datetime import calendar +from cadati.conv_doy import doy + def group_by_day_bin(df, bins=[1, 11, 21, 32], start=False, dtindex=None): @@ -153,3 +154,239 @@ def grouped_dates_between(start_date, end_date, bins=[1, 11, 21, 32], start=Fals tstamps = grp.sum().index.to_pydatetime().tolist() return tstamps + + +@dataclass +class YearlessDatetime: + """ + Container class to store Datetime information without a year. This is + used to group data when the year is not relevant (e.g. seasonal analysis). + Only down to second. Used by + :class:`pytesmo.validation_framework.metric_calculators_adapters.TsDistributor` + """ + month: int + + day: int = 1 + hour: int = 0 + minute: int = 0 + second: int = 0 + + @property + def __ly(self): + return 2400 # arbitrary leap year + + def __ge__(self, other: 'YearlessDatetime'): + return self.to_datetime(self.__ly) >= other.to_datetime(self.__ly) + + def __le__(self, other: 'YearlessDatetime'): + return self.to_datetime(self.__ly) <= other.to_datetime(self.__ly) + + def __lt__(self, other: 'YearlessDatetime'): + return self.to_datetime(self.__ly) < other.to_datetime(self.__ly) + + def __gt__(self, other: 'YearlessDatetime'): + return self.to_datetime(self.__ly) > other.to_datetime(self.__ly) + + def __repr__(self): + return f"****-{self.month:02}-{self.day:02}" \ + f"T{self.hour:02}:{self.minute:02}:{self.second:02}" + + @property + def doy(self) -> int: + """ + Get day of year for this date. Assume leap year! + i.e.: 1=Jan.1st, 366=Dec.31st, 60=Feb.29th. 
+ """ + return doy(self.month, self.day, year=None) + + @classmethod + def from_datetime(cls, dt: datetime): + """ + Omit year from passed datetime to create generic datetime. + """ + return cls(dt.month, dt.day, dt.hour, dt.minute, dt.second) + + def to_datetime(self, years: Optional[Union[Tuple[int, ...], int]]) \ + -> Union[datetime, List, None]: + """ + Convert generic datetime to datetime with year. + Feb 29th for non-leap-years will return None + """ + dt = [] + + for year in np.atleast_1d(years): + if not calendar.isleap(year) and self.doy == 60.: + continue + else: + d = datetime(year, self.month, self.day, self.hour, + self.minute, self.second) + dt.append(d) + + if len(dt) == 1: + return dt[0] + elif len(dt) == 0: + return None + else: + return dt + + +class TsDistributor: + + def __init__(self, + dates=None, + date_ranges=None, + yearless_dates=None, + yearless_date_ranges=None): + """ + Build a data distributor from individual dates, date ranges, generic + dates (without specific year) and generic date ranges. + + Components: + - individual datetime objects for distinct dates + - generic datetime objects for dates without a specific year + - date range / datetime tuple + i.e. ALL datetimes between the 2 passed dates (start, end) + the start date must be earlier than the end date + - generic date range / generic datetime tuple + i.e. ALL datetimes between 2 generic dates (for any year) + + Parameters + ---------- + dates : Tuple[datetime, ...] or Tuple[str, ...] or pd.DatetimeIndex + Individual dates (that also have a year assigned). + date_ranges: Tuple[Tuple[datetime, datetime], ...] + A list of date ranges, consisting of a start and end date for each + range. The start date must be earlier in time than the end date. + yearless_dates: Tuple[YearlessDatetime,...] or Tuple[datetime...] + A list of generic dates (that apply to any year). + Can be passed as a list of + - YearlessDatetime objects + e.g. YearlessDatetime(5,31,12,1,10), ie.
May 31st 12:01:10 + - pydatetime objects (years will be ignored, duplicates dropped) + yearless_date_ranges: [Tuple[YearlessDatetime, YearlessDatetime], ...] + A list of generic date ranges (that apply to any year). + """ + + self.dates = dates + self.date_ranges = date_ranges + self.yearless_dates = yearless_dates + self.yearless_date_ranges = yearless_date_ranges + + def __repr__(self): + s = [] + for var in ['dates', 'date_ranges', 'yearless_dates', + 'yearless_date_ranges']: + val = getattr(self, var) + s.append(f"#{var}={len(val) if val is not None else 0}") + + return f"{self.__class__.__name__}({', '.join(s)})" + + def select(self, + df: Union[pd.DataFrame, pd.Series, pd.DatetimeIndex], + set_nan=False): + """ + Select rows from data frame or series with matching date time indices. + + Parameters + ---------- + df: pd.DataFrame or pd.Series + Must have a date time index, which is then filtered based on the + dates. + set_nan: bool, optional (default: False) + Instead of dropping rows that are not selected, set their values to + nan.
+ + + Returns + ------- + df: pd.DataFrame or pd.Series + The filtered input data + + """ + if isinstance(df, pd.DatetimeIndex): + idx = df + else: + idx = df.index + + if not isinstance(idx, pd.DatetimeIndex): + raise ValueError(f"Expected a DatetimeIndex, " + f"but got {type(df.index)}.") + + mask = self.filter(idx) + + if set_nan: + df[~mask] = np.nan + return df + else: + return df[mask] + + def filter(self, idx: pd.DatetimeIndex): + """ + Filter datetime index for a TimeSeriesDistributionSet + + Parameters + ---------- + idx: pd.DatetimeIndex + Datetime index to split using the set + + Returns + ------- + mask: pd.Series + Boolean mask (indexed by idx), True for dates that are in the set + """ + + mask = pd.DataFrame(index=idx.copy()) + + if self.dates is not None: + _idx_dates = idx.intersection(pd.DatetimeIndex(self.dates)) + mask['dates'] = False + mask.loc[_idx_dates, 'dates'] = True + + if self.date_ranges is not None: + for i, drange in enumerate(self.date_ranges): + start, end = drange[0], drange[1] + if start > end: + start, end = end, start + mask[f"range{i}"] = (idx >= start) & (idx <= end) + + if self.yearless_dates is not None: + arrs = np.array([]) + for d in self.yearless_dates: + dts = d.to_datetime(np.unique(idx.year)) + if dts is None: + continue + else: + arrs = np.append(arrs, dts) + _idx_dates = idx.intersection(pd.DatetimeIndex(arrs)) + mask['gen_dates'] = False + mask.loc[_idx_dates, 'gen_dates'] = True + + # avoid loop like: + # cond = ["__index_month == {}".format(m) for m in months] + # selection = dat.query(" | ".join(cond)).index + + if self.yearless_date_ranges is not None: + for i, gdrange in enumerate(self.yearless_date_ranges): + for y in np.unique(idx.year): + + if not calendar.isleap(y) and (gdrange[0].doy == 60): + start = YearlessDatetime(3, 1) + else: + start = gdrange[0] + + if (not calendar.isleap(y)) and (gdrange[1].doy == 60): + end = YearlessDatetime(2, 28, 23, 59, 59) + else: + end = gdrange[1] + + start_dt =
start.to_datetime(years=y) + + if end < start: + end_dt = end.to_datetime(years=y + 1) + else: + end_dt = end.to_datetime(years=y) + + mask[f"gen_range{y}-{i}"] = (idx >= start_dt) & ( + idx <= end_dt) + + return mask.any(axis=1, bool_only=True) diff --git a/src/pytesmo/validation_framework/adapters.py b/src/pytesmo/validation_framework/adapters.py index 8229adf5..115d71ba 100644 --- a/src/pytesmo/validation_framework/adapters.py +++ b/src/pytesmo/validation_framework/adapters.py @@ -31,10 +31,8 @@ import operator -import pandas as pd from pytesmo.time_series.anomaly import calc_anomaly from pytesmo.time_series.anomaly import calc_climatology -from pytesmo.utils import deprecated from pandas import DataFrame import numpy as np import warnings @@ -90,9 +88,10 @@ def __init__(self, cls, data_property_name="data", read_name=None): setattr(self, read_name, self._adapt_custom) def __get_dataframe(self, data): - if ((not isinstance(data, DataFrame)) and - (hasattr(data, self.data_property_name)) and - (isinstance(getattr(data, self.data_property_name), DataFrame))): + if (not isinstance(data, DataFrame)) and \ + (hasattr(data, self.data_property_name)) and \ + (isinstance(getattr(data, self.data_property_name), + DataFrame)): data = getattr(data, self.data_property_name) return data @@ -131,14 +130,14 @@ def grid(self): return self.cls.grid -@deprecated("`MaskingAdapter` is deprecated, use `SelfMaskingAdapter` " - "or `AdvancedMaskingAdapter` instead.") class MaskingAdapter(BasicAdapter): """ Transform the given class to return a boolean dataset given the operator and threshold. This class calls the read_ts and read methods of the given instance and applies boolean masking to the returned data - using the given operator and threshold. + using the given operator and threshold. This adapter does not filter the + time series (see the AdvancedMaskingAdapter and SelfMaskingAdapter for + that) but only turns it into a boolean dataset. 
Parameters ---------- @@ -267,8 +266,8 @@ class AdvancedMaskingAdapter(BasicAdapter): Reader object, has to have a `read_ts` or `read` method or a method name must be specified in the `read_name` kwarg. The same method will be available for the adapted version of the reader. - filter_list: list[tuple, tuple, tuple] - [column_name, operator, threshold] + filter_list: list[tuple] + [(column_name, operator, threshold), ...] 'column_name': string name of the column to apply the operator to 'operator': Callable or str; @@ -555,16 +554,17 @@ class TimestampAdapter(BasicAdapter): name must be specified in the `read_name` kwarg. The same method will be available for the adapted version of the reader. time_offset_fields: str, list or None - name or list of names of the fields that provide information on the time offset. - If a list is given, all values will contribute to the offset, assuming that - each refers to the previous. For instance: + name or list of names of the fields that provide information on the + time offset. + If a list is given, all values will contribute to the offset, assuming + that each refers to the previous. For instance: offset = minutes + seconds in the minute + µs in the second NOTE: np.nan values are counted as 0 offset NOTE: if None, no offset is considered time_units: str or list - time units that the time_offset_fields are specified in. If a list is given, - it should have the same size as the 'time_offset_fields' parameter - Can be any of the np.datetime[64] units: + time units that the time_offset_fields are specified in. If a list is + given, it should have the same size as the 'time_offset_fields' + parameter. Can be any of the np.datetime[64] units: https://numpy.org/doc/stable/reference/arrays.datetime.html base_time_field: str, optional. Default is None. If a name is provided, the generic time field will be searched for @@ -573,19 +573,20 @@ class TimestampAdapter(BasicAdapter): base_time_reference: str, optional. Default is None. 
String of format 'YYYY-mm-dd' that can be specified to tranform the 'base_time_field' from [units since base_time_reference] to - np.datetime[64]. If not provided, it will be assumed that the base_time_field - is already in np.datetime[64] units + np.datetime[64]. If not provided, it will be assumed that the + base_time_field is already in np.datetime[64] units base_time_units: str, optional. Default is "D" - Units that the base_time_field is specified in. Only applicable with 'base_time_reference' + Units that the base_time_field is specified in. Only applicable with + 'base_time_reference' replace_index: bool, optional. Default is True. If True, the exact timestamp is used as index. Else, it will be added to the dataframe on the column 'output_field' output_field: str, optional. Default is None. - If a name is specified, an additional column is generated under the name, - with the exact timestamp. Only with 'replace_index' == False + If a name is specified, an additional column is generated under the + name, with the exact timestamp. Only with 'replace_index' == False drop_original: bool, optional. Default is True. - Whether the base_time_field and time_offset_fields should be dropped in the - final DataFrame + Whether the base_time_field and time_offset_fields should be dropped + in the final DataFrame """ def __init__(self, @@ -617,12 +618,13 @@ def __init__(self, if not replace_index and output_field is None: raise ValueError( "'output_field' should be specified in case the new timestamp" - "should not be used as index. Alternatively, set 'replace_index' to True" + "should not be used as index. Alternatively, set " + "'replace_index' to True" ) elif replace_index and output_field is not None: warnings.warn( - "Ignoring the 'output_field' value. Set 'replace_index' to True to" - "avoid this behavior") + "Ignoring the 'output_field' value. 
Set 'replace_index' to " + "True to avoid this behavior") else: self.output_field = output_field @@ -639,7 +641,9 @@ def convert_generic(self, return time_date def add_offset_cumulative(self, data: DataFrame) -> np.array: - """Return an array of timedelta calculated with all the time_offset_fields""" + """ + Return an array of timedelta calculated with all the time_offset_fields + """ total_offset = np.full(data.index.shape, 0, dtype='timedelta64[s]') for field, unit in zip(self.time_offset_fields, self.time_units): total_offset += data[field].map( diff --git a/src/pytesmo/validation_framework/metric_calculators_adapters.py b/src/pytesmo/validation_framework/metric_calculators_adapters.py index dd29a9e2..d6dad14a 100644 --- a/src/pytesmo/validation_framework/metric_calculators_adapters.py +++ b/src/pytesmo/validation_framework/metric_calculators_adapters.py @@ -24,25 +24,31 @@ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - """ Metric Calculator Adapters change how metrics are calculated by calling the `calc_metric` function of the adapted calculator instead of the unadapted version. 
""" +from pytesmo.time_series.grouping import YearlessDatetime, TsDistributor from pytesmo.validation_framework.metric_calculators import ( - PairwiseIntercomparisonMetrics, - TripleCollocationMetrics -) + PairwiseIntercomparisonMetrics, TripleCollocationMetrics) import warnings import numpy as np +from cadati.conv_doy import days_past -class MonthsMetricsAdapter(object): +def days_in_month(month: int) -> int: """ - Adapt MetricCalculators to calculate metrics for groups across months + Get number of days in this month (in a LEAP YEAR) + """ + return days_past[month] - days_past[month - 1] + + +class SubsetsMetricsAdapter: + """ + Adapt MetricCalculators to calculate metrics for groups of temporal + subsets (also across multiple years). """ _supported_metric_calculators = ( @@ -50,101 +56,64 @@ class MonthsMetricsAdapter(object): TripleCollocationMetrics, ) - def __init__(self, calculator, sets=None): + def __init__(self, calculator, subsets, group_results='tuple'): """ Add functionality to a metric calculator to calculate validation - metrics for subsets of certain months in a time series (e.g. seasonal). + metrics for subsets of certain datetimes in a time series + (e.g. seasonal). Parameters ---------- calculator : PairwiseIntercomparisonMetrics or TripleCollocationMetrics A metric calculator to adapt. Preferably an instance of a metric calculator listed in `_supported_metric_calculators` - sets : dict, optional (default: None) - Define groups of data. With group names as key and a list of - months (1-12) that belong to the group as values. - - e.g. {'Group1': [4,5,6,7,8,9], 'Group2': [10,11,12,1,2,3]} will - split the data used by the metric calculator into 2 groups. - One using only observations made between April and September, - and one using observations from the rest of the year. - - The name will be used in the results to distinguish between the - same metrics for different groups: - e.g. ('Group1', 'BIAS'): ..., ('Group2', 'BIAS'): ..., etc. 
- - The default groups are based on 4 seasons plus one group that uses - all data (as the unadapted metric calculator would do): - {'DJF': [12,1,2], 'MAM': [3,4,5], 'JJA': [6, 7, 8], - 'SON': [9, 10, 11], 'ALL': list(range(1, 13))} + subsets : dict[str, TsDistributor] + Define subsets of data. With group names as key and a + data distributor as values. + group_results: str, optional (default: 'tuple') + How to group the results. + - 'tuple' will group the results by (group, metric) + - 'join' will join group and metric name with a '|' """ if not isinstance(calculator, self._supported_metric_calculators): warnings.warn(f"Adapting {calculator.__class__} is not supported.") + self.cls = calculator - if sets is None: - sets = { - "DJF": [12, 1, 2], - "MAM": [3, 4, 5], - "JJA": [6, 7, 8], - "SON": [9, 10, 11], - "ALL": list(range(1, 13)), - } + self.subsets = subsets + self.group_results = group_results - self.sets = sets + assert group_results in ('tuple', 'join'), \ + f"Unknown group_results: {group_results}" # metadata metrics and lon, lat, gpi are excluded from applying # seasonally self.non_seas_metrics = ["gpi", "lon", "lat"] - if self.cls.metadata_template is not None: - self.non_seas_metrics += list(self.cls.metadata_template.keys()) + if hasattr(self.cls, 'metadata_template'): + if self.cls.metadata_template is not None: + self.non_seas_metrics += list( + self.cls.metadata_template.keys()) all_metrics = calculator.result_template subset_metrics = {} # for each subset create a copy of the metric template - for name in sets.keys(): + for name in subsets.keys(): for k, v in all_metrics.items(): - if k in self.non_seas_metrics: - subset_metrics[k] = np.array(v) - else: - subset_metrics[(name, k)] = np.array(v) + subset_metrics[self._genname(name, k)] = np.array(v) self.result_template = subset_metrics - @staticmethod - def filter_months(df, months, dropna=False): - """ - Select only entries of a time series that are within certain month(s)
- - Parameters - ---------- - df : pd.DataFrame - Time series (index.month must exist) that is split up into the - selected groups. - months : list - Months for which data is kept, e.g. [12,1,2] to keep data for - winter - dropna : bool, optional (default: False) - Drop lines for months that are not to be kept, if this is false, - the original index is not changed, but filtered values are replaced - with nan. - - Returns - ------- - df_filtered : pd.DataFrame - The filtered series - """ - dat = df.copy(True) - dat["__index_month"] = dat.index.month - cond = ["__index_month == {}".format(m) for m in months] - selection = dat.query(" | ".join(cond)).index - dat.drop("__index_month", axis=1, inplace=True) - - if dropna: - return dat.loc[selection] + def _genname(self, setname: str, metric: str) -> str or tuple: + if metric in self.non_seas_metrics: + k = f"{metric}" + elif self.group_results == 'tuple': + k = (f"{setname}", *np.atleast_1d(metric)) + elif self.group_results == 'join': + k = f"{setname}|{metric}" else: - dat.loc[dat.index.difference(selection)] = np.nan - return dat + raise NotImplementedError( + f"Unknown group_results: {self.group_results}") + return k def calc_metrics(self, data, gpi_info): """ @@ -161,14 +130,63 @@ def calc_metrics(self, data, gpi_info): """ dataset = self.result_template.copy() - for setname, months in self.sets.items(): - df = self.filter_months(data, months=months, dropna=True) + for setname, distr in self.subsets.items(): + df = distr.select(data) ds = self.cls.calc_metrics(df, gpi_info=gpi_info) for metric, res in ds.items(): - if metric in self.non_seas_metrics: - k = f"{metric}" - else: - k = (f"{setname}", *np.atleast_1d(metric)) + k = self._genname(setname, metric) dataset[k] = res return dataset + + +class MonthsMetricsAdapter(SubsetsMetricsAdapter): + """ + Adapt MetricCalculators to calculate metrics for groups across months + """ + + def __init__(self, calculator, sets=None): + """ + Add functionality to a metric 
calculator to calculate validation + metrics for subsets of certain months in a time series (e.g. seasonal). + + Parameters + ---------- + calculator : PairwiseIntercomparisonMetrics or TripleCollocationMetrics + A metric calculator to adapt. Preferably an instance of a metric + calculator listed in `_supported_metric_calculators` + sets : dict, optional (default: None) + Define groups of data. With group names as key and a list of + months (1-12) that belong to the group as values. + + e.g. {'Group1': [4,5,6,7,8,9], 'Group2': [10,11,12,1,2,3]} will + split the data used by the metric calculator into 2 groups. + One using only observations made between April and September, + and one using observations from the rest of the year. + + The name will be used in the results to distinguish between the + same metrics for different groups: + e.g. ('Group1', 'BIAS'): ..., ('Group2', 'BIAS'): ..., etc. + + The default groups are based on 4 seasons plus one group that uses + all data (as the unadapted metric calculator would do): + {'DJF': [12,1,2], 'MAM': [3,4,5], 'JJA': [6, 7, 8], + 'SON': [9, 10, 11], 'ALL': list(range(1, 13))} + """ + if sets is None: + sets = { + 'DJF': [12, 1, 2], + 'MAM': [3, 4, 5], + 'JJA': [6, 7, 8], + 'SON': [9, 10, 11], + 'ALL': list(range(1, 13)), + } + + for name, months in sets.items(): + distr = TsDistributor(yearless_date_ranges=[( + YearlessDatetime(m, 1, 0, 0, 0), + YearlessDatetime(m, days_in_month(m), 23, 59, 59)) + for m in months]) + sets[name] = distr + + super().__init__(calculator, subsets=sets) diff --git a/src/pytesmo/validation_framework/validation.py b/src/pytesmo/validation_framework/validation.py index 72d37592..cacfe414 100644 --- a/src/pytesmo/validation_framework/validation.py +++ b/src/pytesmo/validation_framework/validation.py @@ -98,7 +98,7 @@ class Validation(object): method of these datasets has to return pandas.DataFrames with only boolean columns. 
True means that the observations at this timestamp should be masked and False means that it should be kept. - scaling : string, None or class instance + scaling : str or None or class instance - If set then the data will be scaled into the reference space using the method specified by the string using the :py:class:`pytesmo.validation_framework.data_scalers.DefaultScaler` @@ -262,7 +262,8 @@ def calc( ) except Exception as e: raise eh.DataManagerError( - f"Getting the data for gpi {gpi_info} failed!") + f"Getting the data for gpi {gpi_info} failed with" + f" error: {e}") # if no data is available continue with the next gpi if len(df_dict) == 0: @@ -453,7 +454,7 @@ def dummy_result(): except Exception as e: raise eh.ScalingError( f"Scaling failed for {result_key} for gpi" - f" {gpi_info}!" + f" {gpi_info} with error {e}!" ) # Drop the scaling reference if it was not in the intended diff --git a/tests/test_validation_framework/test_metric_calculators_adapters.py b/tests/test_validation_framework/test_metric_calculators_adapters.py new file mode 100644 index 00000000..a442b8fc --- /dev/null +++ b/tests/test_validation_framework/test_metric_calculators_adapters.py @@ -0,0 +1,131 @@ +import unittest + +import pandas as pd +import numpy as np + +from pytesmo.time_series.grouping import YearlessDatetime, TsDistributor +from datetime import datetime + + +class Test_YearlessDateTime(unittest.TestCase): + + def setUp(self) -> None: + self.past = datetime(1900, 1, 2, 3, 4, 5) + self.future = datetime(2104, 6, 7, 8, 9, 10) + self.yearless = YearlessDatetime(3, 10, 0, 0, 0) + + def test_comparisons(self): + assert self.yearless > YearlessDatetime.from_datetime(self.past) + assert self.yearless < YearlessDatetime.from_datetime(self.future) + assert self.yearless == self.yearless + + def test_doy(self): + assert YearlessDatetime.from_datetime(self.future).doy == 159 + assert YearlessDatetime(12, 31).doy == 366 + assert YearlessDatetime(2, 29).doy == 60 + assert 
YearlessDatetime(1, 1).doy == 1 + + def test_to_dt(self): + assert YearlessDatetime.from_datetime(self.past).to_datetime( + self.past.year) == self.past + assert YearlessDatetime.from_datetime( + self.future).to_datetime(years=[2104, 2111])[0] == self.future + + +class Test_TimeSeriesDistributionSet(unittest.TestCase): + + def setUp(self) -> None: + df = pd.DataFrame( + index=pd.date_range('2000-01-01T12', '2009-12-31T12', freq='D')) + df['data'] = np.random.rand(df.index.size) + df.loc[np.isin(df.index.month, [1, 7])] = np.nan + df = df.dropna() + self.df = df + + def test_filter_dates_only(self): + dates = ( + datetime(2005, 6, 6, 12), + datetime(2005, 5, 5, 12), + datetime(2005, 4, 4, 12), + datetime(2005, 3, 3, 12), + datetime(2005, 2, 2, 12), + datetime(2005, 2, 2, 1), # not in input/output !! + datetime(2005, 1, 5, 12), # not in input/output !! + ) + + set1 = TsDistributor(dates=dates) + + d = set1.select(self.df) + assert len(d.index) == 5 + assert np.all(dt in d.index for dt in dates[:5]) + + def test_filter_daterange_only(self): + set2 = TsDistributor(date_ranges=[ + (datetime(2004, 12, 20), datetime(2005, 2, 10, 11)), + (datetime(2005, 2, 27), datetime(2005, 3, 1, 12)), + ]) + + d = set2.select(self.df) + assert datetime(2005, 2, 1, 12) in d.index + assert len(d.index) == (12 + (10 - 1)) + 3 + + def test_filter_yearless_dates_only(self): + yearless_dates = ( + YearlessDatetime(6, 6, 12, 0, 0), + YearlessDatetime(2, 29, 12, 0, 0), + YearlessDatetime.from_datetime(datetime(2000, 5, 5, 12)), + YearlessDatetime(1, 1, 12), # not in input/output !! 
+ ) + set3 = TsDistributor(yearless_dates=yearless_dates) + + d = set3.select(self.df) + assert datetime(2005, 5, 5, 12) in d.index + assert datetime(2008, 2, 29, 12) in d.index + assert len(d.index) == 2 * len(np.unique(self.df.index.year)) + 3 + + def test_filter_yearless_date_ranges_only(self): + set4 = TsDistributor(yearless_date_ranges=[ + (YearlessDatetime(12, 20), + YearlessDatetime(2, 10, 0)), # 12 + 9 elements + (YearlessDatetime(2, 27), YearlessDatetime(2, 29, 12)) # 3 or 2 + ]) + d = set4.select(self.df) + ny = len(np.unique(self.df.index.year)) + assert datetime(2007, 12, 21, 12) in d.index + assert datetime(2004, 2, 29, 12) in d.index + assert len(d.index) == (12 * ny) + (9 * (ny - 1)) + (3 * ny - 7) + + def test_filter_all_in_one(self): + dates = ( + datetime(2005, 4, 4, 12), + datetime(2005, 5, 5, 12), + ) + date_ranges = [(datetime(2004, 4, 6), datetime(2004, 4, 8, 12))] + yearless_dates = [YearlessDatetime(4, 10, 12)] + yearless_date_ranges = [(YearlessDatetime(2, 27), + YearlessDatetime(2, 29, 23))] + + set = TsDistributor( + dates=dates, + yearless_dates=yearless_dates, + date_ranges=date_ranges, + yearless_date_ranges=yearless_date_ranges, + ) + d = set.select(self.df) + + assert np.all(dt in d.index for dt in dates) + for dt in [ + datetime(2004, 4, 6, 12), + datetime(2004, 4, 7, 12), + datetime(2004, 4, 8, 12) + ]: + assert dt in d.index + + assert datetime(2008, 4, 10, 12) in d.index + + for dt in [ + datetime(2007, 2, 27, 12), + datetime(2007, 2, 28, 12), + datetime(2008, 2, 29, 12) + ]: + assert dt in d.index