Merge pull request #180 from GEUS-Glaciology-and-Climate/features/auto_qc
Features/auto qc
Showing 8 changed files with 543 additions and 250 deletions.
@@ -0,0 +1,142 @@
import logging

import numpy as np
import pandas as pd
import xarray as xr
from typing import Mapping, Optional, Union

__all__ = [
    "apply_static_qc",
    "find_static_regions",
    "count_consecutive_static_values",
    "count_consecutive_true",
]

logger = logging.getLogger(__name__)

DEFAULT_VARIABLE_THRESHOLDS = {
    "t": {"max_diff": 0.0001, "period": 2},
    "p": {"max_diff": 0.0001, "period": 2},
    # Relative humidity can be very stable around 100%.
    # "rh": {"max_diff": 0.0001, "period": 2},
}

def apply_static_qc(
    ds: xr.Dataset,
    variable_thresholds: Optional[Mapping] = None,
) -> xr.Dataset:
    """
    Detect and filter data points that seem to be static within a certain period.

    TODO: It would be nice to have a reference to the logger program or a description of its behaviour here.
    The AWS logger program is known to return the last successfully read value if it fails to read from the sensor.

    Parameters
    ----------
    ds : xr.Dataset
        Level 1 dataset
    variable_thresholds : Mapping
        Threshold dict holding the limit and period values.
        max_diff is how much a variable has to change relative to the previous value;
        period is how many hours a value can stay the same without being set to NaN.
        The base variable names (e.g. "t") define the limits, which are then applied to *_u, *_l and *_i.

    Returns
    -------
    ds_out : xr.Dataset
        Level 1 dataset with difference outliers set to NaN
    """

    # The difference QC is not applied to wind speed.
    # Optionally examine flagged data by setting make_plots to True.
    # This is best done by running aws.py directly and setting 'test_station'.
    # Plots will be shown before and after flag removal for each variable.

    df = ds.to_dataframe()  # Switch to pandas

    if variable_thresholds is None:
        variable_thresholds = DEFAULT_VARIABLE_THRESHOLDS

    logger.debug(f"Running apply_static_qc using {variable_thresholds}")

    for k in variable_thresholds.keys():
        var_all = [
            k + "_u",
            k + "_l",
            k + "_i",
        ]  # apply to upper, lower boom, and instant
        max_diff = variable_thresholds[k]["max_diff"]  # loading static limit
        period = variable_thresholds[k]["period"]  # loading diff period

        for v in var_all:
            if v in df:
                mask = find_static_regions(df[v], period, max_diff)
                n_masked = mask.sum()
                n_samples = len(mask)
                logger.debug(
                    f"Applying static QC in {v}. Filtering {n_masked}/{n_samples} samples"
                )
                # setting outliers to NaN
                df.loc[mask, v] = np.nan

    # Back to xarray, and re-assign the original attrs
    ds_out = df.to_xarray()
    ds_out = ds_out.assign_attrs(ds.attrs)  # Dataset attrs
    for x in ds_out.data_vars:  # variable-specific attrs
        ds_out[x].attrs = ds[x].attrs

    return ds_out

def find_static_regions(
    data: pd.Series,
    min_repeats: int,
    max_diff: float,
) -> pd.Series:
    """
    Return a boolean mask flagging samples whose value has stayed within max_diff of the
    previous value for at least min_repeats consecutive samples. NaN samples are never flagged.
    """
    consecutive_true_df = count_consecutive_static_values(data, max_diff)
    static_regions = consecutive_true_df >= min_repeats
    # Ignore entries which are already NaN in the input data
    static_regions[data.isna()] = False
    return static_regions

def count_consecutive_static_values(
    data: pd.Series,
    max_diff: float,
) -> pd.Series:
    diff = data.ffill().diff().abs()  # forward filling all NaNs!
    mask: pd.Series = diff < max_diff
    return count_consecutive_true(mask)

def count_consecutive_true(
    series: Union[pd.Series, pd.DataFrame]
) -> Union[pd.Series, pd.DataFrame]:
    """
    Convert a boolean series to an integer series where the values represent the number of consecutive true values.

    Examples
    --------
    >>> count_consecutive_true(pd.Series([False, True, False, False, True, True, True, False, True]))
    pd.Series([0, 1, 0, 0, 1, 2, 3, 0, 1])

    Parameters
    ----------
    series
        Boolean pandas Series or DataFrame

    Returns
    -------
    consecutive_true_count
        Integer pandas Series or DataFrame with values representing the number of consecutive true values.
    """
    # assert series.dtype == bool
    cumsum = series.cumsum()
    # Positions where the series switches from False to True (start of a run)
    is_first = series.astype("int").diff() == 1
    # Cumulative sum at the start of the current run, held constant across the run
    offset = (is_first * cumsum).replace(0, np.nan).ffill().fillna(0)
    # Subtracting the offset restarts the counter at 1 for each run; False entries become 0
    return ((cumsum - offset + 1) * series).astype("int")
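
For context, a minimal usage sketch of the new module (not part of the commit; the toy dataset, the variable name t_u and the numbers are purely illustrative, while the function name, module path and thresholds layout follow the code above and the test imports below):

import numpy as np
import pandas as pd
import xarray as xr

from pypromice.qc.static_qc import apply_static_qc

# Hourly air temperature with a ten-hour stretch of repeated readings (e.g. a stuck sensor).
time = pd.date_range("2023-01-26", periods=48, freq="h")
t_u = np.sin(np.arange(48) / 5.0) * 10.0
t_u[20:30] = t_u[20]
ds = xr.Dataset({"t_u": ("time", t_u)}, coords={"time": time})

# Passing a custom mapping (same layout as DEFAULT_VARIABLE_THRESHOLDS) overrides the defaults.
ds_qc = apply_static_qc(ds, variable_thresholds={"t": {"max_diff": 0.0001, "period": 2}})
print(int(ds_qc["t_u"].isnull().sum()))  # number of repeated samples set to NaN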
@@ -0,0 +1,146 @@
import unittest

import numpy as np
import numpy.testing
import pandas as pd

from pypromice.qc.static_qc import find_static_regions

class StaticQATestCase(unittest.TestCase):
    def test_1_hour_static(self):
        self._test_1_hour_repeat(10)

    def test_1_hour_second_index(self):
        self._test_1_hour_repeat(0)

    def test_1_hour_last_index(self):
        self._test_1_hour_repeat(-2)

    def _test_1_hour_repeat(self, index: int):
        self.assertTrue(index < -1 or index >= 0)
        time_range = pd.date_range(
            start="2023-01-26", end="2023-01-27", freq="h", tz="utc", inclusive="left"
        )
        input_series = pd.Series(index=time_range, data=np.arange(0, len(time_range)))
        input_series[index + 1] = input_series[index]
        min_repeats = 1
        expected_output = input_series.map(lambda _: False)
        expected_output[index + 1] = True

        static_mask = find_static_regions(
            input_series, min_repeats=min_repeats, max_diff=0.001
        )

        pd.testing.assert_series_equal(expected_output, static_mask, check_names=False)

    def test_no_static_period(self):
        time_range = pd.date_range(
            start="2023-01-26", end="2023-01-27", freq="h", tz="utc", inclusive="left"
        )
        input_series = pd.Series(index=time_range, data=np.arange(0, len(time_range)))
        min_repeats = 1
        expected_output = input_series.map(lambda _: False)

        static_mask = find_static_regions(
            input_series, min_repeats=min_repeats, max_diff=0.001
        )

        pd.testing.assert_series_equal(expected_output, static_mask, check_names=False)

    def test_static_period_longer_than_period_threshold(self):
        time_range = pd.date_range(
            start="2023-01-26", end="2023-01-28", freq="h", tz="utc", inclusive="left"
        )
        index_start = 23
        index_end = 33
        min_repeats = 4
        expected_filter_start = 27
        expected_filter_end = 33
        input_series = pd.Series(index=time_range, data=np.arange(0, len(time_range)))
        input_series[index_start:index_end] = input_series[index_start]
        expected_output = input_series.map(lambda _: False)
        expected_output[expected_filter_start:expected_filter_end] = True

        static_mask = find_static_regions(
            input_series, min_repeats=min_repeats, max_diff=0.001
        )

        pd.testing.assert_series_equal(expected_output, static_mask, check_names=False)

    def test_period_threshold_longer_than_static_period(self):
        time_range = pd.date_range(
            start="2023-01-26", end="2023-01-28", freq="h", tz="utc", inclusive="left"
        )
        index_start = 23
        index_end = 27
        min_repeats = 10
        input_series = pd.Series(index=time_range, data=np.arange(0, len(time_range)))
        input_series[index_start:index_end] = input_series[index_start]
        expected_output = input_series.map(lambda _: False)

        static_mask = find_static_regions(
            input_series, min_repeats=min_repeats, max_diff=0.001
        )

        pd.testing.assert_series_equal(expected_output, static_mask, check_names=False)

    def test_static_period_at_the_end(self):
        time_range = pd.date_range(
            start="2023-01-26", end="2023-01-28", freq="h", tz="utc", inclusive="left"
        )
        index_start = 23
        min_repeats = 4
        expected_filter_start = 27
        input_series = pd.Series(index=time_range, data=np.arange(0, len(time_range)))
        input_series[index_start:] = input_series[index_start]
        expected_output = input_series.map(lambda _: False)
        expected_output[expected_filter_start:] = True

        static_mask = find_static_regions(
            input_series, min_repeats=min_repeats, max_diff=0.001
        )

        pd.testing.assert_series_equal(expected_output, static_mask, check_names=False)

    def test_dont_filter_nan_values(self):
        time_range = pd.date_range(
            start="2023-01-26", end="2023-01-27", freq="h", tz="utc", inclusive="left"
        )
        input_series = pd.Series(
            index=time_range, data=np.zeros_like(time_range, dtype="float")
        )
        min_repeats = 4
        input_series[:] = np.nan
        input_series[9] = -11
        input_series[10:12] = -10
        input_series[15] = -9
        # There are >=4 repeats if the nan values are forward filled. [10:15] == -10
        # The output mask shouldn't filter nan values.
        expected_output = input_series.map(lambda _: False)

        static_mask = find_static_regions(
            input_series, min_repeats=min_repeats, max_diff=0.001
        )

        pd.testing.assert_series_equal(expected_output, static_mask, check_names=False)

    def test_series_with_nan_values_between_static_values(self):
        time_range = pd.date_range(
            start="2023-01-26", end="2023-01-27", freq="h", tz="utc", inclusive="left"
        )
        values = np.zeros_like(time_range, dtype="float")
        values[:] = np.nan
        values[9] = -11
        values[16] = -11
        values[17] = -9
        series = pd.Series(index=time_range, data=values)
        expected_mask = np.zeros_like(values, dtype="bool")
        period = 4
        # The value at index 16 is the same as at index 9, and the gap is longer than the period.
        # Note: the static region mask shall not filter nan values.
        expected_mask[16] = True

        output_mask = find_static_regions(series, min_repeats=period, max_diff=0.01)

        np.testing.assert_equal(expected_mask, output_mask)
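
To illustrate the NaN handling that the last two tests exercise (a sketch, not part of the commit; the numbers mirror test_series_with_nan_values_between_static_values): NaNs are forward-filled before differencing, so a value repeated across a NaN gap still counts towards the static period, but the NaN samples themselves are never flagged.

import numpy as np
import pandas as pd

from pypromice.qc.static_qc import find_static_regions

time_range = pd.date_range("2023-01-26", periods=24, freq="h", tz="utc")
values = np.full(len(time_range), np.nan)
values[9] = -11.0   # first occurrence
values[16] = -11.0  # same value again after a six-hour NaN gap
values[17] = -9.0
series = pd.Series(values, index=time_range)

mask = find_static_regions(series, min_repeats=4, max_diff=0.01)
print(mask[mask].index)  # only the sample at hour 16 is flagged; the NaN samples are not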