diff --git a/.github/workflows/process_test.yml b/.github/workflows/process_test.yml index 20e6a3ca..e5bf3362 100644 --- a/.github/workflows/process_test.yml +++ b/.github/workflows/process_test.yml @@ -11,7 +11,7 @@ jobs: - name: Install Python uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.10" - name: Checkout repo uses: actions/checkout@v3 with: diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml index 9664bccb..14aa27e8 100644 --- a/.github/workflows/unit_test.yml +++ b/.github/workflows/unit_test.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python_version: ['3.10','3.11'] + python_version: ['3.10', '3.11'] steps: - name: Install Python uses: actions/setup-python@v4 diff --git a/src/pypromice/qc/persistence.py b/src/pypromice/qc/persistence.py index 963ff786..82fe6df8 100644 --- a/src/pypromice/qc/persistence.py +++ b/src/pypromice/qc/persistence.py @@ -9,20 +9,40 @@ "persistence_qc", "find_persistent_regions", "count_consecutive_persistent_values", - "count_consecutive_true", + "get_duration_consecutive_true", ] logger = logging.getLogger(__name__) # period is given in hours, 2 persistent 10 min values will be flagged if period < 0.333 DEFAULT_VARIABLE_THRESHOLDS = { - "t": {"max_diff": 0.0001, "period": 2}, - "p": {"max_diff": 0.0001, "period": 2}, - 'gps_lat_lon':{"max_diff": 0.000001, "period": 6}, # gets special handling to remove simultaneously constant gps_lat and gps_lon - 'gps_alt':{"max_diff": 0.0001, "period": 6}, - 't_rad':{"max_diff": 0.0001, "period": 2}, - "rh": {"max_diff": 0.0001, "period": 2}, # gets special handling to allow constant 100% - "wspd": {"max_diff": 0.0001, "period": 6}, + "t_i": {"max_diff": 0.0001, "period": 2}, + "t_u": {"max_diff": 0.0001, "period": 2}, + "t_l": {"max_diff": 0.0001, "period": 2}, + "p_i": {"max_diff": 0.0001, "period": 2}, + # "p_u": {"max_diff": 0.0001, "period": 2}, + # "p_l": {"max_diff": 0.0001, "period": 2}, + 
"gps_lat_lon": { + "max_diff": 0.000001, + "period": 6, + }, # gets special handling to remove simultaneously constant gps_lat and gps_lon + "gps_alt": {"max_diff": 0.0001, "period": 6}, + "t_rad": {"max_diff": 0.0001, "period": 2}, + "rh_i": { + "max_diff": 0.0001, + "period": 2, + }, # gets special handling to allow constant 100% + "rh_u": { + "max_diff": 0.0001, + "period": 2, + }, # gets special handling to allow constant 100% + "rh_l": { + "max_diff": 0.0001, + "period": 2, + }, # gets special handling to allow constant 100% + "wspd_i": {"max_diff": 0.0001, "period": 6}, + "wspd_u": {"max_diff": 0.0001, "period": 6}, + "wspd_l": {"max_diff": 0.0001, "period": 6}, } @@ -65,7 +85,7 @@ def persistence_qc( logger.info(f"Running persistence_qc using {variable_thresholds}") for k in variable_thresholds.keys(): - if k in ['t','p','rh','wspd','wdir', 'z_boom']: + if k in ["t", "p", "rh", "wspd", "wdir", "z_boom"]: var_all = [ k + "_u", k + "_l", @@ -79,8 +99,8 @@ def persistence_qc( for v in var_all: if v in df: mask = find_persistent_regions(df[v], period, max_diff) - if 'rh' in v: - mask = mask & (df[v]<99) + if "rh" in v: + mask = mask & (df[v] < 99) n_masked = mask.sum() n_samples = len(mask) logger.debug( @@ -88,11 +108,10 @@ def persistence_qc( ) # setting outliers to NaN df.loc[mask, v] = np.nan - elif v == 'gps_lat_lon': - mask = ( - find_persistent_regions(df['gps_lon'], period, max_diff) - & find_persistent_regions(df['gps_lat'], period, max_diff) - ) + elif v == "gps_lat_lon": + mask = find_persistent_regions( + df["gps_lon"], period, max_diff + ) & find_persistent_regions(df["gps_lat"], period, max_diff) n_masked = mask.sum() n_samples = len(mask) @@ -100,8 +119,8 @@ def persistence_qc( f"Applying persistent QC in {v}. 
Filtering {n_masked}/{n_samples} samples" ) # setting outliers to NaN - df.loc[mask, 'gps_lon'] = np.nan - df.loc[mask, 'gps_lat'] = np.nan + df.loc[mask, "gps_lon"] = np.nan + df.loc[mask, "gps_lat"] = np.nan # Back to xarray, and re-assign the original attrs ds_out = df.to_xarray() @@ -133,19 +152,21 @@ def count_consecutive_persistent_values( ) -> pd.Series: diff = data.ffill().diff().abs() # forward filling all NaNs! mask: pd.Series = diff < max_diff - return duration_consecutive_true(mask) + return get_duration_consecutive_true(mask) -def duration_consecutive_true( +def get_duration_consecutive_true( series: pd.Series, ) -> pd.Series: """ - From a boolean series, calculates the duration, in hours, of the periods with connective true values. + From a boolean series, calculates the duration, in hours, of the periods with consecutive true values. + + The first value will be set to NaN, as it is not possible to calculate the duration of a single value. Examples -------- - >>> duration_consecutive_true(pd.Series([False, True, False, False, True, True, True, False, True])) - pd.Series([0, 1, 0, 0, 1, 2, 3, 0, 1]) + >>> get_duration_consecutive_true(pd.Series([False, True, False, False, True, True, True, False, True])) + pd.Series([np.nan, 1, 0, 0, 1, 2, 3, 0, 1]) Parameters ---------- @@ -158,9 +179,11 @@ def duration_consecutive_true( Integer pandas Series or DataFrame with values representing the number of connective true values. 
""" - # assert series.dtype == bool - cumsum = ((series.index - series.index[0]).total_seconds()/3600).to_series(index=series.index) is_first = series.astype("int").diff() == 1 - offset = (is_first * cumsum).replace(0, np.nan).ffill().fillna(0) + delta_time = (series.index.diff().total_seconds() / 3600).to_series( + index=series.index + ) + cumsum = delta_time.cumsum() + offset = (is_first * (cumsum - delta_time)).replace(0, np.nan).ffill().fillna(0) return (cumsum - offset) * series diff --git a/tests/unit/qc/__init__.py b/tests/unit/qc/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/pypromice/qc/persistence_test.py b/tests/unit/qc/test_persistence.py similarity index 73% rename from src/pypromice/qc/persistence_test.py rename to tests/unit/qc/test_persistence.py index 5cd3d928..d343b0bc 100644 --- a/src/pypromice/qc/persistence_test.py +++ b/tests/unit/qc/test_persistence.py @@ -1,9 +1,9 @@ import unittest import numpy as np -import numpy.testing import pandas as pd +from pypromice.qc import persistence from pypromice.qc.persistence import find_persistent_regions @@ -32,7 +32,9 @@ def _test_1_hour_repeat(self, index: int): input_series, min_repeats=min_repeats, max_diff=0.001 ) - pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False) + pd.testing.assert_series_equal( + expected_output, persistent_mask, check_names=False + ) def test_no_persistent_period(self): time_range = pd.date_range( @@ -46,7 +48,9 @@ def test_no_persistent_period(self): input_series, min_repeats=min_repeats, max_diff=0.001 ) - pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False) + pd.testing.assert_series_equal( + expected_output, persistent_mask, check_names=False + ) def test_persistent_period_longer_than_period_threshold(self): time_range = pd.date_range( @@ -66,7 +70,9 @@ def test_persistent_period_longer_than_period_threshold(self): input_series, min_repeats=min_repeats, max_diff=0.001 ) - 
pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False) + pd.testing.assert_series_equal( + expected_output, persistent_mask, check_names=False + ) def test_period_threshold_longer_than_persistent_period(self): time_range = pd.date_range( @@ -83,7 +89,9 @@ def test_period_threshold_longer_than_persistent_period(self): input_series, min_repeats=min_repeats, max_diff=0.001 ) - pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False) + pd.testing.assert_series_equal( + expected_output, persistent_mask, check_names=False + ) def test_persistent_period_at_the_end(self): time_range = pd.date_range( @@ -101,7 +109,9 @@ def test_persistent_period_at_the_end(self): input_series, min_repeats=min_repeats, max_diff=0.001 ) - pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False) + pd.testing.assert_series_equal( + expected_output, persistent_mask, check_names=False + ) def test_dont_filter_nan_values(self): time_range = pd.date_range( @@ -123,7 +133,9 @@ def test_dont_filter_nan_values(self): input_series, min_repeats=min_repeats, max_diff=0.001 ) - pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False) + pd.testing.assert_series_equal( + expected_output, persistent_mask, check_names=False + ) def test_series_with_nan_values_between_persistent_values(self): time_range = pd.date_range( @@ -145,6 +157,40 @@ def test_series_with_nan_values_between_persistent_values(self): np.testing.assert_equal(expected_mask, output_mask) + def test_get_duration_consecutive_true(self): + delta_time_hours = np.random.random(24) * 2 + time_range = pd.to_datetime("2023-01-25") + pd.to_timedelta( + delta_time_hours.cumsum(), unit="h" + ) + values = time_range == False + values[0:2] = True + values[6] = True + values[10:14] = True + values[-3:] = True + series = pd.Series(index=time_range, data=values) + + duration_consecutive_true = persistence.get_duration_consecutive_true(series) + + 
self.assertTrue( + np.isnan(duration_consecutive_true[0]), "The first index should be ignored" + ) + np.testing.assert_almost_equal( + duration_consecutive_true[1], + delta_time_hours[1], + ) + np.testing.assert_almost_equal( + duration_consecutive_true[6], + delta_time_hours[6], + ) + np.testing.assert_almost_equal( + duration_consecutive_true[10:14], + delta_time_hours[10:14].cumsum(), + ) + np.testing.assert_almost_equal( + duration_consecutive_true[-3:], + delta_time_hours[-3:].cumsum(), + ) + if __name__ == "__main__": unittest.main()