GEUS-Glaciology-and-Climate · ladsmund · Aug 8, 2024 · Aug 8, 2024 · Aug 8, 2024 · Aug 8, 2024
diff --git a/.github/workflows/process_test.yml b/.github/workflows/process_test.yml
@@ -11,7 +11,7 @@ jobs:
       - name: Install Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.8"        
+          python-version: "3.10"
       - name: Checkout repo
         uses: actions/checkout@v3
         with:

diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml
@@ -9,7 +9,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python_version: ['3.8','3.9','3.10']
+        python_version: ['3.10', '3.11']
     steps:
       - name: Install Python
         uses: actions/setup-python@v4

diff --git a/src/pypromice/qc/persistence.py b/src/pypromice/qc/persistence.py
@@ -9,20 +9,40 @@
     "persistence_qc",
     "find_persistent_regions",
     "count_consecutive_persistent_values",
-    "count_consecutive_true",
+    "get_duration_consecutive_true",
 ]
 
 logger = logging.getLogger(__name__)
 
 # period is given in hours, 2 persistent 10 min values will be flagged if period < 0.333
 DEFAULT_VARIABLE_THRESHOLDS = {
-    "t": {"max_diff": 0.0001, "period": 2},
-    "p": {"max_diff": 0.0001, "period": 2},
-    'gps_lat_lon':{"max_diff": 0.000001, "period": 6}, # gets special handling to remove simultaneously constant gps_lat and gps_lon
-    'gps_alt':{"max_diff": 0.0001, "period": 6},
-    't_rad':{"max_diff": 0.0001, "period": 2},
-    "rh": {"max_diff": 0.0001, "period": 2}, # gets special handling to allow constant 100%
-    "wspd": {"max_diff": 0.0001, "period": 6},
+    "t_i": {"max_diff": 0.0001, "period": 2},
+    "t_u": {"max_diff": 0.0001, "period": 2},
+    "t_l": {"max_diff": 0.0001, "period": 2},
+    "p_i": {"max_diff": 0.0001, "period": 2},
+    # "p_u": {"max_diff": 0.0001, "period": 2},
+    # "p_l": {"max_diff": 0.0001, "period": 2},
+    "gps_lat_lon": {
+        "max_diff": 0.000001,
+        "period": 6,
+    },  # gets special handling to remove simultaneously constant gps_lat and gps_lon
+    "gps_alt": {"max_diff": 0.0001, "period": 6},
+    "t_rad": {"max_diff": 0.0001, "period": 2},
+    "rh_i": {
+        "max_diff": 0.0001,
+        "period": 2,
+    },  # gets special handling to allow constant 100%
+    "rh_u": {
+        "max_diff": 0.0001,
+        "period": 2,
+    },  # gets special handling to allow constant 100%
+    "rh_l": {
+        "max_diff": 0.0001,
+        "period": 2,
+    },  # gets special handling to allow constant 100%
+    "wspd_i": {"max_diff": 0.0001, "period": 6},
+    "wspd_u": {"max_diff": 0.0001, "period": 6},
+    "wspd_l": {"max_diff": 0.0001, "period": 6},
 }
 
 
@@ -65,7 +85,7 @@ def persistence_qc(
     logger.info(f"Running persistence_qc using {variable_thresholds}")
 
     for k in variable_thresholds.keys():
-        if k in ['t','p','rh','wspd','wdir', 'z_boom']:
+        if k in ["t", "p", "rh", "wspd", "wdir", "z_boom"]:
             var_all = [
                 k + "_u",
                 k + "_l",
@@ -79,29 +99,28 @@ def persistence_qc(
         for v in var_all:
             if v in df:
                 mask = find_persistent_regions(df[v], period, max_diff)
-                if 'rh' in v:
-                    mask = mask & (df[v]<99)
+                if "rh" in v:
+                    mask = mask & (df[v] < 99)
                 n_masked = mask.sum()
                 n_samples = len(mask)
                 logger.debug(
                     f"Applying persistent QC in {v}. Filtering {n_masked}/{n_samples} samples"
                 )
                 # setting outliers to NaN
                 df.loc[mask, v] = np.nan
-            elif v == 'gps_lat_lon':
-                mask = (
-                    find_persistent_regions(df['gps_lon'], period, max_diff)
-                    & find_persistent_regions(df['gps_lat'], period, max_diff) 
-                )
+            elif v == "gps_lat_lon":
+                mask = find_persistent_regions(
+                    df["gps_lon"], period, max_diff
+                ) & find_persistent_regions(df["gps_lat"], period, max_diff)
 
                 n_masked = mask.sum()
                 n_samples = len(mask)
                 logger.debug(
                     f"Applying persistent QC in {v}. Filtering {n_masked}/{n_samples} samples"
                 )
                 # setting outliers to NaN
-                df.loc[mask, 'gps_lon'] = np.nan
-                df.loc[mask, 'gps_lat'] = np.nan
+                df.loc[mask, "gps_lon"] = np.nan
+                df.loc[mask, "gps_lat"] = np.nan
 
     # Back to xarray, and re-assign the original attrs
     ds_out = df.to_xarray()
@@ -133,19 +152,21 @@ def count_consecutive_persistent_values(
 ) -> pd.Series:
     diff = data.ffill().diff().abs()  # forward filling all NaNs!
     mask: pd.Series = diff < max_diff
-    return duration_consecutive_true(mask)
+    return get_duration_consecutive_true(mask)
 
 
-def duration_consecutive_true(
+def get_duration_consecutive_true(
     series: pd.Series,
 ) -> pd.Series:
     """
-    From a boolean series, calculates the duration, in hours, of the periods with connective true values.
+    From a boolean series, calculates the duration, in hours, of the periods with consecutive true values.
+
+    The first value will be set to NaN, because there is no preceding timestamp from which to compute its time step.
 
     Examples
     --------
-    >>> duration_consecutive_true(pd.Series([False, True, False, False, True, True, True, False, True]))
-    pd.Series([0, 1, 0, 0, 1, 2, 3, 0, 1])
+    >>> get_duration_consecutive_true(pd.Series([False, True, False, False, True, True, True, False, True]))
+    pd.Series([np.nan, 1, 0, 0, 1, 2, 3, 0, 1])
 
     Parameters
     ----------
@@ -158,9 +179,11 @@ def duration_consecutive_true(
         Integer pandas Series or DataFrame with values representing the number of connective true values.
 
     """
-    # assert series.dtype == bool
-    cumsum = ((series.index - series.index[0]).total_seconds()/3600).to_series(index=series.index)
     is_first = series.astype("int").diff() == 1
-    offset = (is_first * cumsum).replace(0, np.nan).ffill().fillna(0)
+    delta_time = (series.index.diff().total_seconds() / 3600).to_series(
+        index=series.index
+    )
+    cumsum = delta_time.cumsum()
+    offset = (is_first * (cumsum - delta_time)).replace(0, np.nan).ffill().fillna(0)
 
     return (cumsum - offset) * series
diff --git a/tests/unit/qc/__init__.py b/tests/unit/qc/__init__.py
diff --git a/src/pypromice/qc/persistence_test.py → tests/unit/qc/test_persistence.py b/src/pypromice/qc/persistence_test.py → tests/unit/qc/test_persistence.py
@@ -1,9 +1,9 @@
 import unittest
 
 import numpy as np
-import numpy.testing
 import pandas as pd
 
+from pypromice.qc import persistence
 from pypromice.qc.persistence import find_persistent_regions
 
 
@@ -32,7 +32,9 @@ def _test_1_hour_repeat(self, index: int):
             input_series, min_repeats=min_repeats, max_diff=0.001
         )
 
-        pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
+        pd.testing.assert_series_equal(
+            expected_output, persistent_mask, check_names=False
+        )
 
     def test_no_persistent_period(self):
         time_range = pd.date_range(
@@ -46,7 +48,9 @@ def test_no_persistent_period(self):
             input_series, min_repeats=min_repeats, max_diff=0.001
         )
 
-        pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
+        pd.testing.assert_series_equal(
+            expected_output, persistent_mask, check_names=False
+        )
 
     def test_persistent_period_longer_than_period_threshold(self):
         time_range = pd.date_range(
@@ -66,7 +70,9 @@ def test_persistent_period_longer_than_period_threshold(self):
             input_series, min_repeats=min_repeats, max_diff=0.001
         )
 
-        pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
+        pd.testing.assert_series_equal(
+            expected_output, persistent_mask, check_names=False
+        )
 
     def test_period_threshold_longer_than_persistent_period(self):
         time_range = pd.date_range(
@@ -83,7 +89,9 @@ def test_period_threshold_longer_than_persistent_period(self):
             input_series, min_repeats=min_repeats, max_diff=0.001
         )
 
-        pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
+        pd.testing.assert_series_equal(
+            expected_output, persistent_mask, check_names=False
+        )
 
     def test_persistent_period_at_the_end(self):
         time_range = pd.date_range(
@@ -101,7 +109,9 @@ def test_persistent_period_at_the_end(self):
             input_series, min_repeats=min_repeats, max_diff=0.001
         )
 
-        pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
+        pd.testing.assert_series_equal(
+            expected_output, persistent_mask, check_names=False
+        )
 
     def test_dont_filter_nan_values(self):
         time_range = pd.date_range(
@@ -123,7 +133,9 @@ def test_dont_filter_nan_values(self):
             input_series, min_repeats=min_repeats, max_diff=0.001
         )
 
-        pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
+        pd.testing.assert_series_equal(
+            expected_output, persistent_mask, check_names=False
+        )
 
     def test_series_with_nan_values_between_persistent_values(self):
         time_range = pd.date_range(
@@ -145,6 +157,40 @@ def test_series_with_nan_values_between_persistent_values(self):
 
         np.testing.assert_equal(expected_mask, output_mask)
 
+    def test_get_duration_consecutive_true(self):
+        delta_time_hours = np.random.random(24) * 2
+        time_range = pd.to_datetime("2023-01-25") + pd.to_timedelta(
+            delta_time_hours.cumsum(), unit="h"
+        )
+        values = time_range == False
+        values[0:2] = True
+        values[6] = True
+        values[10:14] = True
+        values[-3:] = True
+        series = pd.Series(index=time_range, data=values)
+
+        duration_consecutive_true = persistence.get_duration_consecutive_true(series)
+
+        self.assertTrue(
+            np.isnan(duration_consecutive_true[0]), "The first index should be ignored"
+        )
+        np.testing.assert_almost_equal(
+            duration_consecutive_true[1],
+            delta_time_hours[1],
+        )
+        np.testing.assert_almost_equal(
+            duration_consecutive_true[6],
+            delta_time_hours[6],
+        )
+        np.testing.assert_almost_equal(
+            duration_consecutive_true[10:14],
+            delta_time_hours[10:14].cumsum(),
+        )
+        np.testing.assert_almost_equal(
+            duration_consecutive_true[-3:],
+            delta_time_hours[-3:].cumsum(),
+        )
+
 
 if __name__ == "__main__":
     unittest.main()