Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix/correct persistent qc #281

Merged
merged 3 commits into from
Aug 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/process_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
- name: Install Python
uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.10"
- name: Checkout repo
uses: actions/checkout@v3
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/unit_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python_version: ['3.8','3.9','3.10']
python_version: ['3.10', '3.11']
steps:
- name: Install Python
uses: actions/setup-python@v4
Expand Down
75 changes: 49 additions & 26 deletions src/pypromice/qc/persistence.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,40 @@
"persistence_qc",
"find_persistent_regions",
"count_consecutive_persistent_values",
"count_consecutive_true",
"get_duration_consecutive_true",
]

logger = logging.getLogger(__name__)

# period is given in hours, 2 persistent 10 min values will be flagged if period < 0.333
DEFAULT_VARIABLE_THRESHOLDS = {
"t": {"max_diff": 0.0001, "period": 2},
"p": {"max_diff": 0.0001, "period": 2},
'gps_lat_lon':{"max_diff": 0.000001, "period": 6}, # gets special handling to remove simultaneously constant gps_lat and gps_lon
'gps_alt':{"max_diff": 0.0001, "period": 6},
't_rad':{"max_diff": 0.0001, "period": 2},
"rh": {"max_diff": 0.0001, "period": 2}, # gets special handling to allow constant 100%
"wspd": {"max_diff": 0.0001, "period": 6},
"t_i": {"max_diff": 0.0001, "period": 2},
"t_u": {"max_diff": 0.0001, "period": 2},
"t_l": {"max_diff": 0.0001, "period": 2},
"p_i": {"max_diff": 0.0001, "period": 2},
# "p_u": {"max_diff": 0.0001, "period": 2},
# "p_l": {"max_diff": 0.0001, "period": 2},
"gps_lat_lon": {
"max_diff": 0.000001,
"period": 6,
}, # gets special handling to remove simultaneously constant gps_lat and gps_lon
"gps_alt": {"max_diff": 0.0001, "period": 6},
"t_rad": {"max_diff": 0.0001, "period": 2},
"rh_i": {
"max_diff": 0.0001,
"period": 2,
}, # gets special handling to allow constant 100%
"rh_u": {
"max_diff": 0.0001,
"period": 2,
}, # gets special handling to allow constant 100%
"rh_l": {
"max_diff": 0.0001,
"period": 2,
}, # gets special handling to allow constant 100%
"wspd_i": {"max_diff": 0.0001, "period": 6},
"wspd_u": {"max_diff": 0.0001, "period": 6},
"wspd_l": {"max_diff": 0.0001, "period": 6},
}


Expand Down Expand Up @@ -65,7 +85,7 @@ def persistence_qc(
logger.info(f"Running persistence_qc using {variable_thresholds}")

for k in variable_thresholds.keys():
if k in ['t','p','rh','wspd','wdir', 'z_boom']:
if k in ["t", "p", "rh", "wspd", "wdir", "z_boom"]:
var_all = [
k + "_u",
k + "_l",
Expand All @@ -79,29 +99,28 @@ def persistence_qc(
for v in var_all:
if v in df:
mask = find_persistent_regions(df[v], period, max_diff)
if 'rh' in v:
mask = mask & (df[v]<99)
if "rh" in v:
mask = mask & (df[v] < 99)
n_masked = mask.sum()
n_samples = len(mask)
logger.debug(
f"Applying persistent QC in {v}. Filtering {n_masked}/{n_samples} samples"
)
# setting outliers to NaN
df.loc[mask, v] = np.nan
elif v == 'gps_lat_lon':
mask = (
find_persistent_regions(df['gps_lon'], period, max_diff)
& find_persistent_regions(df['gps_lat'], period, max_diff)
)
elif v == "gps_lat_lon":
mask = find_persistent_regions(
df["gps_lon"], period, max_diff
) & find_persistent_regions(df["gps_lat"], period, max_diff)

n_masked = mask.sum()
n_samples = len(mask)
logger.debug(
f"Applying persistent QC in {v}. Filtering {n_masked}/{n_samples} samples"
)
# setting outliers to NaN
df.loc[mask, 'gps_lon'] = np.nan
df.loc[mask, 'gps_lat'] = np.nan
df.loc[mask, "gps_lon"] = np.nan
df.loc[mask, "gps_lat"] = np.nan

# Back to xarray, and re-assign the original attrs
ds_out = df.to_xarray()
Expand Down Expand Up @@ -133,19 +152,21 @@ def count_consecutive_persistent_values(
) -> pd.Series:
diff = data.ffill().diff().abs() # forward filling all NaNs!
mask: pd.Series = diff < max_diff
return duration_consecutive_true(mask)
return get_duration_consecutive_true(mask)


def duration_consecutive_true(
def get_duration_consecutive_true(
series: pd.Series,
) -> pd.Series:
"""
From a boolean series, calculates the duration, in hours, of the periods with connective true values.
From a boolean series, calculates the duration, in hours, of the periods with consecutive true values.

The first value will be set to NaN, as it is not possible to calculate the duration of a single value.

Examples
--------
>>> duration_consecutive_true(pd.Series([False, True, False, False, True, True, True, False, True]))
pd.Series([0, 1, 0, 0, 1, 2, 3, 0, 1])
>>> get_duration_consecutive_true(pd.Series([False, True, False, False, True, True, True, False, True]))
pd.Series([np.nan, 1, 0, 0, 1, 2, 3, 0, 1])

Parameters
----------
Expand All @@ -158,9 +179,11 @@ def duration_consecutive_true(
Float pandas Series or DataFrame with values representing the duration, in hours, of consecutive true values.

"""
# assert series.dtype == bool
cumsum = ((series.index - series.index[0]).total_seconds()/3600).to_series(index=series.index)
is_first = series.astype("int").diff() == 1
offset = (is_first * cumsum).replace(0, np.nan).ffill().fillna(0)
delta_time = (series.index.diff().total_seconds() / 3600).to_series(
index=series.index
)
cumsum = delta_time.cumsum()
offset = (is_first * (cumsum - delta_time)).replace(0, np.nan).ffill().fillna(0)

return (cumsum - offset) * series
Empty file added tests/unit/qc/__init__.py
Empty file.
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import unittest

import numpy as np
import numpy.testing
import pandas as pd

from pypromice.qc import persistence
from pypromice.qc.persistence import find_persistent_regions


Expand Down Expand Up @@ -32,7 +32,9 @@ def _test_1_hour_repeat(self, index: int):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_no_persistent_period(self):
time_range = pd.date_range(
Expand All @@ -46,7 +48,9 @@ def test_no_persistent_period(self):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_persistent_period_longer_than_period_threshold(self):
time_range = pd.date_range(
Expand All @@ -66,7 +70,9 @@ def test_persistent_period_longer_than_period_threshold(self):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_period_threshold_longer_than_persistent_period(self):
time_range = pd.date_range(
Expand All @@ -83,7 +89,9 @@ def test_period_threshold_longer_than_persistent_period(self):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_persistent_period_at_the_end(self):
time_range = pd.date_range(
Expand All @@ -101,7 +109,9 @@ def test_persistent_period_at_the_end(self):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_dont_filter_nan_values(self):
time_range = pd.date_range(
Expand All @@ -123,7 +133,9 @@ def test_dont_filter_nan_values(self):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_series_with_nan_values_between_persistent_values(self):
time_range = pd.date_range(
Expand All @@ -145,6 +157,40 @@ def test_series_with_nan_values_between_persistent_values(self):

np.testing.assert_equal(expected_mask, output_mask)

def test_get_duration_consecutive_true(self):
    """Check the reported durations of consecutive True runs on an irregular time index.

    Builds a 24-sample boolean series whose timestamps are separated by
    random steps (in hours), marks a few runs as True, and verifies that
    ``get_duration_consecutive_true`` returns the accumulated time step of
    each run and NaN for the very first sample.
    """
    # Seeded generator: the sampling stays irregular, but a failure can be reproduced.
    rng = np.random.default_rng(seed=281)
    delta_time_hours = rng.random(24) * 2
    time_range = pd.to_datetime("2023-01-25") + pd.to_timedelta(
        delta_time_hours.cumsum(), unit="h"
    )

    # Start from an all-False mask and switch on the runs under test.
    values = np.zeros(len(time_range), dtype=bool)
    values[0:2] = True
    values[6] = True
    values[10:14] = True
    values[-3:] = True
    series = pd.Series(index=time_range, data=values)

    duration_consecutive_true = persistence.get_duration_consecutive_true(series)

    # The series is indexed by timestamps, so every lookup below is made
    # explicitly positional with .iloc instead of relying on the integer
    # fallback of Series.__getitem__.
    self.assertTrue(
        np.isnan(duration_consecutive_true.iloc[0]),
        "The first index should be ignored",
    )
    np.testing.assert_almost_equal(
        duration_consecutive_true.iloc[1],
        delta_time_hours[1],
    )
    np.testing.assert_almost_equal(
        duration_consecutive_true.iloc[6],
        delta_time_hours[6],
    )
    np.testing.assert_almost_equal(
        duration_consecutive_true.iloc[10:14],
        delta_time_hours[10:14].cumsum(),
    )
    np.testing.assert_almost_equal(
        duration_consecutive_true.iloc[-3:],
        delta_time_hours[-3:].cumsum(),
    )


if __name__ == "__main__":
unittest.main()
Loading