Skip to content

Commit

Permalink
Merge branch 'develop' into feature/surface-heights-and-thermistor-de…
Browse files Browse the repository at this point in the history
…pths
  • Loading branch information
PennyHow authored Aug 8, 2024
2 parents 51194de + 37527b1 commit b4becc1
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 35 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/process_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
- name: Install Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
python-version: "3.10"
- name: Checkout repo
uses: actions/checkout@v3
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/unit_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python_version: ['3.10','3.11']
python_version: ['3.10', '3.11']
steps:
- name: Install Python
uses: actions/setup-python@v4
Expand Down
75 changes: 49 additions & 26 deletions src/pypromice/qc/persistence.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,40 @@
"persistence_qc",
"find_persistent_regions",
"count_consecutive_persistent_values",
"count_consecutive_true",
"get_duration_consecutive_true",
]

logger = logging.getLogger(__name__)

# period is given in hours, 2 persistent 10 min values will be flagged if period < 0.333
DEFAULT_VARIABLE_THRESHOLDS = {
"t": {"max_diff": 0.0001, "period": 2},
"p": {"max_diff": 0.0001, "period": 2},
'gps_lat_lon':{"max_diff": 0.000001, "period": 6}, # gets special handling to remove simultaneously constant gps_lat and gps_lon
'gps_alt':{"max_diff": 0.0001, "period": 6},
't_rad':{"max_diff": 0.0001, "period": 2},
"rh": {"max_diff": 0.0001, "period": 2}, # gets special handling to allow constant 100%
"wspd": {"max_diff": 0.0001, "period": 6},
"t_i": {"max_diff": 0.0001, "period": 2},
"t_u": {"max_diff": 0.0001, "period": 2},
"t_l": {"max_diff": 0.0001, "period": 2},
"p_i": {"max_diff": 0.0001, "period": 2},
# "p_u": {"max_diff": 0.0001, "period": 2},
# "p_l": {"max_diff": 0.0001, "period": 2},
"gps_lat_lon": {
"max_diff": 0.000001,
"period": 6,
}, # gets special handling to remove simultaneously constant gps_lat and gps_lon
"gps_alt": {"max_diff": 0.0001, "period": 6},
"t_rad": {"max_diff": 0.0001, "period": 2},
"rh_i": {
"max_diff": 0.0001,
"period": 2,
}, # gets special handling to allow constant 100%
"rh_u": {
"max_diff": 0.0001,
"period": 2,
}, # gets special handling to allow constant 100%
"rh_l": {
"max_diff": 0.0001,
"period": 2,
}, # gets special handling to allow constant 100%
"wspd_i": {"max_diff": 0.0001, "period": 6},
"wspd_u": {"max_diff": 0.0001, "period": 6},
"wspd_l": {"max_diff": 0.0001, "period": 6},
}


Expand Down Expand Up @@ -65,7 +85,7 @@ def persistence_qc(
logger.info(f"Running persistence_qc using {variable_thresholds}")

for k in variable_thresholds.keys():
if k in ['t','p','rh','wspd','wdir', 'z_boom']:
if k in ["t", "p", "rh", "wspd", "wdir", "z_boom"]:
var_all = [
k + "_u",
k + "_l",
Expand All @@ -79,29 +99,28 @@ def persistence_qc(
for v in var_all:
if v in df:
mask = find_persistent_regions(df[v], period, max_diff)
if 'rh' in v:
mask = mask & (df[v]<99)
if "rh" in v:
mask = mask & (df[v] < 99)
n_masked = mask.sum()
n_samples = len(mask)
logger.debug(
f"Applying persistent QC in {v}. Filtering {n_masked}/{n_samples} samples"
)
# setting outliers to NaN
df.loc[mask, v] = np.nan
elif v == 'gps_lat_lon':
mask = (
find_persistent_regions(df['gps_lon'], period, max_diff)
& find_persistent_regions(df['gps_lat'], period, max_diff)
)
elif v == "gps_lat_lon":
mask = find_persistent_regions(
df["gps_lon"], period, max_diff
) & find_persistent_regions(df["gps_lat"], period, max_diff)

n_masked = mask.sum()
n_samples = len(mask)
logger.debug(
f"Applying persistent QC in {v}. Filtering {n_masked}/{n_samples} samples"
)
# setting outliers to NaN
df.loc[mask, 'gps_lon'] = np.nan
df.loc[mask, 'gps_lat'] = np.nan
df.loc[mask, "gps_lon"] = np.nan
df.loc[mask, "gps_lat"] = np.nan

# Back to xarray, and re-assign the original attrs
ds_out = df.to_xarray()
Expand Down Expand Up @@ -133,19 +152,21 @@ def count_consecutive_persistent_values(
) -> pd.Series:
diff = data.ffill().diff().abs() # forward filling all NaNs!
mask: pd.Series = diff < max_diff
return duration_consecutive_true(mask)
return get_duration_consecutive_true(mask)


def duration_consecutive_true(
def get_duration_consecutive_true(
series: pd.Series,
) -> pd.Series:
"""
From a boolean series, calculates the duration, in hours, of the periods with connective true values.
From a boolean series, calculates the duration, in hours, of the periods with consecutive true values.
The first value will be set to NaN, as it is not possible to calculate the duration of a single value.
Examples
--------
>>> duration_consecutive_true(pd.Series([False, True, False, False, True, True, True, False, True]))
pd.Series([0, 1, 0, 0, 1, 2, 3, 0, 1])
>>> get_duration_consecutive_true(pd.Series([False, True, False, False, True, True, True, False, True]))
pd.Series([np.nan, 1, 0, 0, 1, 2, 3, 0, 1])
Parameters
----------
Expand All @@ -158,9 +179,11 @@ def duration_consecutive_true(
Integer pandas Series or DataFrame with values representing the number of consecutive true values.
"""
# assert series.dtype == bool
cumsum = ((series.index - series.index[0]).total_seconds()/3600).to_series(index=series.index)
is_first = series.astype("int").diff() == 1
offset = (is_first * cumsum).replace(0, np.nan).ffill().fillna(0)
delta_time = (series.index.diff().total_seconds() / 3600).to_series(
index=series.index
)
cumsum = delta_time.cumsum()
offset = (is_first * (cumsum - delta_time)).replace(0, np.nan).ffill().fillna(0)

return (cumsum - offset) * series
Empty file added tests/unit/qc/__init__.py
Empty file.
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import unittest

import numpy as np
import numpy.testing
import pandas as pd

from pypromice.qc import persistence
from pypromice.qc.persistence import find_persistent_regions


Expand Down Expand Up @@ -32,7 +32,9 @@ def _test_1_hour_repeat(self, index: int):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_no_persistent_period(self):
time_range = pd.date_range(
Expand All @@ -46,7 +48,9 @@ def test_no_persistent_period(self):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_persistent_period_longer_than_period_threshold(self):
time_range = pd.date_range(
Expand All @@ -66,7 +70,9 @@ def test_persistent_period_longer_than_period_threshold(self):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_period_threshold_longer_than_persistent_period(self):
time_range = pd.date_range(
Expand All @@ -83,7 +89,9 @@ def test_period_threshold_longer_than_persistent_period(self):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_persistent_period_at_the_end(self):
time_range = pd.date_range(
Expand All @@ -101,7 +109,9 @@ def test_persistent_period_at_the_end(self):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_dont_filter_nan_values(self):
time_range = pd.date_range(
Expand All @@ -123,7 +133,9 @@ def test_dont_filter_nan_values(self):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_series_with_nan_values_between_persistent_values(self):
time_range = pd.date_range(
Expand All @@ -145,6 +157,40 @@ def test_series_with_nan_values_between_persistent_values(self):

np.testing.assert_equal(expected_mask, output_mask)

def test_get_duration_consecutive_true(self):
    """Check run durations, in hours, on an irregularly sampled boolean series.

    A 24-sample series is built on a DatetimeIndex whose steps are random
    (0-2 h). True runs are placed at positions 0-1, 6, 10-13 and the last
    three samples. For each run the expected duration is the cumulative sum
    of the time steps inside the run; the very first sample is expected to
    be NaN because no preceding time step exists for it.
    """
    # Seeded generator: the spacing stays irregular, but any failure can be
    # reproduced exactly instead of depending on the global random state.
    rng = np.random.default_rng(seed=0)
    delta_time_hours = rng.random(24) * 2
    time_range = pd.to_datetime("2023-01-25") + pd.to_timedelta(
        delta_time_hours.cumsum(), unit="h"
    )

    # Start from an explicit all-False mask, then switch on the True runs.
    values = np.zeros(len(time_range), dtype=bool)
    values[0:2] = True
    values[6] = True
    values[10:14] = True
    values[-3:] = True
    series = pd.Series(index=time_range, data=values)

    duration_consecutive_true = persistence.get_duration_consecutive_true(series)

    # The result carries a DatetimeIndex, so positions are addressed through
    # .iloc; bare integer keys on a non-integer index are deprecated in pandas.
    self.assertTrue(
        np.isnan(duration_consecutive_true.iloc[0]),
        "The first index should be ignored",
    )
    np.testing.assert_almost_equal(
        duration_consecutive_true.iloc[1],
        delta_time_hours[1],
    )
    np.testing.assert_almost_equal(
        duration_consecutive_true.iloc[6],
        delta_time_hours[6],
    )
    np.testing.assert_almost_equal(
        duration_consecutive_true.iloc[10:14],
        delta_time_hours[10:14].cumsum(),
    )
    np.testing.assert_almost_equal(
        duration_consecutive_true.iloc[-3:],
        delta_time_hours[-3:].cumsum(),
    )


if __name__ == "__main__":
unittest.main()

0 comments on commit b4becc1

Please sign in to comment.