Skip to content

Commit

Permalink
Updated difference_qc
Browse files Browse the repository at this point in the history
* Added unit tests for testing boundary conditions
* Use max aggregation instead of sum to detect determine static data in window.
* Modifying the loop to fix issue where the last index wasn't processed.
  • Loading branch information
ladsmund committed Sep 8, 2023
1 parent 59ecf4f commit bce1098
Show file tree
Hide file tree
Showing 2 changed files with 159 additions and 14 deletions.
31 changes: 17 additions & 14 deletions src/pypromice/qc/difference.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,9 @@ def differenceQC(ds: xr.Dataset) -> xr.Dataset:

for v in var_all:
if v in df:
outliers = find_stationary_values(df[v], diff_period, static_limit)
df.loc[outliers.index, v] = np.nan # setting outliers to NaN
mask = find_static_regions(df[v], diff_period, static_limit)
# setting outliers to NaN
df.loc[mask, v] = np.nan

# Back to xarray, and re-assign the original attrs
ds_out = df.to_xarray()
Expand All @@ -58,18 +59,20 @@ def differenceQC(ds: xr.Dataset) -> xr.Dataset:
return ds_out


def find_stationary_values(
data: pd.Series,
diff_period: int,
static_limit: float,
def find_static_regions(
data: pd.Series,
diff_period: int,
static_limit: float,
) -> pd.Series:
diff = data.diff()
diff.fillna(method='ffill', inplace=True) # forward filling all NaNs!
"""
Algorithm that ensures values can stay the same within the outliers_mask
"""
diff = data.diff().fillna(method="ffill").abs() # forward filling all NaNs!
# Indexing is significantly faster on numpy arrays that pandas series
diff = np.array(diff)
outliers_mask = np.zeros_like(diff, dtype=bool)
for i, d in enumerate(diff): # algorithm that ensures values can stay the same within the outliers_mask
if i > (diff_period - 1):
if sum(abs(diff[i - diff_period:i])) < static_limit:
outliers_mask[i - diff_period:i] = True
outliers = data[outliers_mask] # finding outliers in dataframe
return outliers
for i in range(len(outliers_mask) - diff_period + 1):
i_end = i + diff_period
if max(diff[i:i_end]) < static_limit:
outliers_mask[i:i_end] = True
return pd.Series(index=data.index, data=outliers_mask)
142 changes: 142 additions & 0 deletions src/pypromice/qc/difference_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import datetime
import random
import unittest
from typing import Union

import numpy as np
import pandas as pd

from pypromice.qc.difference import find_static_regions


def get_random_datetime() -> datetime.datetime:
# Select random timestamp in the period 1970-2030
seconds_per_year = 365.25 * 24 * 3600
timestamp = 60 * seconds_per_year * random.random()
return datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)


def get_random_timeseries(
start: Union[str, datetime.datetime],
period: datetime.timedelta = datetime.timedelta(days=5),
freq="1h",
offset: float = -20,
amplitude: float = 5,
) -> pd.Series:
end = start + period
time_range = pd.date_range(start=start, end=end, freq=freq, tz="utc")
steps_per_day = "1d" / time_range.freq
phase = np.random.random() * 2 * np.pi
x = 2 * np.pi * np.arange(len(time_range)) / steps_per_day
data = amplitude * np.sin(x + phase) + offset
return pd.Series(index=time_range, data=data, name="data")


class DifferenceQATestCase(unittest.TestCase):
def test_1_hour_static(self):
series = get_random_timeseries(
start=get_random_datetime(),
period=datetime.timedelta(days=5),
freq="1h",
)
index = 24
series.iloc[index] = series.iloc[index - 1]

mask = find_static_regions(series, diff_period=1, static_limit=0.001)

self.assertEqual(1, mask.sum())
self.assertTrue(1, mask.iloc[index])

def test_1_hour_second_index(self):
series = get_random_timeseries(
start=get_random_datetime(),
period=datetime.timedelta(days=5),
freq="1h",
)
index = 1
series.iloc[index] = series.iloc[index - 1]

mask = find_static_regions(series, diff_period=1, static_limit=0.001)

self.assertEqual(1, mask.sum())
self.assertTrue(1, mask.iloc[index])

def test_1_hour_last_index(self):
series = get_random_timeseries(
start=get_random_datetime(),
period=datetime.timedelta(days=5),
freq="1h",
)
index = -1
series.iloc[index] = series.iloc[index - 1]

mask = find_static_regions(series, diff_period=1, static_limit=0.001)

self.assertEqual(1, mask.sum())
self.assertTrue(1, mask.iloc[index])

def test_no_static_period(self):
series = get_random_timeseries(
start=get_random_datetime(),
period=datetime.timedelta(days=5),
freq="1h",
)

static_mask = find_static_regions(series, diff_period=1, static_limit=0.001)

pd.testing.assert_series_equal(
pd.Series(index=static_mask.index, data=False),
static_mask,
check_names=False,
)

def test_static_period_longer_than_diff(self):
series = get_random_timeseries(
start=get_random_datetime(),
period=datetime.timedelta(days=100),
freq="1h",
)
index_start = 41
index_length = 30
index_end = index_start + index_length
series.iloc[index_start:index_end] = series.iloc[index_start - 1]

static_mask = find_static_regions(series, diff_period=24, static_limit=0.001)

self.assertEqual(
index_length,
static_mask.sum(),
)
self.assertEqual(
index_length,
static_mask.iloc[index_start:index_end].sum(),
)

def test_diff_period_longer_than_static_period(self):
series = get_random_timeseries(
start=get_random_datetime(),
period=datetime.timedelta(days=100),
freq="1h",
)
index_start = 41
index_length = 30
index_end = index_start + index_length
series.iloc[index_start:index_end] = series.iloc[index_start - 1]

static_mask = find_static_regions(series, diff_period=31, static_limit=0.001)

self.assertEqual(0, static_mask.sum())

def test_static_period_at_the_end(self):
series = get_random_timeseries(
start=get_random_datetime(),
period=datetime.timedelta(days=5),
freq="1h",
)
index_length = 14
series.iloc[-index_length:] = series.iloc[-index_length - 1]

static_mask = find_static_regions(series, diff_period=10, static_limit=0.001)

self.assertEqual(index_length, static_mask.sum())
self.assertEqual(index_length, static_mask.iloc[-index_length:].sum())

0 comments on commit bce1098

Please sign in to comment.