From 475805fb4093c3f2dd5ed7540c332a326e1f8151 Mon Sep 17 00:00:00 2001 From: Mads Christian Lund Date: Thu, 2 Nov 2023 16:09:25 +0100 Subject: [PATCH] Implemented outlier detector class for handling configuration state * Implemented core functionality for season based threshold filter * Implemented script to compute thresholds using historical L1 (inferred) data * Added default thresholds * Updated setup.py to allow thresholds data file --- setup.py | 1 + src/pypromice/process/L1toL2.py | 6 +- src/pypromice/qc/percentiles/__init__.py | 0 .../qc/percentiles/compute_thresholds.py | 221 +++++++++++++ .../qc/percentiles/outlier_detector.py | 114 +++++++ src/pypromice/qc/percentiles/thresholds.csv | 312 ++++++++++++++++++ src/pypromice/test/test_percentile.py | 186 +++++++++++ 7 files changed, 839 insertions(+), 1 deletion(-) create mode 100644 src/pypromice/qc/percentiles/__init__.py create mode 100644 src/pypromice/qc/percentiles/compute_thresholds.py create mode 100644 src/pypromice/qc/percentiles/outlier_detector.py create mode 100644 src/pypromice/qc/percentiles/thresholds.csv create mode 100644 src/pypromice/test/test_percentile.py diff --git a/setup.py b/setup.py index 6a7a5111..92b18e3c 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,7 @@ include_package_data = True, packages=setuptools.find_packages(where="src"), python_requires=">=3.8", + package_data={"pypromice.qc.percentiles": ["thresholds.csv"]}, install_requires=['numpy>=1.23.0', 'pandas>=1.5.0', 'xarray>=2022.6.0', 'toml', 'scipy>=1.9.0', 'scikit-learn>=1.1.0', 'Bottleneck', 'netcdf4', 'pyDataverse'], entry_points={ 'console_scripts': [ diff --git a/src/pypromice/process/L1toL2.py b/src/pypromice/process/L1toL2.py index 406b2bdb..a1839715 100644 --- a/src/pypromice/process/L1toL2.py +++ b/src/pypromice/process/L1toL2.py @@ -9,6 +9,7 @@ import xarray as xr from pypromice.qc.github_data_issues import flagNAN, adjustTime, adjustData +from pypromice.qc.percentiles.outlier_detector import ThresholdBasedOutlierDetector from pypromice.qc.persistence import persistence_qc from pypromice.process.value_clipping import clip_values @@ -66,7 +67,10 @@ def toL2( logger.exception('Flagging and fixing failed:') if ds.attrs['format'] == 'TX': - ds = persistence_qc(ds) # Detect and filter data points that seems to be static + ds = persistence_qc(ds) # Flag and remove persistence outliers + # TODO: The configuration should be provided explicitly + outlier_detector = ThresholdBasedOutlierDetector.default() + ds = outlier_detector.filter_data(ds) # Flag and remove percentile outliers T_100 = _getTempK(T_0) ds['rh_u_cor'] = correctHumidity(ds['rh_u'], ds['t_u'], diff --git a/src/pypromice/qc/percentiles/__init__.py b/src/pypromice/qc/percentiles/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/pypromice/qc/percentiles/compute_thresholds.py b/src/pypromice/qc/percentiles/compute_thresholds.py new file mode 100644 index 00000000..f7899472 --- /dev/null +++ b/src/pypromice/qc/percentiles/compute_thresholds.py @@ -0,0 +1,221 @@ +import sys +from datetime import datetime + +import pandas as pd + +from pypromice.process import AWS +from pathlib import Path +import logging +from pypromice.qc.github_data_issues import adjustTime, flagNAN, adjustData + + +# %% +logger = logging.getLogger("ComputeThreshold") + + +# %% +def compute_all_thresholds( + station_thresholds_root: Path, + thresholds_output_path: Path, + aws_l0_repo_path: Path, + start_time: datetime, + end_time: datetime, +): + logger.info("Computing all thresholds for stations available in the L0 repository") + logger.info(f"station_thresholds_root: {station_thresholds_root}") + logger.info(f"thresholds_output_path: {thresholds_output_path}") + logger.info(f"aws_l0_repo_path: {aws_l0_repo_path}") + logger.info(f"start_time: {start_time}") + logger.info(f"end_time: {end_time}") + + station_thresholds_root.mkdir(parents=True, exist_ok=True) + + # %% + output_paths = [] + for config_path in aws_l0_repo_path.glob("raw/config/*.toml"): + stid = config_path.stem + + logger.info(f"Processing {stid}") + data_path = aws_l0_repo_path.joinpath("raw", stid) + output_path = station_thresholds_root.joinpath(f"{stid}.csv") + try: + if not output_path.exists(): + threshold = find_thresholds( + stid, + config_path, + data_path, + start_time, + end_time, + ) + threshold.to_csv( + path_or_buf=output_path, index=False, float_format="{:.2f}".format + ) + output_paths.append(output_path) + except Exception: + logger.exception(f"Failed processing {stid}") + continue + + logger.info("Merge threshold files") + pd.concat(pd.read_csv(p) for p in output_paths).to_csv( + thresholds_output_path, index=False, float_format="{:.2f}".format + ) + + +def find_thresholds( + stid: str, + config_path: Path, + data_path: Path, + start_time: datetime, + end_time: datetime, +) -> pd.DataFrame: + """ + Compute variable threshold for a station using historical distribution quantiles. + + Parameters + ---------- + stid + config_path + data_path + start_time + end_time + + Returns + ------- + Upper and lower thresholds for a set of variables and seasons + + + """ + stid_logger = logger.getChild(stid) + # %% + + stid_logger.info("Read AWS data and get L1") + aws = AWS(config_file=config_path.as_posix(), inpath=data_path.as_posix()) + aws.getL1() + + # %% + stid_logger.info("Apply QC filters on data") + ds = aws.L1A.copy(deep=True) # Reassign dataset + ds = adjustTime(ds) # Adjust time after a user-defined csv files + ds = flagNAN(ds) # Flag NaNs after a user-defined csv files + ds = adjustData(ds) + + # %% + stid_logger.info("Determine thresholds") + df = ( + ds[["rh_u", "wspd_u", "p_u", "t_u"]] + .to_pandas() + .loc[start_time:end_time] + .assign(season=lambda df: (df.index.month // 3) % 4) + ) + + threshold_rows = [] + + # Pressure + p_lo, p_hi = df["p_u"].quantile([0.005, 0.995]) + [-12, 12] + threshold_rows.append( + dict( + stid=stid, + variable_pattern="p_[ul]", + lo=p_lo, + hi=p_hi, + ) + ) + threshold_rows.append( + dict( + stid=stid, + variable_pattern="p_i", + lo=p_lo - 1000, + hi=p_hi - 1000, + ) + ) + + # Wind speed + lo, hi = df["wspd_u"].quantile([0.005, 0.995]) + [-12, 12] + threshold_rows.append( + dict( + stid=stid, + variable_pattern="wspd_[uli]", + lo=lo, + hi=hi, + ) + ) + + # Temperature + season_map = ["winter", "spring", "summer", "fall"] + for season_index, season_df in df[["t_u", "season"]].groupby( + (df.index.month // 3) % 4 + ): + lo, hi = season_df.quantile([0.005, 0.995])["t_u"] + [-9, 9] + + threshold_rows.append( + dict( + stid=stid, + variable_pattern="t_[uli]", + season=season_map[season_index], + lo=lo, + hi=hi, + ) + ) + + threshold = pd.DataFrame(threshold_rows) + stid_logger.info(threshold) + return threshold + # %% + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--l0", + required=True, + type=Path, + help="L0 repository root path", + ) + parser.add_argument( + "--thresholds_output_path", + "-o", + default=Path(__file__).parent.joinpath("thresholds.csv"), + type=Path, + help="Output csv file with thresholds for all stations", + ) + parser.add_argument( + "--station_thresholds_root", + "--str", + default=Path(__file__).parent.joinpath("station_thresholds"), + type=Path, + help="Directory containing threshold files for the individual stations", + ) + parser.add_argument( + "--start_time", + default="2000-01-01", + help="Start time for data series. Format: %Y-%m-%d", + ) + parser.add_argument( + "--end_time", + default="2023-10-01", + help="End time for data series. Format: %Y-%m-%d", + ) + args = parser.parse_args() + + logging.basicConfig( + format="%(asctime)s; %(levelname)s; %(name)s; %(message)s", + level=logging.INFO, + stream=sys.stdout, + ) + + thresholds_output_path = args.thresholds_output_path + station_thresholds_root = args.station_thresholds_root + + start_time = datetime.strptime(args.start_time, "%Y-%m-%d") + end_time = datetime.strptime(args.end_time, "%Y-%m-%d") + aws_l0_repo_path = args.l0 + + compute_all_thresholds( + station_thresholds_root=station_thresholds_root, + thresholds_output_path=thresholds_output_path, + aws_l0_repo_path=aws_l0_repo_path, + start_time=start_time, + end_time=end_time, + ) diff --git a/src/pypromice/qc/percentiles/outlier_detector.py b/src/pypromice/qc/percentiles/outlier_detector.py new file mode 100644 index 00000000..dada58b6 --- /dev/null +++ b/src/pypromice/qc/percentiles/outlier_detector.py @@ -0,0 +1,114 @@ +from pathlib import Path + +import attrs +import numpy as np +import pandas as pd +import xarray as xr + + +__all__ = [ + "ThresholdBasedOutlierDetector" +] + +season_month_map = { + "winter": {12, 1, 2}, + "spring": {3, 4, 5}, + "summer": {6, 7, 8}, + "fall": {9, 10, 11}, +} + + +def get_season_index_mask(data_set: pd.DataFrame, season: str) -> np.ndarray[bool]: + season_months = season_month_map.get( + season, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12} + ) + return data_set.index.month.isin(season_months)[:, None] + + +def detect_outliers(data_set: pd.DataFrame, thresholds: pd.DataFrame) -> pd.DataFrame: + masks = [] + + season_index_mask = { + season: get_season_index_mask(data_set, season) + for season in thresholds["season"].unique() + } + + for variable_pattern, pattern_configs in thresholds.groupby("variable_pattern"): + df = data_set.filter(regex=f"^{variable_pattern}$") + mask = None + for _, season_config in pattern_configs.iterrows(): + threshold_mask = (df < season_config.lo) | (df > season_config.hi) + season_mask = threshold_mask & season_index_mask[season_config.season] + + if mask is None: + mask = season_mask + else: + mask |= season_mask + masks.append(mask) + + return pd.concat(masks, axis=1) + + +def filter_data(data_set: pd.DataFrame, thresholds: pd.DataFrame) -> pd.DataFrame: + mask = detect_outliers(data_set, thresholds) + output_data = data_set.copy() + output_data[mask] = np.nan + return output_data + + +@attrs.define +class ThresholdBasedOutlierDetector: + thresholds: pd.DataFrame = attrs.field() + + def filter_data(self, ds: xr.Dataset) -> xr.Dataset: + """ + Filter samples across all variables by assigning to nan + """ + stid = ds.station_id + + stid_thresholds = self.thresholds.query(f"stid == '{stid}'") + if not stid_thresholds.any(): + return ds + + data_df = ds.to_dataframe() # Switch to pandas + data_df = filter_data( + data_set=data_df, + thresholds=stid_thresholds, + ) + + ds_out: xr.Dataset = data_df.to_xarray() + ds_out = ds_out.assign_attrs(ds.attrs) # Dataset attrs + for x in ds_out.data_vars: # variable-specific attrs + ds_out[x].attrs = ds[x].attrs + + return ds_out + + @classmethod + def from_csv_config(cls, config_file: Path) -> 'ThresholdBasedOutlierDetector': + """ + Instantiate using explicit csv file with explicit thresholds + + The CSV file shall have the format: + + * Comma separated + * First row is header + * Columns + * stid: Station id + * variabel_pattern: regular expression filtering the variable name + * lo: Low threshold + * hi: High threshold + * season: The season of the filter: [, winter, spring, summer, fall]. The empty string means all seasons + + """ + return cls(thresholds=pd.read_csv(config_file)) + + @classmethod + def default(cls) -> 'ThresholdBasedOutlierDetector': + """ + Instantiate using aws thresholds stored in the python package. + Returns + ------- + + """ + default_thresholds_path = Path(__file__).parent.joinpath('thresholds.csv') + return cls.from_csv_config(default_thresholds_path) diff --git a/src/pypromice/qc/percentiles/thresholds.csv b/src/pypromice/qc/percentiles/thresholds.csv new file mode 100644 index 00000000..9bf50727 --- /dev/null +++ b/src/pypromice/qc/percentiles/thresholds.csv @@ -0,0 +1,312 @@ +stid,variable_pattern,lo,hi,season +NUK_K,p_[ul],877.96,958.79, +NUK_K,p_i,-122.04,-41.21, +NUK_K,wspd_[uli],-12.00,24.74, +NUK_K,t_[uli],-34.22,11.27,winter +NUK_K,t_[uli],-33.10,16.28,spring +NUK_K,t_[uli],-12.57,21.40,summer +NUK_K,t_[uli],-27.04,15.72,fall +SWC_O,p_[ul],831.20,909.80, +SWC_O,p_i,-168.80,-90.20, +SWC_O,wspd_[uli],-11.68,30.80, +SWC_O,t_[uli],-49.84,6.70,winter +SWC_O,t_[uli],-36.90,9.20,spring +SWC_O,t_[uli],-20.90,13.40,summer +SWC_O,t_[uli],-36.82,12.00,fall +KPC_L,p_[ul],924.86,1006.19, +KPC_L,p_i,-75.14,6.19, +KPC_L,wspd_[uli],-11.89,29.23, +KPC_L,t_[uli],-45.45,2.16,winter +KPC_L,t_[uli],-44.70,9.46,spring +KPC_L,t_[uli],-16.81,16.08,summer +KPC_L,t_[uli],-39.66,9.46,fall +EGP,p_[ul],675.50,752.80, +EGP,p_i,-324.50,-247.20, +EGP,wspd_[uli],-12.00,24.10, +EGP,t_[uli],-69.45,-10.18,winter +EGP,t_[uli],-66.58,1.41,spring +EGP,t_[uli],-37.66,8.45,summer +EGP,t_[uli],-62.60,4.00,fall +CEN2,p_[ul],762.74,826.00, +CEN2,p_i,-237.26,-174.00, +CEN2,wspd_[uli],-12.00,26.81, +CEN2,t_[uli],-36.00,-6.10,winter +CEN2,t_[uli],-32.30,2.20,spring +CEN2,t_[uli],-30.50,7.60,summer +CEN2,t_[uli],-53.70,-1.60,fall +MIT,p_[ul],900.19,993.97, +MIT,p_i,-99.81,-6.03, +MIT,wspd_[uli],-12.00,28.26, +MIT,t_[uli],-29.36,10.81,winter +MIT,t_[uli],-31.05,14.54,spring +MIT,t_[uli],-11.51,19.61,summer +MIT,t_[uli],-23.66,14.59,fall +TUN,p_[ul],744.83,807.00, +TUN,p_i,-255.17,-193.00, +TUN,wspd_[uli],-11.85,24.81, +TUN,t_[uli],-51.90,-3.65,winter +TUN,t_[uli],-55.90,1.40,spring +TUN,t_[uli],-31.70,8.90,summer +TUN,t_[uli],-56.40,8.10,fall +KPC_U,p_[ul],864.79,941.47, +KPC_U,p_i,-135.21,-58.53, +KPC_U,wspd_[uli],-12.00,27.73, +KPC_U,t_[uli],-51.46,-0.09,winter +KPC_U,t_[uli],-50.44,6.79,spring +KPC_U,t_[uli],-21.11,12.77,summer +KPC_U,t_[uli],-45.74,6.19,fall +UPE_L,p_[ul],940.00,1023.48, +UPE_L,p_i,-60.00,23.48, +UPE_L,wspd_[uli],-12.00,29.37, +UPE_L,t_[uli],-40.29,11.55,winter +UPE_L,t_[uli],-40.90,15.97,spring +UPE_L,t_[uli],-13.15,19.28,summer +UPE_L,t_[uli],-29.23,15.26,fall +KAN_M,p_[ul],816.70,896.46, +KAN_M,p_i,-183.30,-103.54, +KAN_M,wspd_[uli],-12.00,31.99, +KAN_M,t_[uli],-50.27,6.21,winter +KAN_M,t_[uli],-48.60,11.85,spring +KAN_M,t_[uli],-21.57,12.59,summer +KAN_M,t_[uli],-42.77,11.79,fall +QAS_A,p_[ul],838.61,919.19, +QAS_A,p_i,-161.39,-80.81, +QAS_A,wspd_[uli],-12.00,28.55, +QAS_A,t_[uli],-36.80,3.98,winter +QAS_A,t_[uli],-36.71,17.11,spring +QAS_A,t_[uli],-15.24,18.32,summer +QAS_A,t_[uli],-31.06,13.03,fall +NSE,p_[ul],694.70,784.80, +NSE,p_i,-305.30,-215.20, +NSE,wspd_[uli],-12.00,29.18, +NSE,t_[uli],-56.60,-2.40,winter +NSE,t_[uli],-53.54,-20.79,spring +NSE,t_[uli],-29.20,11.00,summer +NSE,t_[uli],-48.70,6.40,fall +UPE_U,p_[ul],852.55,932.91, +UPE_U,p_i,-147.45,-67.09, +UPE_U,wspd_[uli],-11.93,29.59, +UPE_U,t_[uli],-45.20,6.30,winter +UPE_U,t_[uli],-45.68,13.19,spring +UPE_U,t_[uli],-18.84,14.93,summer +UPE_U,t_[uli],-36.32,11.26,fall +QAS_M,p_[ul],884.52,970.17, +QAS_M,p_i,-115.48,-29.83, +QAS_M,wspd_[uli],-12.00,29.35, +QAS_M,t_[uli],-34.30,11.12,winter +QAS_M,t_[uli],-29.77,16.09,spring +QAS_M,t_[uli],-11.57,16.11,summer +QAS_M,t_[uli],-25.79,14.85,fall +QAS_L,p_[ul],924.21,1013.41, +QAS_L,p_i,-75.79,13.41, +QAS_L,wspd_[uli],-12.00,30.08, +QAS_L,t_[uli],-30.44,15.21,winter +QAS_L,t_[uli],-29.98,17.38,spring +QAS_L,t_[uli],-9.46,18.98,summer +QAS_L,t_[uli],-23.72,17.12,fall +NUK_L,p_[ul],897.14,980.03, +NUK_L,p_i,-102.86,-19.97, +NUK_L,wspd_[uli],-12.00,27.23, +NUK_L,t_[uli],-35.82,14.42,winter +NUK_L,t_[uli],-35.35,18.04,spring +NUK_L,t_[uli],-10.63,20.76,summer +NUK_L,t_[uli],-29.31,17.66,fall +NAU,p_[ul],715.60,783.20, +NAU,p_i,-284.40,-216.80, +NAU,wspd_[uli],-11.53,28.77, +NAU,t_[uli],-58.30,-2.90,winter +NAU,t_[uli],-49.70,1.50,spring +NAU,t_[uli],-34.79,7.90,summer +NAU,t_[uli],-55.70,9.50,fall +JAR,p_[ul],881.00,929.00, +JAR,p_i,-119.00,-71.00, +JAR,wspd_[uli],-11.62,29.82, +JAR,t_[uli],-14.60,13.30,summer +FRE,p_[ul],638.31,799.82, +FRE,p_i,-361.69,-200.18, +FRE,wspd_[uli],-12.00,22.62, +FRE,t_[uli],-38.20,10.02,winter +FRE,t_[uli],-36.55,18.07,spring +FRE,t_[uli],-12.61,22.82,summer +FRE,t_[uli],-33.93,17.69,fall +CP1,p_[ul],747.90,826.00, +CP1,p_i,-252.10,-174.00, +CP1,wspd_[uli],-12.00,31.70, +CP1,t_[uli],-58.30,1.10,winter +CP1,t_[uli],-46.40,6.70,spring +CP1,t_[uli],-26.70,10.80,summer +CP1,t_[uli],-49.30,10.40,fall +KAN_U,p_[ul],755.36,834.53, +KAN_U,p_i,-244.64,-165.47, +KAN_U,wspd_[uli],-12.00,33.94, +KAN_U,t_[uli],-53.53,2.20,winter +KAN_U,t_[uli],-52.57,10.38,spring +KAN_U,t_[uli],-26.67,11.61,summer +KAN_U,t_[uli],-47.97,9.90,fall +KAN_L,p_[ul],884.10,967.42, +KAN_L,p_i,-115.90,-32.58, +KAN_L,wspd_[uli],-12.00,28.45, +KAN_L,t_[uli],-41.57,11.24,winter +KAN_L,t_[uli],-40.69,15.09,spring +KAN_L,t_[uli],-13.58,16.95,summer +KAN_L,t_[uli],-33.55,15.56,fall +QAS_Uv3,p_[ul],859.94,946.06, +QAS_Uv3,p_i,-140.06,-53.94, +QAS_Uv3,wspd_[uli],-12.00,29.09, +QAS_Uv3,t_[uli],-36.70,7.70,winter +QAS_Uv3,t_[uli],-28.80,13.10,spring +QAS_Uv3,t_[uli],-12.29,15.20,summer +QAS_Uv3,t_[uli],-25.20,14.90,fall +ZAK_L,p_[ul],887.39,966.32, +ZAK_L,p_i,-112.61,-33.68, +ZAK_L,wspd_[uli],-11.97,25.32, +ZAK_L,t_[uli],-37.76,11.67,winter +ZAK_L,t_[uli],-35.50,16.92,spring +ZAK_L,t_[uli],-13.01,21.22,summer +ZAK_L,t_[uli],-33.80,14.72,fall +NEM,p_[ul],657.30,1013.98, +NEM,p_i,-342.70,13.98, +NEM,wspd_[uli],-12.00,25.85, +NEM,t_[uli],-30.58,7.20,summer +NEM,t_[uli],-58.60,-6.10,fall +NUK_U,p_[ul],832.02,911.63, +NUK_U,p_i,-167.98,-88.37, +NUK_U,wspd_[uli],-12.00,32.44, +NUK_U,t_[uli],-41.31,9.59,winter +NUK_U,t_[uli],-41.67,14.44,spring +NUK_U,t_[uli],-15.56,16.13,summer +NUK_U,t_[uli],-34.98,13.45,fall +TAS_A,p_[ul],847.23,942.82, +TAS_A,p_i,-152.77,-57.18, +TAS_A,wspd_[uli],-12.00,37.72, +TAS_A,t_[uli],-36.02,7.45,winter +TAS_A,t_[uli],-32.13,15.79,spring +TAS_A,t_[uli],-15.13,19.41,summer +TAS_A,t_[uli],-28.88,13.28,fall +HUM,p_[ul],669.63,963.00, +HUM,p_i,-330.37,-37.00, +HUM,wspd_[uli],-11.12,32.84, +HUM,t_[uli],-58.80,-2.63,winter +HUM,t_[uli],-27.90,9.70,summer +HUM,t_[uli],-56.20,9.10,fall +NUK_N,p_[ul],854.24,934.01, +NUK_N,p_i,-145.76,-65.99, +NUK_N,wspd_[uli],-12.00,32.87, +NUK_N,t_[uli],-37.90,7.89,winter +NUK_N,t_[uli],-35.67,15.68,spring +NUK_N,t_[uli],-16.90,14.72,summer +NUK_N,t_[uli],-32.96,12.92,fall +TAS_U,p_[ul],885.32,979.02, +TAS_U,p_i,-114.68,-20.98, +TAS_U,wspd_[uli],-12.00,38.30, +TAS_U,t_[uli],-31.91,10.66,winter +TAS_U,t_[uli],-33.16,13.20,spring +TAS_U,t_[uli],-12.88,16.75,summer +TAS_U,t_[uli],-25.74,14.06,fall +KAN_B,p_[ul],920.85,1004.05, +KAN_B,p_i,-79.15,4.05, +KAN_B,wspd_[uli],-12.00,24.36, +KAN_B,t_[uli],-40.72,12.98,winter +KAN_B,t_[uli],-40.23,19.02,spring +KAN_B,t_[uli],-10.71,22.39,summer +KAN_B,t_[uli],-32.22,18.16,fall +TAS_L,p_[ul],721.70,1021.25, +TAS_L,p_i,-278.30,21.25, +TAS_L,wspd_[uli],-12.00,38.73, +TAS_L,t_[uli],-29.33,12.68,winter +TAS_L,t_[uli],-30.18,18.26,spring +TAS_L,t_[uli],-11.96,18.32,summer +TAS_L,t_[uli],-22.81,15.56,fall +NAE,p_[ul],708.75,752.65, +NAE,p_i,-291.25,-247.35, +NAE,wspd_[uli],-11.33,23.10, +NAE,t_[uli],-18.36,5.64,spring +NAE,t_[uli],-21.17,6.71,summer +SCO_U,p_[ul],848.32,931.61, +SCO_U,p_i,-151.68,-68.39, +SCO_U,wspd_[uli],-11.81,22.90, +SCO_U,t_[uli],-41.38,3.73,winter +SCO_U,t_[uli],-41.01,13.32,spring +SCO_U,t_[uli],-15.98,15.61,summer +SCO_U,t_[uli],-36.27,12.11,fall +SWC,p_[ul],831.00,1037.70, +SWC,p_i,-169.00,37.70, +SWC,wspd_[uli],-12.00,30.26, +SWC,t_[uli],-43.60,5.80,winter +SWC,t_[uli],-50.30,11.70,spring +SWC,t_[uli],-19.20,12.90,summer +SWC,t_[uli],-40.10,9.60,fall +NUK_Uv3,p_[ul],830.00,910.00, +NUK_Uv3,p_i,-170.00,-90.00, +NUK_Uv3,wspd_[uli],-12.00,29.36, +NUK_Uv3,t_[uli],-43.00,7.40,winter +NUK_Uv3,t_[uli],-38.00,12.10,spring +NUK_Uv3,t_[uli],-16.30,15.10,summer +NUK_Uv3,t_[uli],-15.80,13.60,fall +DY2,p_[ul],730.70,810.50, +DY2,p_i,-269.30,-189.50, +DY2,wspd_[uli],-11.67,35.46, +DY2,t_[uli],-54.50,-0.90,winter +DY2,t_[uli],-47.30,8.60,spring +DY2,t_[uli],-27.90,10.70,summer +DY2,t_[uli],-45.80,11.70,fall +SDM,p_[ul],650.30,734.00, +SDM,p_i,-349.70,-266.00, +SDM,wspd_[uli],-12.00,32.30, +SDM,t_[uli],-53.80,-3.70,winter +SDM,t_[uli],-47.43,-9.40,spring +SDM,t_[uli],-30.20,10.50,summer +SDM,t_[uli],-46.80,9.90,fall +SCO_L,p_[ul],907.50,993.49, +SCO_L,p_i,-92.50,-6.51, +SCO_L,wspd_[uli],-12.00,21.02, +SCO_L,t_[uli],-39.09,6.94,winter +SCO_L,t_[uli],-37.55,14.65,spring +SCO_L,t_[uli],-12.10,17.57,summer +SCO_L,t_[uli],-32.92,12.85,fall +CEN1,p_[ul],750.48,827.52, +CEN1,p_i,-249.52,-172.48, +CEN1,wspd_[uli],-12.00,27.66, +CEN1,t_[uli],-58.65,-5.02,winter +CEN1,t_[uli],-59.82,4.05,spring +CEN1,t_[uli],-29.83,9.84,summer +CEN1,t_[uli],-51.78,1.83,fall +JAR_O,p_[ul],857.10,936.40, +JAR_O,p_i,-142.90,-63.60, +JAR_O,wspd_[uli],-12.00,29.85, +JAR_O,t_[uli],-46.70,7.90,winter +JAR_O,t_[uli],-35.10,10.00,spring +JAR_O,t_[uli],-18.90,14.50,summer +JAR_O,t_[uli],-33.50,12.70,fall +THU_U,p_[ul],874.94,953.23, +THU_U,p_i,-125.06,-46.77, +THU_U,wspd_[uli],-12.00,33.52, +THU_U,t_[uli],-45.91,1.58,winter +THU_U,t_[uli],-45.70,10.07,spring +THU_U,t_[uli],-17.64,17.01,summer +THU_U,t_[uli],-35.83,8.64,fall +THU_U2,p_[ul],875.63,956.76, +THU_U2,p_i,-124.37,-43.24, +THU_U2,wspd_[uli],-12.00,30.97, +THU_U2,t_[uli],-50.90,4.42,winter +THU_U2,t_[uli],-45.73,11.36,spring +THU_U2,t_[uli],-17.57,16.11,summer +THU_U2,t_[uli],-38.17,10.34,fall +Roof_GEUS,p_[ul],990.00,1038.00, +Roof_GEUS,p_i,-10.00,38.00, +Roof_GEUS,wspd_[uli],-12.00,16.81, +Roof_GEUS,t_[uli],,,spring +Roof_GEUS,t_[uli],2.58,38.21,summer +SDL,p_[ul],714.80,820.00, +SDL,p_i,-285.20,-180.00, +SDL,wspd_[uli],-12.00,30.05, +SDL,t_[uli],-54.80,-0.10,winter +SDL,t_[uli],-26.50,10.80,summer +SDL,t_[uli],-49.90,9.80,fall +THU_L,p_[ul],898.11,980.80, +THU_L,p_i,-101.89,-19.20, +THU_L,wspd_[uli],-12.00,34.21, +THU_L,t_[uli],-45.49,4.93,winter +THU_L,t_[uli],-42.79,13.77,spring +THU_L,t_[uli],-15.51,18.35,summer +THU_L,t_[uli],-34.05,11.15,fall diff --git a/src/pypromice/test/test_percentile.py b/src/pypromice/test/test_percentile.py new file mode 100644 index 00000000..1f95cc11 --- /dev/null +++ b/src/pypromice/test/test_percentile.py @@ -0,0 +1,186 @@ +import unittest +from datetime import datetime +from typing import List + +import numpy as np +import pandas as pd + +from pypromice.qc.percentiles.outlier_detector import detect_outliers, filter_data + + +class PercentileQCTestCase(unittest.TestCase): + def test_column_pattern_matches(self): + self._test_column_pattern("p_i", True) + + def test_column_pattern_no_match(self): + self._test_column_pattern("p_l", False) + + def test_column_pattern_with_prefix(self): + self._test_column_pattern("prefix_p_i", False) + + def test_column_pattern_with_suffix(self): + self._test_column_pattern("p_i_suffix", False) + + def _test_column_pattern(self, column_name: str, expected_output: bool): + season_indices = pd.DatetimeIndex( + [ + datetime(2022, 3, 1), + ] + ) + thresholds = pd.DataFrame( + [ + dict( + stid="stid", variable_pattern="p_[iu]", lo=-100, hi=100, season=None + ), + ] + ) + value_outside_range = -325 + input_data = pd.DataFrame( + index=season_indices, columns=[column_name], data=[value_outside_range] + ) + if expected_output: + expected_mask = pd.DataFrame( + index=season_indices, columns=[column_name], data=[expected_output] + ) + else: + expected_mask = pd.DataFrame(index=season_indices, columns=[], data=[]) + + mask = detect_outliers(input_data, thresholds) + + pd.testing.assert_frame_equal(expected_mask, mask) + + def test_column_pattern_multicolumns(self): + thresholds = pd.DataFrame( + [ + dict( + stid="stid", variable_pattern="p_[iu]", lo=-100, hi=100, season=None + ), + ] + ) + date_index = pd.DatetimeIndex([datetime(2022, 3, 1)]) + input_data = pd.DataFrame( + index=date_index, + data=[ + dict( + p_i=-10, + p_u=1000, + p_j=1000, + ) + ], + ) + # p_j is not in the mask because it doesn't match the pattern + expected_mask = pd.DataFrame( + index=date_index, + data=[ + dict( + p_i=False, + p_u=True, + ) + ], + ) + + mask = detect_outliers(input_data, thresholds) + + pd.testing.assert_frame_equal(expected_mask, mask) + + def test_no_season(self): + season_indices = pd.DatetimeIndex( + [ + datetime(2022, 3, 1), + datetime(2022, 8, 1), + ] + ) + thresholds = pd.DataFrame( + [ + dict(stid="stid", variable_pattern="p_i", lo=-100, hi=100, season=None), + ] + ) + input_data = pd.DataFrame(index=season_indices, columns=["p_i"], data=[0, -243]) + expected_mask = pd.DataFrame( + index=season_indices, columns=["p_i"], data=[False, True] + ) + + mask = detect_outliers(input_data, thresholds) + + pd.testing.assert_frame_equal(expected_mask, mask) + + def test_season_filter_invalid_winter_and_spring(self): + self._test_season_filter( + input_values=[0, 0, 0, 0], expected_mask=[True, True, False, False] + ) + + def test_season_filter_invalid_summer(self): + self._test_season_filter( + input_values=[-10, -10, -10, -10], expected_mask=[False, False, True, False] + ) + + def test_season_filter_valid_season_values(self): + self._test_season_filter( + input_values=[-12, -8, -1, -3], expected_mask=[False, False, False, False] + ) + + def _test_season_filter(self, input_values: List[float], expected_mask: List[bool]): + stid = "A_STID" + thresholds = pd.DataFrame( + [ + dict( + stid=stid, variable_pattern="t_i", lo=-20, hi=-10, season="winter" + ), + dict(stid=stid, variable_pattern="t_i", lo=-10, hi=-1, season="spring"), + dict(stid=stid, variable_pattern="t_i", lo=-5, hi=5, season="summer"), + dict(stid=stid, variable_pattern="t_i", lo=-10, hi=0, season="fall"), + ] + ) + season_indices = pd.DatetimeIndex( + [ + datetime(2021, 12, 1), # winter + datetime(2022, 3, 1), # spring + datetime(2022, 6, 1), # summer + datetime(2022, 9, 1), # fall + ] + ) + input_data = pd.DataFrame( + index=season_indices, columns=["t_i"], data=input_values + ) + expected_mask = pd.DataFrame( + index=season_indices, columns=["t_i"], data=expected_mask + ) + + mask = detect_outliers(input_data, thresholds) + + pd.testing.assert_frame_equal(expected_mask, mask) + + def test_remove_outliers(self): + thresholds = pd.DataFrame( + columns=[ + "stid", + "variable_pattern", + "lo", + "hi", + "season", + ], + data=[ + ["stid", "t_[iu]", -40, 0, "winter"], + ["stid", "t_[iu]", -4, 10, "summer"], + ], + ) + date_index = pd.DatetimeIndex( + [ + datetime(2022, 1, 1), + datetime(2022, 8, 1), + ] + ) + input_data = pd.DataFrame( + index=date_index, + data=[ + dict(t_i=-10, p_u=994), + dict(t_i=37, p_u=1024), + ], + ) + mask = detect_outliers(input_data, thresholds) + expected_output_data = input_data.copy() + expected_output_data[mask] = np.nan + + output_data = filter_data(input_data, thresholds) + self.assertIsNot(output_data, input_data) + pd.testing.assert_frame_equal(output_data, expected_output_data)