From 475805fb4093c3f2dd5ed7540c332a326e1f8151 Mon Sep 17 00:00:00 2001
From: Mads Christian Lund <maclu@geus.dk>
Date: Thu, 2 Nov 2023 16:09:25 +0100
Subject: [PATCH] Implemented outlier detector class for handling configuration
 state

* Implemented core functionality for season based threshold filter
* Implemented script to compute thresholds using historical L1 (inferred) data
* Added default thresholds
* Updated setup.py to allow thresholds data file
---
 setup.py                                      |   1 +
 src/pypromice/process/L1toL2.py               |   6 +-
 src/pypromice/qc/percentiles/__init__.py      |   0
 .../qc/percentiles/compute_thresholds.py      | 221 +++++++++++++
 .../qc/percentiles/outlier_detector.py        | 114 +++++++
 src/pypromice/qc/percentiles/thresholds.csv   | 312 ++++++++++++++++++
 src/pypromice/test/test_percentile.py         | 186 +++++++++++
 7 files changed, 839 insertions(+), 1 deletion(-)
 create mode 100644 src/pypromice/qc/percentiles/__init__.py
 create mode 100644 src/pypromice/qc/percentiles/compute_thresholds.py
 create mode 100644 src/pypromice/qc/percentiles/outlier_detector.py
 create mode 100644 src/pypromice/qc/percentiles/thresholds.csv
 create mode 100644 src/pypromice/test/test_percentile.py

diff --git a/setup.py b/setup.py
index 6a7a5111..92b18e3c 100644
--- a/setup.py
+++ b/setup.py
@@ -30,6 +30,7 @@
     include_package_data = True,
     packages=setuptools.find_packages(where="src"),
     python_requires=">=3.8",
+    package_data={"pypromice.qc.percentiles": ["thresholds.csv"]},
     install_requires=['numpy>=1.23.0', 'pandas>=1.5.0', 'xarray>=2022.6.0', 'toml', 'scipy>=1.9.0', 'scikit-learn>=1.1.0', 'Bottleneck', 'netcdf4', 'pyDataverse'],
     entry_points={
     'console_scripts': [
diff --git a/src/pypromice/process/L1toL2.py b/src/pypromice/process/L1toL2.py
index 406b2bdb..a1839715 100644
--- a/src/pypromice/process/L1toL2.py
+++ b/src/pypromice/process/L1toL2.py
@@ -9,6 +9,7 @@
 import xarray as xr
 
 from pypromice.qc.github_data_issues import flagNAN, adjustTime, adjustData
+from pypromice.qc.percentiles.outlier_detector import ThresholdBasedOutlierDetector
 from pypromice.qc.persistence import persistence_qc
 from pypromice.process.value_clipping import clip_values
 
@@ -66,7 +67,10 @@ def toL2(
         logger.exception('Flagging and fixing failed:')
 
     if ds.attrs['format'] == 'TX':
-        ds = persistence_qc(ds)                                               # Detect and filter data points that seems to be static
+        ds = persistence_qc(ds)                                               # Flag and remove persistence outliers
+        # TODO: The configuration should be provided explicitly
+        outlier_detector = ThresholdBasedOutlierDetector.default()
+        ds = outlier_detector.filter_data(ds)                                 # Flag and remove percentile outliers
 
     T_100 = _getTempK(T_0)
     ds['rh_u_cor'] = correctHumidity(ds['rh_u'], ds['t_u'],
diff --git a/src/pypromice/qc/percentiles/__init__.py b/src/pypromice/qc/percentiles/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/pypromice/qc/percentiles/compute_thresholds.py b/src/pypromice/qc/percentiles/compute_thresholds.py
new file mode 100644
index 00000000..f7899472
--- /dev/null
+++ b/src/pypromice/qc/percentiles/compute_thresholds.py
@@ -0,0 +1,221 @@
+import sys
+from datetime import datetime
+
+import pandas as pd
+
+from pypromice.process import AWS
+from pathlib import Path
+import logging
+from pypromice.qc.github_data_issues import adjustTime, flagNAN, adjustData
+
+
+# %%
+logger = logging.getLogger("ComputeThreshold")
+
+
+# %%
+def compute_all_thresholds(
+    station_thresholds_root: Path,
+    thresholds_output_path: Path,
+    aws_l0_repo_path: Path,
+    start_time: datetime,
+    end_time: datetime,
+):
+    logger.info("Computing all thresholds for stations available in the L0 repository")
+    logger.info(f"station_thresholds_root: {station_thresholds_root}")
+    logger.info(f"thresholds_output_path:  {thresholds_output_path}")
+    logger.info(f"aws_l0_repo_path:        {aws_l0_repo_path}")
+    logger.info(f"start_time:              {start_time}")
+    logger.info(f"end_time:                {end_time}")
+
+    station_thresholds_root.mkdir(parents=True, exist_ok=True)
+
+    # %%
+    output_paths = []
+    for config_path in aws_l0_repo_path.glob("raw/config/*.toml"):
+        stid = config_path.stem
+
+        logger.info(f"Processing {stid}")
+        data_path = aws_l0_repo_path.joinpath("raw", stid)
+        output_path = station_thresholds_root.joinpath(f"{stid}.csv")
+        try:
+            if not output_path.exists():
+                threshold = find_thresholds(
+                    stid,
+                    config_path,
+                    data_path,
+                    start_time,
+                    end_time,
+                )
+                threshold.to_csv(
+                    path_or_buf=output_path, index=False, float_format="{:.2f}".format
+                )
+            output_paths.append(output_path)
+        except Exception:
+            logger.exception(f"Failed processing {stid}")
+            continue
+
+    logger.info("Merge threshold files")
+    pd.concat(pd.read_csv(p) for p in output_paths).to_csv(
+        thresholds_output_path, index=False, float_format="{:.2f}".format
+    )
+
+
+def find_thresholds(
+    stid: str,
+    config_path: Path,
+    data_path: Path,
+    start_time: datetime,
+    end_time: datetime,
+) -> pd.DataFrame:
+    """
+    Compute variable threshold for a station using historical distribution quantiles.
+
+    Parameters
+    ----------
+    stid
+    config_path
+    data_path
+    start_time
+    end_time
+
+    Returns
+    -------
+    Upper and lower thresholds for a set of variables and seasons
+
+
+    """
+    stid_logger = logger.getChild(stid)
+    # %%
+
+    stid_logger.info("Read AWS data and get L1")
+    aws = AWS(config_file=config_path.as_posix(), inpath=data_path.as_posix())
+    aws.getL1()
+
+    # %%
+    stid_logger.info("Apply QC filters on data")
+    ds = aws.L1A.copy(deep=True)  # Reassign dataset
+    ds = adjustTime(ds)  # Adjust time after a user-defined csv files
+    ds = flagNAN(ds)  # Flag NaNs after a user-defined csv files
+    ds = adjustData(ds)
+
+    # %%
+    stid_logger.info("Determine thresholds")
+    df = (
+        ds[["rh_u", "wspd_u", "p_u", "t_u"]]
+        .to_pandas()
+        .loc[start_time:end_time]
+        .assign(season=lambda df: (df.index.month // 3) % 4)
+    )
+
+    threshold_rows = []
+
+    # Pressure
+    p_lo, p_hi = df["p_u"].quantile([0.005, 0.995]) + [-12, 12]
+    threshold_rows.append(
+        dict(
+            stid=stid,
+            variable_pattern="p_[ul]",
+            lo=p_lo,
+            hi=p_hi,
+        )
+    )
+    threshold_rows.append(
+        dict(
+            stid=stid,
+            variable_pattern="p_i",
+            lo=p_lo - 1000,
+            hi=p_hi - 1000,
+        )
+    )
+
+    # Wind speed
+    lo, hi = df["wspd_u"].quantile([0.005, 0.995]) + [-12, 12]
+    threshold_rows.append(
+        dict(
+            stid=stid,
+            variable_pattern="wspd_[uli]",
+            lo=lo,
+            hi=hi,
+        )
+    )
+
+    # Temperature
+    season_map = ["winter", "spring", "summer", "fall"]
+    for season_index, season_df in df[["t_u", "season"]].groupby(
+        (df.index.month // 3) % 4
+    ):
+        lo, hi = season_df.quantile([0.005, 0.995])["t_u"] + [-9, 9]
+
+        threshold_rows.append(
+            dict(
+                stid=stid,
+                variable_pattern="t_[uli]",
+                season=season_map[season_index],
+                lo=lo,
+                hi=hi,
+            )
+        )
+
+    threshold = pd.DataFrame(threshold_rows)
+    stid_logger.info(threshold)
+    return threshold
+    # %%
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--l0",
+        required=True,
+        type=Path,
+        help="L0 repository root path",
+    )
+    parser.add_argument(
+        "--thresholds_output_path",
+        "-o",
+        default=Path(__file__).parent.joinpath("thresholds.csv"),
+        type=Path,
+        help="Output csv file with thresholds for all stations",
+    )
+    parser.add_argument(
+        "--station_thresholds_root",
+        "--str",
+        default=Path(__file__).parent.joinpath("station_thresholds"),
+        type=Path,
+        help="Directory containing threshold files for the individual stations",
+    )
+    parser.add_argument(
+        "--start_time",
+        default="2000-01-01",
+        help="Start time for data series. Format: %Y-%m-%d",
+    )
+    parser.add_argument(
+        "--end_time",
+        default="2023-10-01",
+        help="End time for data series. Format: %Y-%m-%d",
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        format="%(asctime)s; %(levelname)s; %(name)s; %(message)s",
+        level=logging.INFO,
+        stream=sys.stdout,
+    )
+
+    thresholds_output_path = args.thresholds_output_path
+    station_thresholds_root = args.station_thresholds_root
+
+    start_time = datetime.strptime(args.start_time, "%Y-%m-%d")
+    end_time = datetime.strptime(args.end_time, "%Y-%m-%d")
+    aws_l0_repo_path = args.l0
+
+    compute_all_thresholds(
+        station_thresholds_root=station_thresholds_root,
+        thresholds_output_path=thresholds_output_path,
+        aws_l0_repo_path=aws_l0_repo_path,
+        start_time=start_time,
+        end_time=end_time,
+    )
diff --git a/src/pypromice/qc/percentiles/outlier_detector.py b/src/pypromice/qc/percentiles/outlier_detector.py
new file mode 100644
index 00000000..dada58b6
--- /dev/null
+++ b/src/pypromice/qc/percentiles/outlier_detector.py
@@ -0,0 +1,114 @@
+from pathlib import Path
+
+import attrs
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+
+__all__ = [
+    "ThresholdBasedOutlierDetector"
+]
+
+season_month_map = {
+    "winter": {12, 1, 2},
+    "spring": {3, 4, 5},
+    "summer": {6, 7, 8},
+    "fall": {9, 10, 11},
+}
+
+
+def get_season_index_mask(data_set: pd.DataFrame, season: str) -> np.ndarray[bool]:
+    season_months = season_month_map.get(
+        season, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
+    )
+    return data_set.index.month.isin(season_months)[:, None]
+
+
+def detect_outliers(data_set: pd.DataFrame, thresholds: pd.DataFrame) -> pd.DataFrame:
+    masks = []
+
+    season_index_mask = {
+        season: get_season_index_mask(data_set, season)
+        for season in thresholds["season"].unique()
+    }
+
+    for variable_pattern, pattern_configs in thresholds.groupby("variable_pattern"):
+        df = data_set.filter(regex=f"^{variable_pattern}$")
+        mask = None
+        for _, season_config in pattern_configs.iterrows():
+            threshold_mask = (df < season_config.lo) | (df > season_config.hi)
+            season_mask = threshold_mask & season_index_mask[season_config.season]
+
+            if mask is None:
+                mask = season_mask
+            else:
+                mask |= season_mask
+        masks.append(mask)
+
+    return pd.concat(masks, axis=1)
+
+
+def filter_data(data_set: pd.DataFrame, thresholds: pd.DataFrame) -> pd.DataFrame:
+    mask = detect_outliers(data_set, thresholds)
+    output_data = data_set.copy()
+    output_data[mask] = np.nan
+    return output_data
+
+
+@attrs.define
+class ThresholdBasedOutlierDetector:
+    thresholds: pd.DataFrame = attrs.field()
+
+    def filter_data(self, ds: xr.Dataset) -> xr.Dataset:
+        """
+        Filter samples across all variables by assigning to nan
+        """
+        stid = ds.station_id
+
+        stid_thresholds = self.thresholds.query(f"stid == '{stid}'")
+        if not stid_thresholds.any():
+            return ds
+
+        data_df = ds.to_dataframe()  # Switch to pandas
+        data_df = filter_data(
+            data_set=data_df,
+            thresholds=stid_thresholds,
+        )
+
+        ds_out: xr.Dataset = data_df.to_xarray()
+        ds_out = ds_out.assign_attrs(ds.attrs)  # Dataset attrs
+        for x in ds_out.data_vars:  # variable-specific attrs
+            ds_out[x].attrs = ds[x].attrs
+
+        return ds_out
+
+    @classmethod
+    def from_csv_config(cls, config_file: Path) -> 'ThresholdBasedOutlierDetector':
+        """
+        Instantiate using explicit csv file with explicit thresholds
+
+        The CSV file shall have the format:
+
+        * Comma separated
+        * First row is header
+        * Columns
+            * stid: Station id
+            * variabel_pattern: regular expression filtering the variable name
+            * lo: Low threshold
+            * hi: High threshold
+            * season: The season of the filter: [, winter, spring, summer, fall]. The empty string means all seasons
+
+        """
+        return cls(thresholds=pd.read_csv(config_file))
+
+    @classmethod
+    def default(cls) -> 'ThresholdBasedOutlierDetector':
+        """
+        Instantiate using aws thresholds stored in the python package.
+        Returns
+        -------
+
+        """
+        default_thresholds_path = Path(__file__).parent.joinpath('thresholds.csv')
+        return cls.from_csv_config(default_thresholds_path)
diff --git a/src/pypromice/qc/percentiles/thresholds.csv b/src/pypromice/qc/percentiles/thresholds.csv
new file mode 100644
index 00000000..9bf50727
--- /dev/null
+++ b/src/pypromice/qc/percentiles/thresholds.csv
@@ -0,0 +1,312 @@
+stid,variable_pattern,lo,hi,season
+NUK_K,p_[ul],877.96,958.79,
+NUK_K,p_i,-122.04,-41.21,
+NUK_K,wspd_[uli],-12.00,24.74,
+NUK_K,t_[uli],-34.22,11.27,winter
+NUK_K,t_[uli],-33.10,16.28,spring
+NUK_K,t_[uli],-12.57,21.40,summer
+NUK_K,t_[uli],-27.04,15.72,fall
+SWC_O,p_[ul],831.20,909.80,
+SWC_O,p_i,-168.80,-90.20,
+SWC_O,wspd_[uli],-11.68,30.80,
+SWC_O,t_[uli],-49.84,6.70,winter
+SWC_O,t_[uli],-36.90,9.20,spring
+SWC_O,t_[uli],-20.90,13.40,summer
+SWC_O,t_[uli],-36.82,12.00,fall
+KPC_L,p_[ul],924.86,1006.19,
+KPC_L,p_i,-75.14,6.19,
+KPC_L,wspd_[uli],-11.89,29.23,
+KPC_L,t_[uli],-45.45,2.16,winter
+KPC_L,t_[uli],-44.70,9.46,spring
+KPC_L,t_[uli],-16.81,16.08,summer
+KPC_L,t_[uli],-39.66,9.46,fall
+EGP,p_[ul],675.50,752.80,
+EGP,p_i,-324.50,-247.20,
+EGP,wspd_[uli],-12.00,24.10,
+EGP,t_[uli],-69.45,-10.18,winter
+EGP,t_[uli],-66.58,1.41,spring
+EGP,t_[uli],-37.66,8.45,summer
+EGP,t_[uli],-62.60,4.00,fall
+CEN2,p_[ul],762.74,826.00,
+CEN2,p_i,-237.26,-174.00,
+CEN2,wspd_[uli],-12.00,26.81,
+CEN2,t_[uli],-36.00,-6.10,winter
+CEN2,t_[uli],-32.30,2.20,spring
+CEN2,t_[uli],-30.50,7.60,summer
+CEN2,t_[uli],-53.70,-1.60,fall
+MIT,p_[ul],900.19,993.97,
+MIT,p_i,-99.81,-6.03,
+MIT,wspd_[uli],-12.00,28.26,
+MIT,t_[uli],-29.36,10.81,winter
+MIT,t_[uli],-31.05,14.54,spring
+MIT,t_[uli],-11.51,19.61,summer
+MIT,t_[uli],-23.66,14.59,fall
+TUN,p_[ul],744.83,807.00,
+TUN,p_i,-255.17,-193.00,
+TUN,wspd_[uli],-11.85,24.81,
+TUN,t_[uli],-51.90,-3.65,winter
+TUN,t_[uli],-55.90,1.40,spring
+TUN,t_[uli],-31.70,8.90,summer
+TUN,t_[uli],-56.40,8.10,fall
+KPC_U,p_[ul],864.79,941.47,
+KPC_U,p_i,-135.21,-58.53,
+KPC_U,wspd_[uli],-12.00,27.73,
+KPC_U,t_[uli],-51.46,-0.09,winter
+KPC_U,t_[uli],-50.44,6.79,spring
+KPC_U,t_[uli],-21.11,12.77,summer
+KPC_U,t_[uli],-45.74,6.19,fall
+UPE_L,p_[ul],940.00,1023.48,
+UPE_L,p_i,-60.00,23.48,
+UPE_L,wspd_[uli],-12.00,29.37,
+UPE_L,t_[uli],-40.29,11.55,winter
+UPE_L,t_[uli],-40.90,15.97,spring
+UPE_L,t_[uli],-13.15,19.28,summer
+UPE_L,t_[uli],-29.23,15.26,fall
+KAN_M,p_[ul],816.70,896.46,
+KAN_M,p_i,-183.30,-103.54,
+KAN_M,wspd_[uli],-12.00,31.99,
+KAN_M,t_[uli],-50.27,6.21,winter
+KAN_M,t_[uli],-48.60,11.85,spring
+KAN_M,t_[uli],-21.57,12.59,summer
+KAN_M,t_[uli],-42.77,11.79,fall
+QAS_A,p_[ul],838.61,919.19,
+QAS_A,p_i,-161.39,-80.81,
+QAS_A,wspd_[uli],-12.00,28.55,
+QAS_A,t_[uli],-36.80,3.98,winter
+QAS_A,t_[uli],-36.71,17.11,spring
+QAS_A,t_[uli],-15.24,18.32,summer
+QAS_A,t_[uli],-31.06,13.03,fall
+NSE,p_[ul],694.70,784.80,
+NSE,p_i,-305.30,-215.20,
+NSE,wspd_[uli],-12.00,29.18,
+NSE,t_[uli],-56.60,-2.40,winter
+NSE,t_[uli],-53.54,-20.79,spring
+NSE,t_[uli],-29.20,11.00,summer
+NSE,t_[uli],-48.70,6.40,fall
+UPE_U,p_[ul],852.55,932.91,
+UPE_U,p_i,-147.45,-67.09,
+UPE_U,wspd_[uli],-11.93,29.59,
+UPE_U,t_[uli],-45.20,6.30,winter
+UPE_U,t_[uli],-45.68,13.19,spring
+UPE_U,t_[uli],-18.84,14.93,summer
+UPE_U,t_[uli],-36.32,11.26,fall
+QAS_M,p_[ul],884.52,970.17,
+QAS_M,p_i,-115.48,-29.83,
+QAS_M,wspd_[uli],-12.00,29.35,
+QAS_M,t_[uli],-34.30,11.12,winter
+QAS_M,t_[uli],-29.77,16.09,spring
+QAS_M,t_[uli],-11.57,16.11,summer
+QAS_M,t_[uli],-25.79,14.85,fall
+QAS_L,p_[ul],924.21,1013.41,
+QAS_L,p_i,-75.79,13.41,
+QAS_L,wspd_[uli],-12.00,30.08,
+QAS_L,t_[uli],-30.44,15.21,winter
+QAS_L,t_[uli],-29.98,17.38,spring
+QAS_L,t_[uli],-9.46,18.98,summer
+QAS_L,t_[uli],-23.72,17.12,fall
+NUK_L,p_[ul],897.14,980.03,
+NUK_L,p_i,-102.86,-19.97,
+NUK_L,wspd_[uli],-12.00,27.23,
+NUK_L,t_[uli],-35.82,14.42,winter
+NUK_L,t_[uli],-35.35,18.04,spring
+NUK_L,t_[uli],-10.63,20.76,summer
+NUK_L,t_[uli],-29.31,17.66,fall
+NAU,p_[ul],715.60,783.20,
+NAU,p_i,-284.40,-216.80,
+NAU,wspd_[uli],-11.53,28.77,
+NAU,t_[uli],-58.30,-2.90,winter
+NAU,t_[uli],-49.70,1.50,spring
+NAU,t_[uli],-34.79,7.90,summer
+NAU,t_[uli],-55.70,9.50,fall
+JAR,p_[ul],881.00,929.00,
+JAR,p_i,-119.00,-71.00,
+JAR,wspd_[uli],-11.62,29.82,
+JAR,t_[uli],-14.60,13.30,summer
+FRE,p_[ul],638.31,799.82,
+FRE,p_i,-361.69,-200.18,
+FRE,wspd_[uli],-12.00,22.62,
+FRE,t_[uli],-38.20,10.02,winter
+FRE,t_[uli],-36.55,18.07,spring
+FRE,t_[uli],-12.61,22.82,summer
+FRE,t_[uli],-33.93,17.69,fall
+CP1,p_[ul],747.90,826.00,
+CP1,p_i,-252.10,-174.00,
+CP1,wspd_[uli],-12.00,31.70,
+CP1,t_[uli],-58.30,1.10,winter
+CP1,t_[uli],-46.40,6.70,spring
+CP1,t_[uli],-26.70,10.80,summer
+CP1,t_[uli],-49.30,10.40,fall
+KAN_U,p_[ul],755.36,834.53,
+KAN_U,p_i,-244.64,-165.47,
+KAN_U,wspd_[uli],-12.00,33.94,
+KAN_U,t_[uli],-53.53,2.20,winter
+KAN_U,t_[uli],-52.57,10.38,spring
+KAN_U,t_[uli],-26.67,11.61,summer
+KAN_U,t_[uli],-47.97,9.90,fall
+KAN_L,p_[ul],884.10,967.42,
+KAN_L,p_i,-115.90,-32.58,
+KAN_L,wspd_[uli],-12.00,28.45,
+KAN_L,t_[uli],-41.57,11.24,winter
+KAN_L,t_[uli],-40.69,15.09,spring
+KAN_L,t_[uli],-13.58,16.95,summer
+KAN_L,t_[uli],-33.55,15.56,fall
+QAS_Uv3,p_[ul],859.94,946.06,
+QAS_Uv3,p_i,-140.06,-53.94,
+QAS_Uv3,wspd_[uli],-12.00,29.09,
+QAS_Uv3,t_[uli],-36.70,7.70,winter
+QAS_Uv3,t_[uli],-28.80,13.10,spring
+QAS_Uv3,t_[uli],-12.29,15.20,summer
+QAS_Uv3,t_[uli],-25.20,14.90,fall
+ZAK_L,p_[ul],887.39,966.32,
+ZAK_L,p_i,-112.61,-33.68,
+ZAK_L,wspd_[uli],-11.97,25.32,
+ZAK_L,t_[uli],-37.76,11.67,winter
+ZAK_L,t_[uli],-35.50,16.92,spring
+ZAK_L,t_[uli],-13.01,21.22,summer
+ZAK_L,t_[uli],-33.80,14.72,fall
+NEM,p_[ul],657.30,1013.98,
+NEM,p_i,-342.70,13.98,
+NEM,wspd_[uli],-12.00,25.85,
+NEM,t_[uli],-30.58,7.20,summer
+NEM,t_[uli],-58.60,-6.10,fall
+NUK_U,p_[ul],832.02,911.63,
+NUK_U,p_i,-167.98,-88.37,
+NUK_U,wspd_[uli],-12.00,32.44,
+NUK_U,t_[uli],-41.31,9.59,winter
+NUK_U,t_[uli],-41.67,14.44,spring
+NUK_U,t_[uli],-15.56,16.13,summer
+NUK_U,t_[uli],-34.98,13.45,fall
+TAS_A,p_[ul],847.23,942.82,
+TAS_A,p_i,-152.77,-57.18,
+TAS_A,wspd_[uli],-12.00,37.72,
+TAS_A,t_[uli],-36.02,7.45,winter
+TAS_A,t_[uli],-32.13,15.79,spring
+TAS_A,t_[uli],-15.13,19.41,summer
+TAS_A,t_[uli],-28.88,13.28,fall
+HUM,p_[ul],669.63,963.00,
+HUM,p_i,-330.37,-37.00,
+HUM,wspd_[uli],-11.12,32.84,
+HUM,t_[uli],-58.80,-2.63,winter
+HUM,t_[uli],-27.90,9.70,summer
+HUM,t_[uli],-56.20,9.10,fall
+NUK_N,p_[ul],854.24,934.01,
+NUK_N,p_i,-145.76,-65.99,
+NUK_N,wspd_[uli],-12.00,32.87,
+NUK_N,t_[uli],-37.90,7.89,winter
+NUK_N,t_[uli],-35.67,15.68,spring
+NUK_N,t_[uli],-16.90,14.72,summer
+NUK_N,t_[uli],-32.96,12.92,fall
+TAS_U,p_[ul],885.32,979.02,
+TAS_U,p_i,-114.68,-20.98,
+TAS_U,wspd_[uli],-12.00,38.30,
+TAS_U,t_[uli],-31.91,10.66,winter
+TAS_U,t_[uli],-33.16,13.20,spring
+TAS_U,t_[uli],-12.88,16.75,summer
+TAS_U,t_[uli],-25.74,14.06,fall
+KAN_B,p_[ul],920.85,1004.05,
+KAN_B,p_i,-79.15,4.05,
+KAN_B,wspd_[uli],-12.00,24.36,
+KAN_B,t_[uli],-40.72,12.98,winter
+KAN_B,t_[uli],-40.23,19.02,spring
+KAN_B,t_[uli],-10.71,22.39,summer
+KAN_B,t_[uli],-32.22,18.16,fall
+TAS_L,p_[ul],721.70,1021.25,
+TAS_L,p_i,-278.30,21.25,
+TAS_L,wspd_[uli],-12.00,38.73,
+TAS_L,t_[uli],-29.33,12.68,winter
+TAS_L,t_[uli],-30.18,18.26,spring
+TAS_L,t_[uli],-11.96,18.32,summer
+TAS_L,t_[uli],-22.81,15.56,fall
+NAE,p_[ul],708.75,752.65,
+NAE,p_i,-291.25,-247.35,
+NAE,wspd_[uli],-11.33,23.10,
+NAE,t_[uli],-18.36,5.64,spring
+NAE,t_[uli],-21.17,6.71,summer
+SCO_U,p_[ul],848.32,931.61,
+SCO_U,p_i,-151.68,-68.39,
+SCO_U,wspd_[uli],-11.81,22.90,
+SCO_U,t_[uli],-41.38,3.73,winter
+SCO_U,t_[uli],-41.01,13.32,spring
+SCO_U,t_[uli],-15.98,15.61,summer
+SCO_U,t_[uli],-36.27,12.11,fall
+SWC,p_[ul],831.00,1037.70,
+SWC,p_i,-169.00,37.70,
+SWC,wspd_[uli],-12.00,30.26,
+SWC,t_[uli],-43.60,5.80,winter
+SWC,t_[uli],-50.30,11.70,spring
+SWC,t_[uli],-19.20,12.90,summer
+SWC,t_[uli],-40.10,9.60,fall
+NUK_Uv3,p_[ul],830.00,910.00,
+NUK_Uv3,p_i,-170.00,-90.00,
+NUK_Uv3,wspd_[uli],-12.00,29.36,
+NUK_Uv3,t_[uli],-43.00,7.40,winter
+NUK_Uv3,t_[uli],-38.00,12.10,spring
+NUK_Uv3,t_[uli],-16.30,15.10,summer
+NUK_Uv3,t_[uli],-15.80,13.60,fall
+DY2,p_[ul],730.70,810.50,
+DY2,p_i,-269.30,-189.50,
+DY2,wspd_[uli],-11.67,35.46,
+DY2,t_[uli],-54.50,-0.90,winter
+DY2,t_[uli],-47.30,8.60,spring
+DY2,t_[uli],-27.90,10.70,summer
+DY2,t_[uli],-45.80,11.70,fall
+SDM,p_[ul],650.30,734.00,
+SDM,p_i,-349.70,-266.00,
+SDM,wspd_[uli],-12.00,32.30,
+SDM,t_[uli],-53.80,-3.70,winter
+SDM,t_[uli],-47.43,-9.40,spring
+SDM,t_[uli],-30.20,10.50,summer
+SDM,t_[uli],-46.80,9.90,fall
+SCO_L,p_[ul],907.50,993.49,
+SCO_L,p_i,-92.50,-6.51,
+SCO_L,wspd_[uli],-12.00,21.02,
+SCO_L,t_[uli],-39.09,6.94,winter
+SCO_L,t_[uli],-37.55,14.65,spring
+SCO_L,t_[uli],-12.10,17.57,summer
+SCO_L,t_[uli],-32.92,12.85,fall
+CEN1,p_[ul],750.48,827.52,
+CEN1,p_i,-249.52,-172.48,
+CEN1,wspd_[uli],-12.00,27.66,
+CEN1,t_[uli],-58.65,-5.02,winter
+CEN1,t_[uli],-59.82,4.05,spring
+CEN1,t_[uli],-29.83,9.84,summer
+CEN1,t_[uli],-51.78,1.83,fall
+JAR_O,p_[ul],857.10,936.40,
+JAR_O,p_i,-142.90,-63.60,
+JAR_O,wspd_[uli],-12.00,29.85,
+JAR_O,t_[uli],-46.70,7.90,winter
+JAR_O,t_[uli],-35.10,10.00,spring
+JAR_O,t_[uli],-18.90,14.50,summer
+JAR_O,t_[uli],-33.50,12.70,fall
+THU_U,p_[ul],874.94,953.23,
+THU_U,p_i,-125.06,-46.77,
+THU_U,wspd_[uli],-12.00,33.52,
+THU_U,t_[uli],-45.91,1.58,winter
+THU_U,t_[uli],-45.70,10.07,spring
+THU_U,t_[uli],-17.64,17.01,summer
+THU_U,t_[uli],-35.83,8.64,fall
+THU_U2,p_[ul],875.63,956.76,
+THU_U2,p_i,-124.37,-43.24,
+THU_U2,wspd_[uli],-12.00,30.97,
+THU_U2,t_[uli],-50.90,4.42,winter
+THU_U2,t_[uli],-45.73,11.36,spring
+THU_U2,t_[uli],-17.57,16.11,summer
+THU_U2,t_[uli],-38.17,10.34,fall
+Roof_GEUS,p_[ul],990.00,1038.00,
+Roof_GEUS,p_i,-10.00,38.00,
+Roof_GEUS,wspd_[uli],-12.00,16.81,
+Roof_GEUS,t_[uli],,,spring
+Roof_GEUS,t_[uli],2.58,38.21,summer
+SDL,p_[ul],714.80,820.00,
+SDL,p_i,-285.20,-180.00,
+SDL,wspd_[uli],-12.00,30.05,
+SDL,t_[uli],-54.80,-0.10,winter
+SDL,t_[uli],-26.50,10.80,summer
+SDL,t_[uli],-49.90,9.80,fall
+THU_L,p_[ul],898.11,980.80,
+THU_L,p_i,-101.89,-19.20,
+THU_L,wspd_[uli],-12.00,34.21,
+THU_L,t_[uli],-45.49,4.93,winter
+THU_L,t_[uli],-42.79,13.77,spring
+THU_L,t_[uli],-15.51,18.35,summer
+THU_L,t_[uli],-34.05,11.15,fall
diff --git a/src/pypromice/test/test_percentile.py b/src/pypromice/test/test_percentile.py
new file mode 100644
index 00000000..1f95cc11
--- /dev/null
+++ b/src/pypromice/test/test_percentile.py
@@ -0,0 +1,186 @@
+import unittest
+from datetime import datetime
+from typing import List
+
+import numpy as np
+import pandas as pd
+
+from pypromice.qc.percentiles.outlier_detector import detect_outliers, filter_data
+
+
+class PercentileQCTestCase(unittest.TestCase):
+    def test_column_pattern_matches(self):
+        self._test_column_pattern("p_i", True)
+
+    def test_column_pattern_no_match(self):
+        self._test_column_pattern("p_l", False)
+
+    def test_column_pattern_with_prefix(self):
+        self._test_column_pattern("prefix_p_i", False)
+
+    def test_column_pattern_with_suffix(self):
+        self._test_column_pattern("p_i_suffix", False)
+
+    def _test_column_pattern(self, column_name: str, expected_output: bool):
+        season_indices = pd.DatetimeIndex(
+            [
+                datetime(2022, 3, 1),
+            ]
+        )
+        thresholds = pd.DataFrame(
+            [
+                dict(
+                    stid="stid", variable_pattern="p_[iu]", lo=-100, hi=100, season=None
+                ),
+            ]
+        )
+        value_outside_range = -325
+        input_data = pd.DataFrame(
+            index=season_indices, columns=[column_name], data=[value_outside_range]
+        )
+        if expected_output:
+            expected_mask = pd.DataFrame(
+                index=season_indices, columns=[column_name], data=[expected_output]
+            )
+        else:
+            expected_mask = pd.DataFrame(index=season_indices, columns=[], data=[])
+
+        mask = detect_outliers(input_data, thresholds)
+
+        pd.testing.assert_frame_equal(expected_mask, mask)
+
+    def test_column_pattern_multicolumns(self):
+        thresholds = pd.DataFrame(
+            [
+                dict(
+                    stid="stid", variable_pattern="p_[iu]", lo=-100, hi=100, season=None
+                ),
+            ]
+        )
+        date_index = pd.DatetimeIndex([datetime(2022, 3, 1)])
+        input_data = pd.DataFrame(
+            index=date_index,
+            data=[
+                dict(
+                    p_i=-10,
+                    p_u=1000,
+                    p_j=1000,
+                )
+            ],
+        )
+        # p_j is not in the mask because it doesn't match the pattern
+        expected_mask = pd.DataFrame(
+            index=date_index,
+            data=[
+                dict(
+                    p_i=False,
+                    p_u=True,
+                )
+            ],
+        )
+
+        mask = detect_outliers(input_data, thresholds)
+
+        pd.testing.assert_frame_equal(expected_mask, mask)
+
+    def test_no_season(self):
+        season_indices = pd.DatetimeIndex(
+            [
+                datetime(2022, 3, 1),
+                datetime(2022, 8, 1),
+            ]
+        )
+        thresholds = pd.DataFrame(
+            [
+                dict(stid="stid", variable_pattern="p_i", lo=-100, hi=100, season=None),
+            ]
+        )
+        input_data = pd.DataFrame(index=season_indices, columns=["p_i"], data=[0, -243])
+        expected_mask = pd.DataFrame(
+            index=season_indices, columns=["p_i"], data=[False, True]
+        )
+
+        mask = detect_outliers(input_data, thresholds)
+
+        pd.testing.assert_frame_equal(expected_mask, mask)
+
+    def test_season_filter_invalid_winter_and_spring(self):
+        self._test_season_filter(
+            input_values=[0, 0, 0, 0], expected_mask=[True, True, False, False]
+        )
+
+    def test_season_filter_invalid_summer(self):
+        self._test_season_filter(
+            input_values=[-10, -10, -10, -10], expected_mask=[False, False, True, False]
+        )
+
+    def test_season_filter_valid_season_values(self):
+        self._test_season_filter(
+            input_values=[-12, -8, -1, -3], expected_mask=[False, False, False, False]
+        )
+
+    def _test_season_filter(self, input_values: List[float], expected_mask: List[bool]):
+        stid = "A_STID"
+        thresholds = pd.DataFrame(
+            [
+                dict(
+                    stid=stid, variable_pattern="t_i", lo=-20, hi=-10, season="winter"
+                ),
+                dict(stid=stid, variable_pattern="t_i", lo=-10, hi=-1, season="spring"),
+                dict(stid=stid, variable_pattern="t_i", lo=-5, hi=5, season="summer"),
+                dict(stid=stid, variable_pattern="t_i", lo=-10, hi=0, season="fall"),
+            ]
+        )
+        season_indices = pd.DatetimeIndex(
+            [
+                datetime(2021, 12, 1),  # winter
+                datetime(2022, 3, 1),  # spring
+                datetime(2022, 6, 1),  # summer
+                datetime(2022, 9, 1),  # fall
+            ]
+        )
+        input_data = pd.DataFrame(
+            index=season_indices, columns=["t_i"], data=input_values
+        )
+        expected_mask = pd.DataFrame(
+            index=season_indices, columns=["t_i"], data=expected_mask
+        )
+
+        mask = detect_outliers(input_data, thresholds)
+
+        pd.testing.assert_frame_equal(expected_mask, mask)
+
+    def test_remove_outliers(self):
+        thresholds = pd.DataFrame(
+            columns=[
+                "stid",
+                "variable_pattern",
+                "lo",
+                "hi",
+                "season",
+            ],
+            data=[
+                ["stid", "t_[iu]", -40, 0, "winter"],
+                ["stid", "t_[iu]", -4, 10, "summer"],
+            ],
+        )
+        date_index = pd.DatetimeIndex(
+            [
+                datetime(2022, 1, 1),
+                datetime(2022, 8, 1),
+            ]
+        )
+        input_data = pd.DataFrame(
+            index=date_index,
+            data=[
+                dict(t_i=-10, p_u=994),
+                dict(t_i=37, p_u=1024),
+            ],
+        )
+        mask = detect_outliers(input_data, thresholds)
+        expected_output_data = input_data.copy()
+        expected_output_data[mask] = np.nan
+
+        output_data = filter_data(input_data, thresholds)
+        self.assertIsNot(output_data, input_data)
+        pd.testing.assert_frame_equal(output_data, expected_output_data)