From a0a2dc861ebf43420510f1cb83bfb5167bd68a86 Mon Sep 17 00:00:00 2001
From: "David H. Hagan" <david.hagan@quant-aq.com>
Date: Tue, 23 Apr 2024 21:34:52 -0400
Subject: [PATCH] Adds support for a pollution rose

---
 atmospy/__init__.py |   3 +-
 atmospy/rcmod.py    |   2 +-
 atmospy/rose.py     | 181 +++++++++++
 atmospy/stats.py    | 732 --------------------------------------------
 atmospy/trends.py   |   3 -
 atmospy/utils.py    |  44 +--
 pyproject.toml      |   2 +-
 tests/test_rose.py  |  17 +
 8 files changed, 211 insertions(+), 773 deletions(-)
 create mode 100644 atmospy/rose.py
 delete mode 100644 atmospy/stats.py
 create mode 100644 tests/test_rose.py

diff --git a/atmospy/__init__.py b/atmospy/__init__.py
index a028046..c8b28ac 100644
--- a/atmospy/__init__.py
+++ b/atmospy/__init__.py
@@ -7,10 +7,11 @@
 # import os
 
 from .utils import *
+from .rcmod import *
 from .calendar import *
 from .relational import *
 from .trends import *
-from .rcmod import *
+from .rose import *
 
 # Capture the original matplotlib rcParams
 import matplotlib as mpl
diff --git a/atmospy/rcmod.py b/atmospy/rcmod.py
index 237b939..aaab282 100644
--- a/atmospy/rcmod.py
+++ b/atmospy/rcmod.py
@@ -2,7 +2,7 @@
 
 __all__ = ["set_theme"]
 
-def set_theme(context="notebook", style='white', palette='colorblind', 
+def set_theme(context="notebook", style='ticks', palette='colorblind', 
               font='sans-serif', font_scale=1, color_codes=True, rc=None):
     """_summary_
     
diff --git a/atmospy/rose.py b/atmospy/rose.py
new file mode 100644
index 0000000..9f6c155
--- /dev/null
+++ b/atmospy/rose.py
@@ -0,0 +1,181 @@
+"""This file contains the wind and pollution rose figures."""
+import seaborn as sns
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import math
+from .utils import (
+    check_for_numeric_cols,
+)
+
+__all__ = ["pollutionroseplot"]
+
+def pollutionroseplot(data=None, *, ws=None, wd=None, pollutant=None, 
+                      faceted=False, segments=12, bins=[0, 10, 100, 1000], suffix="a.u.",
+                      calm=0., lw=1, legend=True, palette="flare",
+                      title=None, **kwargs):
+    """_summary_
+
+
+    The basis of this function can orginally found here: https://gist.github.com/phobson/41b41bdd157a2bcf6e14
+    
+    Parameters
+    ----------
+    data : _type_, optional
+        _description_, by default None
+    ws : _type_, optional
+        _description_, by default None
+    wd : _type_, optional
+        _description_, by default None
+    pollutant : _type_, optional
+        _description_, by default None
+    faceted : bool, optional
+        _description_, by default False
+    segments : int, optional
+        _description_, by default 12
+    bins : list, optional
+        _description_, by default [0, 10, 100, 1000]
+    suffix : str, optional
+        _description_, by default "a.u."
+    calm : _type_, optional
+        _description_, by default 0.
+    lw : int, optional
+        _description_, by default 1
+    legend : bool, optional
+        _description_, by default True
+    palette : str, optional
+        _description_, by default "flare"
+    title : _type_, optional
+        _description_, by default None
+    """
+    check_for_numeric_cols(data, [ws, wd, pollutant])
+
+    # if the bins don't end in inf, add it
+    if not np.isinf(bins[-1]):
+        bins.append(np.inf)
+        
+    # 
+    bins = np.asarray(bins)
+    
+    # setup the color palette
+    cp = sns.color_palette(palette, n_colors=bins.shape[0])
+    
+    # convert the number of segments into the wind bins
+    wd_segments = np.linspace(0, 360, segments+1)
+    
+    def _cat_pollutant_labels(bins, suffix):
+        """_summary_
+
+        Parameters
+        ----------
+        bins : _type_
+            _description_
+        suffix : _type_
+            _description_
+        """
+        list_of_labels = list()
+        for lowerbound, upperbound in zip(bins[:-1], bins[1:]):
+            if np.isinf(upperbound):
+                list_of_labels.append(f">{lowerbound:.0f} {suffix}")
+            else:
+                list_of_labels.append(f"{lowerbound:.0f} to {upperbound:.0f} {suffix}")
+                
+        return list_of_labels
+    
+    def _compute_bar_dims(thetas):
+        thetas = (thetas[:-1] + thetas[1:])/2.
+        
+        midpoints = [math.radians(theta) for theta in (thetas*np.pi/180. - np.pi/thetas.shape[0])]
+        width = math.radians(360./thetas.shape[0])
+        
+        return midpoints, width
+    
+    # compute the percentage of calm datapoints
+    # where calm is anything with a windspeed < `calm`
+    pct_calm = 100*data[data[ws] <= calm].shape[0] / data.shape[0]
+    
+    # group the data by bin and normalize
+    rv = (
+        data[data[ws] > calm]
+        .assign(
+            _cp=lambda x: pd.cut(
+                data[pollutant], 
+                bins=bins, 
+                right=True,
+                labels=_cat_pollutant_labels(bins, suffix)
+            )
+        )
+        .assign(
+            _wb=lambda x: pd.cut(
+                data[wd],
+                bins=wd_segments,
+                right=True,
+                labels=(wd_segments[:-1]+wd_segments[1:])/2.
+            )
+        )
+        .groupby(["_cp", "_wb"])
+        .size()
+        .unstack(level="_cp")
+        .fillna(0.)
+        .sort_index(axis=1)
+        .applymap(lambda x: 100 * data.shape[0])
+    )
+    
+    # compute the bar dims
+    bar_midpoints, bar_width = _compute_bar_dims(wd_segments)
+    
+    # if plotting onto a FacetGrid, get the current axis, otherwise create one
+    if faceted:
+        ax = plt.gca()
+    else:
+        fig = plt.gcf()
+        ax = fig.add_subplot(111, projection='polar')
+    
+    ax.set_theta_direction("clockwise")
+    ax.set_theta_zero_location("N")
+    
+    # determine the buffer at the center of the plot
+    # this is where ws <= `calm` and is evenly spread 
+    # across all angles
+    buffer = pct_calm / segments
+    
+    for i, (innerbar, outerbar) in enumerate(zip(rv.columns[:-1], rv.columns[1:])):
+        if i == 0:
+            # for the first bar, we need to plot the calm hole in the center first
+            ax.bar(
+                bar_midpoints,
+                rv[innerbar].values,
+                width=bar_width,
+                bottom=buffer,
+                label=innerbar,
+                lw=lw,
+                color=cp[i]
+            )
+            
+        ax.bar(
+            bar_midpoints,
+            rv[outerbar].values,
+            width=bar_width,
+            label=outerbar,
+            bottom=buffer + rv.cumsum(axis=1)[innerbar].values,
+            lw=lw,
+            color=cp[i+1]
+        )
+    
+    if legend:
+        ax.legend(
+            loc="center left",
+            handlelength=1, 
+            handleheight=1,
+            bbox_to_anchor=(1.1, 0, 0.5, 1)
+        )
+    
+    if title:
+        ax.set_title(title)
+    
+    # clean up the ticks and things
+    ax.set_xticks([math.radians(x) for x in [0, 45, 90, 135, 180, 225, 270, 315]])
+    ax.set_xticklabels(["N", "NE", "E", "SE", "S", "SW", "W", "NW"])
+    ax.set_yticklabels([])
+    
+    return ax
\ No newline at end of file
diff --git a/atmospy/stats.py b/atmospy/stats.py
deleted file mode 100644
index d5cb7a1..0000000
--- a/atmospy/stats.py
+++ /dev/null
@@ -1,732 +0,0 @@
-"""This file will be removed.
-"""
-
-# import numpy as np
-# import pandas as pd
-
-# ##### LOOKUP TABLES #####
-# _stats_ = {
-#     'mae': "Mean Absolute Error",
-#     'mape': "Mean Absolute Percentage Error",
-#     'mbe': "Mean Bias Error",
-#     'rmse': "Root Mean Squared Error",
-#     'mdae': "Median Absolute Error",
-#     'r2_score': "R2 Score"
-# }
-
-
-# _pollutants_ = {
-#     'co': "Carbon Monoxide (CO)",
-#     'pb': "Lead (Pb)",
-#     'no2': r"Nitrogen Dioxide ($NO_2$)",
-#     'o3': r"Ozone ($O_3$)",
-#     'so2': r"Sulphur Dioxide ($SO_2$)",
-#     'pm25': r"Particulate Matter smaller than 2.5 $\mu$m",
-#     'pm10': r"Particulate Matter smaller than 10 $\mu$m",
-# }
-
-# _units_ = {
-#     'co': "ppm",
-#     'pb': r"$\frac{\mu g}{m^3}$",
-#     'no2': "ppb",
-#     'o3': "ppm",
-#     'so2': "ppb",
-#     'pm25': r"$\frac{\mu g}{m^3}$",
-#     'pm10': r"$\frac{\mu g}{m^3}$",
-# }
-
-# ###################################
-# ###### PREDICTIVE REGRESSION ######
-# ###################################
-
-
-# ##### ERROR METRICS FUNCTIONS #####
-
-# EPSILON = 1e-10
-
-# def percentage_error(y_true, y_pred):
-#     """
-#     Percentage error
-#     Note: result is NOT multiplied by 100
-#     """
-#     return (y_true - y_pred) / (y_true + EPSILON)
-
-# def mean_abs_pct_error(y_true, y_pred):
-#     """
-#     Mean Absolute Percentage Error
-#     Note: result is NOT multiplied by 100
-#     """
-#     return np.mean(np.abs(percentage_error(y_true, y_pred)))
-
-# def mean_abs_error(y_true, y_pred):
-#     """
-#     Mean absolute error
-    
-#     """
-#     return np.mean(np.abs(y_true - y_pred))
-
-# def mean_bias_error(y_true, y_pred):
-#     """
-#     Mean absolute error
-    
-#     """
-#     return np.mean(y_true - y_pred)
-
-# def mean_squared_error(y_true, y_pred):
-#     """ 
-#     Mean squared error
-#     """
-#     return np.mean(np.square(y_true - y_pred))
-
-# def root_mean_squared_error(y_true, y_pred):
-#     """
-#     Root mean squared error
-#     """
-#     return np.sqrt(mean_squared_error(y_true, y_pred))
-
-# def median_absolute_error(y_true, y_pred):
-#     """ 
-#     Median Absolute Error
-#     """
-#     return np.median(np.abs(y_true - y_pred))
-
-# def r2_score(y_true, y_pred):
-#     """
-#     R^2 score -- adapted from sklearn documentation
-#     """
-#     numerator = ((y_true - y_pred) ** 2).sum(axis=0)
-#     denominator = ((y_true - np.average(y_true, axis=0)) ** 2).sum(axis=0) 
-    
-#     nonzero_denominator = denominator != 0
-#     nonzero_numerator = numerator != 0
-    
-#     valid_score = nonzero_denominator & nonzero_numerator
-#     output_scores = np.ones(y_true.shape)
-    
-#     output_scores[valid_score] = 1 - (numerator[valid_score] /
-#                                       denominator[valid_score])
-    
-#     return np.average(output_scores)
-
-# ##### COMPUTING PREDICTIVE REGRESSION STATS #####
-
-# METRICS = {
-#     'mae': ("Mean Absolute Error", mean_abs_error),
-#     'mape': ("Mean Absolute Percentage Error", mean_abs_pct_error),
-#     'mbe': ("Mean Bias Error", mean_bias_error),
-#     'rmse': ("Root Mean Squared Error", root_mean_squared_error),
-#     'mdae': ("Median Absolute Error", median_absolute_error),
-#     'r2_score': ("R-squared", r2_score),
-# }
-
-# def evaluate(y_true, y_pred, metrics=['mae', 'mape', 'mbe', 'rmse', 'mdae', 'r2_score']):
-#     results = {}
-#     for name in metrics:
-#         if name.lower() in METRICS.keys():
-#             label, f = METRICS.get(name.lower())
-#             results[name.lower()] = f(y_true, y_pred) 
-#         # later on, add logging for values not in the dict
-        
-#     return results
-
-
-# def stats(**kwargs):
-#     """
-#     Calculate statistics between two lists of data
-
-#     Accepts either a dataframe and column names, or two arrays
-    
-#     Returns a dictionary of computed statistics
-
-#     kwargs
-#     ------
-#         :param data: dataframe containing atmospheric chemistry data to be compared
-#         :type data: pandas Dataframe
-#         :param x: column name within data, or array-like data input
-#         :type x: str or array-like 
-#         :param y: column name within data, or array-like data input
-#         :type y: str or array-like 
-#         :param metrics: list of metrics to be computed between x and y data
-#                         default options = ['mae', 'mbe', 'rmse', 'mdae', 'r2score']
-#         :type metrics: list 
-        
-#     Returns
-#     -------
-#         z : dictionary of statistics
-        
-    
-#     References
-#     ----------
-    
-#     """
-#     x = kwargs.pop("x")
-#     y = kwargs.pop("y")
-#     data = kwargs.pop("data", None)
-    
-#     if data is not None:
-#         #dataframe and column names
-#         y_true = data[x]
-#         y_pred = data[y]
-#     elif isinstance(x, pd.Series) or isinstance(y, pd.Series):
-#         #Series inputs
-#         y_true = x.values
-#         y_pred = y.values
-#     else:
-#         #array-like inputs
-#         y_true = np.asarray(x)
-#         y_pred = np.asarray(y)
-    
-#     #same length error
-#     if len(y_true) != len(y_pred):
-#         raise ValueError("Inputs must have the same length")
-    
-#     #remove nans -- do we want to do this? and if so, is this the best way?
-#     clean_y_true = y_true[(np.isnan(y_true)==False) & (np.isnan(y_pred)==False)]
-#     clean_y_pred = y_pred[(np.isnan(y_true)==False) & (np.isnan(y_pred)==False)]
-    
-    
-#     return evaluate(clean_y_true, clean_y_pred)
-
-
-
-# def statstable(stats):
-#     """
-#     Print calculated stats in an aesthetic table
-    
-#     Inputs
-#     ------
-#     :param stats: Statistical data calculated using the stats function
-#     :type stats: dictionary
-    
-#     Returns
-#     -------
-#     None --> prints a table of statistics
-#     ::
-#     """
-#     s = ' '
-#     print(13*s+"Error Statistics"+13*s)
-#     print(42*'-')
-    
-#     for key, value in stats.items():
-#         l = len(key)
-#         print(_stats_[key]+s*(30 - len(_stats_[key]))+' = %#8.3f' % value)
-
-
-# ###################################
-# ######### EPA STATISTICS ##########
-# ###################################
-
-
-# ##### SPECIAL EPA PREFERRED STATS #####
-
-# def find_max_n(data, n):
-#     """
-#     Find the n largest values in an array
-#     """
-#     output = data.nlargest(n)
-        
-#     return output
-
-
-# def days_max_gt_std(data, std):
-#     """
-#     Find the number of days where the maximum daily measurement is
-#     above the standard used by the EPA
-    
-#     Inputs
-#     ------
-#         :param data: dataframe or Series with timestamp index and pollution data column.
-#                     should already be resampled to appropriate timebase
-#         :type data: pd.DataFrame or pd.Series
-#         :param std: standard value used by EPA
-#         :type std: float
-    
-#     Returns
-#     -------
-#         count: the number of days where the max value surpassed the standard
-#     """
-    
-#     daily_max = data.groupby(data.index.floor('d')).max()
-    
-#     count = daily_max[daily_max > std].count()
-    
-#     return count
-
-
-# def days_avg_gt_std(data, std):
-#     """
-#     Find the number of days where the maximum hourly measurement is
-#     above the 1-hour standard used by the EPA
-    
-#     Inputs
-#     ------
-#         :param data: dataframe or Series with timestamp index and pollution data column.
-#                     should already be resampled to appropriate timebase
-#         :type data: pd.DataFrame or pd.Series
-#         :param std: standard value used by EPA
-#         :type std: float
-    
-#     Returns
-#     -------
-#         count: the number of days where the average value surpassed the standard
-#     """
-    
-#     daily_avg = data.groupby(data.index.floor('d')).mean()
-    
-#     count = daily_avg[daily_avg > std].count()
-    
-#     return count
-
-
-# def percentile_of_daily_max(data, pct):
-#     """
-#     Find the nth percentile of daily maximum values for time-series data.
-    
-#     Inputs
-#     ------
-#         :param data: dataframe or Series with timestamp index and pollution data column.
-#                     should already be resampled to appropriate timebase
-#         :type data: pd.DataFrame or pd.Series
-#         :param pct: percentile to be calculated
-#         :type std: float
-        
-#     Returns
-#     -------
-#         value: percentile value
-#     """
-    
-#     daily_max = data.groupby(data.index.floor('d')).max()
-    
-#     value = np.nanpercentile(daily_max, pct)
-    
-#     return value
-
-
-# ##### POLLUTANT SPECIFIC REPORTS #####
-
-# def co_stats(data):
-#     """
-#     Compute the EPA monitoring report values for CO. Data should be in ppm.
-    
-#     Inputs
-#     ------
-#         :param data: dataframe or Series with timestamp index and pollution data column.
-#                     should be resampled to 1h timebase
-#         :type data: pd.DataFrame or pd.Series
-        
-#     Returns
-#     -------
-#         stats: dictionary of statistics
-#     """
-#     # 1 hour statistics
-    
-#     #Highest and second-highest daily max 1-hour values
-#     max12_1h = find_max_n(data, 2)
-    
-#     #Number of daily max 1-hour values that exceeded the level of the 1-hour standard
-#     days_gt_std_1h = days_max_gt_std(data, 35)
-    
-#     # 8 hour statistics
-#     mean_8h = data.resample('8h').mean()
-    
-#     #Highest and second-highest daily max 8-hour values
-#     max12_8h = find_max_n(mean_8h, 2)
-    
-#     #Number of daily max 1-hour values that exceeded the level of the 8-hour standard
-#     days_gt_std_8h = days_max_gt_std(mean_8h, 9)
-    
-#     stats = {
-#         "co" : {
-#             "1h" : {
-#                 "mean": np.nanmean(data),
-#                 "min": np.nanmin(data),
-#                 "max_1": max12_1h[0],
-#                 "max_2": max12_1h[1],
-#                 "pct_25": np.nanpercentile(data, 25),
-#                 "pct_75": np.nanpercentile(data, 75),
-#                 "pct_98": np.nanpercentile(data, 98),
-#                 "days_max_gt_std": days_gt_std_1h,
-#             },
-#             "8h" : {
-#                 "mean": np.nanmean(mean_8h),
-#                 "min": np.nanmin(mean_8h),
-#                 "max_1": max12_8h[0],
-#                 "max_2": max12_8h[1],
-#                 "pct_25": np.nanpercentile(mean_8h, 25),
-#                 "pct_75": np.nanpercentile(mean_8h, 75),
-#                 "pct_98": np.nanpercentile(mean_8h, 98),
-#                 "days_max_gt_std": days_gt_std_8h,
-#             }
-#         }
-#     }
-    
-#     return stats
-    
-
-# def no2_stats(data):
-#     """
-#     Compute the EPA monitoring report values for NO2. Data should be in ppb.
-    
-#     Inputs
-#     ------
-#         :param data: dataframe or Series with timestamp index and pollution data column.
-#                     should be resampled to 1h timebase
-#         :type data: pd.DataFrame or pd.Series
-        
-#     Returns
-#     -------
-#         stats: dictionary of statistics
-#     """
-    
-#     # 1 hour statistics
-    
-#     #Highest and second-highest daily max 1-hour values
-#     max12_1h = find_max_n(data, 2)
-    
-#     # 98th percentile of daily max 1hr values
-#     pct_98_daily_max = percentile_of_daily_max(data, 98)
-    
-#     stats = {
-#         "no2": {
-#             "1h": {
-#                 "mean": np.nanmean(data),
-#                 "min": np.nanmin(data),
-#                 "max_1": max12_1h[0],
-#                 "max_2": max12_1h[1],
-#                 "pct_25": np.nanpercentile(data, 25),
-#                 "pct_75": np.nanpercentile(data, 75),
-#                 "pct_98": np.nanpercentile(data, 98),
-#                 "pct_98_daily_max": pct_98_daily_max,
-#             }
-#         }
-#     }
-    
-#     return stats
-
-
-# def pb_stats(data):
-#     """
-#     Compute the EPA monitoring report values for Pb. Data should be in micrograms per
-#     cubic meter.
-    
-#     Inputs
-#     ------
-#         :param data: dataframe or Series with timestamp index and pollution data column.
-#                     should be resampled to 1h timebase
-#         :type data: pd.DataFrame or pd.Series
-        
-#     Returns
-#     -------
-#         stats: dictionary of statistics
-#     """
-#     # 24 hour statistics
-#     mean_24h = data.resample('24h').mean()
-    
-#     #Highest, second-highest, third-highest, and fourth-highest daily max 24-hour values
-#     max1234 = find_max_n(data, 4)
-    
-#     stats = {
-#         "pb": {
-#             "24h": {
-#                 "mean": np.nanmean(mean_24h),
-#                 "min": np.nanmin(mean_24h),
-#                 "max_1": max1234[0],
-#                 "max_2": max1234[1],
-#                 "max_3": max1234[2],
-#                 "max_4": max1234[3],
-#                 "pct_25": np.nanpercentile(mean_24h, 25),
-#                 "pct_75": np.nanpercentile(mean_24h, 75),
-#                 "pct_98": np.nanpercentile(mean_24h, 98),
-#             }
-#         }
-#     }
-    
-#     return stats
-
-
-# def o3_stats(data):
-#     """
-#     Compute the EPA monitoring report values for Pb. Data should be in ppm.
-    
-#     Inputs
-#     ------
-#         :param data: dataframe or Series with timestamp index and pollution data column.
-#                     should be resampled to 1h timebase
-#         :type data: pd.DataFrame or pd.Series
-        
-#     Returns
-#     -------
-#         stats: dictionary of statistics
-#     """
-    
-#     # 1 hour statistics
-    
-#     #Highest, second-highest, third-highest, and fourth-highest daily max 1-hour values
-#     max1234_1h = find_max_n(data, 4)
-    
-#     #Number of daily max 1-hour values that exceeded the level of the 1-hour standard
-#     days_gt_std_1h = days_max_gt_std(data, 0.12)
-    
-#     # 8 hour statistics
-#     mean_8h = data.resample('8h').mean()
-    
-#     #Highest, second-highest, third-highest, and fourth-highest daily max 8-hour values
-#     max1234_8h = find_max_n(mean_8h, 4)
-    
-#     #Number of daily max 1-hour values that exceeded the level of the 8-hour standard
-#     days_gt_std_8h = days_max_gt_std(mean_8h, 0.07)
-    
-    
-#     stats = {
-#         "o3" : {
-#             "1h" : {
-#                 "mean": np.nanmean(data),
-#                 "min": np.nanmin(data),
-#                 "max_1": max1234_1h[0],
-#                 "max_2": max1234_1h[1],
-#                 "max_3": max1234_1h[2],
-#                 "max_4": max1234_1h[3],
-#                 "pct_25": np.nanpercentile(data, 25),
-#                 "pct_75": np.nanpercentile(data, 75),
-#                 "pct_98": np.nanpercentile(data, 98),
-#                 "days_max_gt_std": days_gt_std_1h,
-#             },
-#             "8h" : {
-#                 "mean": np.nanmean(mean_8h),
-#                 "min": np.nanmin(mean_8h),
-#                 "max_1": max1234_8h[0],
-#                 "max_2": max1234_8h[1],
-#                 "max_3": max1234_8h[2],
-#                 "max_4": max1234_8h[3],
-#                 "pct_25": np.nanpercentile(mean_8h, 25),
-#                 "pct_75": np.nanpercentile(mean_8h, 75),
-#                 "pct_98": np.nanpercentile(mean_8h, 98),
-#                 "days_max_gt_std": days_gt_std_8h,
-#             }
-#         }
-#     }
-    
-#     return stats
-    
-
-# def so2_stats(data):
-#     """
-#     Compute the EPA monitoring report values for SO2. Data should be in ppm.
-    
-#     Inputs
-#     ------
-#         :param data: dataframe or Series with timestamp index and pollution data column.
-#                     should be resampled to 1h timebase
-#         :type data: pd.DataFrame or pd.Series
-        
-#     Returns
-#     -------
-#         stats: dictionary of statistics
-#     """
-    
-#     # 1 hour statistics
-    
-#     #Highest and second-highest daily max 1-hour values
-#     max12_1h = find_max_n(data, 2)
-    
-#     # 99th percentile of daily max 1hr values
-#     pct_99_daily_max = percentile_of_daily_max(data, 99)
-    
-#     #Number of daily max 1-hour values that exceeded the level of the 1-hour standard
-#     days_gt_std_1h = days_max_gt_std(data, 0.075)
-    
-    
-#     #24 hour statistics
-#     mean_24h = data.resample('24h').mean()
-    
-#     #Highest and second-highest daily max 24-hour values
-#     max12_24h = find_max_n(mean_24h, 2)
-    
-#     #Number of daily max 24-hour values that exceeded the level of the 24-hour standard
-#     days_gt_std_24h = days_max_gt_std(mean_24h, 0.14)
-    
-    
-#     stats = {
-#         "so2" : {
-#             "1h" : {
-#                 "mean": np.nanmean(data),
-#                 "min": np.nanmin(data),
-#                 "max_1": max12_1h[0],
-#                 "max_2": max12_1h[1],
-#                 "pct_25": np.nanpercentile(data, 25),
-#                 "pct_75": np.nanpercentile(data, 75),
-#                 "pct_98": np.nanpercentile(data, 98),
-#                 "pct_99_daily_max": pct_99_daily_max,
-#                 "days_max_gt_std": days_gt_std_1h,
-#             },
-#             "24h" : {
-#                 "mean": np.nanmean(mean_24h),
-#                 "min": np.nanmin(mean_24h),
-#                 "max_1": max12_24h[0],
-#                 "max_2": max12_24h[1],
-#                 "pct_25": np.nanpercentile(mean_24h, 25),
-#                 "pct_75": np.nanpercentile(mean_24h, 75),
-#                 "pct_98": np.nanpercentile(mean_24h, 98),
-#                 "days_max_gt_std": days_gt_std_24h,
-#             }
-#         }
-#     }
-    
-#     return stats
-    
-
-# def pm25_stats(data):
-#     """
-#     Compute the EPA monitoring report values for PM 2.5. Data should be in micrograms per
-#     cubic meter.
-    
-#     Inputs
-#     ------
-#         :param data: dataframe or Series with timestamp index and pollution data column.
-#                     should be resampled to 1h timebase
-#         :type data: pd.DataFrame or pd.Series
-        
-#     Returns
-#     -------
-#         stats: dictionary of statistics
-#     """
-    
-#     #24 hour statistics
-#     mean_24h = data.resample('24h').mean()
-    
-#     #Highest, second-highest, third-highest, and fourth-highest daily max 1-hour values
-#     max1234_24h = find_max_n(mean_24h, 4)
-    
-#     stats = {
-#         "pm25": {
-#             "24h": {
-#                 "mean": np.nanmean(mean_24h),
-#                 "min": np.nanmin(mean_24h),
-#                 "max_1": max1234_24h[0],
-#                 "max_2": max1234_24h[1],
-#                 "max_3": max1234_24h[2],
-#                 "max_4": max1234_24h[3],
-#                 "pct_25": np.nanpercentile(mean_24h, 25),
-#                 "pct_75": np.nanpercentile(mean_24h, 75),
-#                 "pct_98": np.nanpercentile(mean_24h, 98),
-#             }
-#         }
-#     }
-    
-#     return stats
-
-
-# def pm10_stats(data):
-#     """
-#     Compute the EPA monitoring report values for PM 10. Data should be in micrograms per
-#     cubic meter.
-    
-#     Inputs
-#     ------
-#         :param data: dataframe or Series with timestamp index and pollution data column.
-#                     should be resampled to 1h timebase
-#         :type data: pd.DataFrame or pd.Series
-        
-#     Returns
-#     -------
-#         stats: dictionary of statistics
-#     """
-    
-#     #24 hour statistics
-#     mean_24h = data.resample('24h').mean()
-    
-#     #Highest and second-highest daily max 1-hour values
-#     max12_24h = find_max_n(mean_24h, 2)
-    
-#     #Number of daily max 24-hour values that exceeded the level of the 24-hour standard
-#     days_gt_std_24h = days_max_gt_std(mean_24h, 150)
-    
-#     stats = {
-#         "pm10": {
-#             "24h": {
-#                 "mean": np.nanmean(mean_24h),
-#                 "min": np.nanmin(mean_24h),
-#                 "max_1": max12_24h[0],
-#                 "max_2": max12_24h[1],
-#                 "pct_25": np.nanpercentile(mean_24h, 25),
-#                 "pct_75": np.nanpercentile(mean_24h, 75),
-#                 "pct_98": np.nanpercentile(mean_24h, 98),
-#                 "days_max_gt_std": days_gt_std_24h,
-#             }
-#         }
-#     }
-    
-#     return stats
-
-# ##### COMPUTING EPA CHOSEN STATS #####
-
-# POLLUTANTS = {
-#     'co': co_stats,
-#     'pb': pb_stats,
-#     'no2': no2_stats,
-#     'o3': o3_stats,
-#     'so2': so2_stats,
-#     'pm25': pm25_stats,
-#     'pm10': pm10_stats
-# }
-
-
-# def EPAstats(df, col, pollutants=['co', 'no2', 'o3', 'pm10', 'pm25', 'so2'], tscol=None):
-#     """
-#     Compute the EPA monitoring report statistics for chosen pollutants with any appropriate 
-#     time-series data. 
-#     EPA recognized pollutants:
-#         CO, Pb, NO2, O3, SO2, PM2.5, PM10
-    
-#     Inputs
-#     ------
-#         :param df: dataframe containing time series pollution data
-#         :type data: pandas Dataframe
-#         :param col: column name(s) within df that correspond(s) to chosen pollutants
-#         :type col: str or list of str
-        
-#     Optional Inputs
-#     ---------------
-#     :param pollutants: list of pollutants to be computed 
-#                         default options = ['co', 'no2', 'o3', 'pm10', 'pm25', 'so2']
-#     :type pollutants: str or list of str 
-#     :param tscol: column name within df that is the timestamp column. Must be pd.Datetime Series
-#                     If left as None, then timestamp is assumed to be df.index
-#     :type tscol: str
-    
-#     Returns
-#     -------
-#         z : dictionary of statistics
-        
-    
-#     References
-#     ----------
-#     EPA monitoring report statistics:
-#     https://www.epa.gov/outdoor-air-quality-data/about-air-data-reports#con
-    
-#     EPA Standard Pollution Levels:
-#     https://www.epa.gov/criteria-air-pollutants/naaqs-table
-    
-#     """
-#     if isinstance(col, str):
-#         col = list(col)
-    
-#     # If timestamp column isn't given, we assume the index to be DatetimeIndex
-#     if tscol is None:
-#         data = df[col].copy(deep=True)
-#     else:
-#         tscol = list(tscol)
-#         data = df[tscol+col].copy(deep=True)
-#         data.set_index(tscol, inplace=True)
-    
-#     data.index = pd.DatetimeIndex(data.index)
-#     data = data.resample('1h').mean()
-
-#     stats = {}
-#     for i,p in enumerate(col):
-#         temp = data[col[i]]
-#         stats.update(POLLUTANTS[p](temp))
-    
-#     reform = {(outerKey, innerKey): values for outerKey, innerDict in stats.items() for innerKey, values in innerDict.items()}
-
-#     return reform
diff --git a/atmospy/trends.py b/atmospy/trends.py
index b9b5799..db70559 100644
--- a/atmospy/trends.py
+++ b/atmospy/trends.py
@@ -6,9 +6,6 @@
 from .utils import (
     remove_na,
 )
-from seaborn import (
-    FacetGrid,
-)
 from .utils import (
     check_for_numeric_cols,
     check_for_timestamp_col
diff --git a/atmospy/utils.py b/atmospy/utils.py
index 5923217..6efb7e6 100644
--- a/atmospy/utils.py
+++ b/atmospy/utils.py
@@ -1,15 +1,13 @@
 """Utility functions for internal use."""
 
-"""Utility functions to add:
-  - load dataset: https://github.com/mwaskom/seaborn/blob/master/seaborn/utils.py#L524
-  - 
-"""
 import os
 from urllib.request import (
     urlopen,
     urlretrieve
 )
-from seaborn.external.appdirs import user_cache_dir
+from seaborn.external.appdirs import (
+    user_cache_dir,
+)
 import pandas as pd
 
 DATASET_SOURCE = "https://raw.githubusercontent.com/dhhagan/atmospy-data/main"
@@ -100,6 +98,12 @@ def load_dataset(name, cache=True, data_home=None, **kwargs):
         
     if name == "us-bc":
         df["Timestamp GMT"] = pd.to_datetime(df["Timestamp GMT"])
+        
+    if name == "air-sensors-pm":
+        df["timestamp"] = pd.to_datetime(df["timestamp"])
+    
+    if name == "air-sensors-met":
+        df["timestamp_local"] = pd.to_datetime(df["timestamp_local"])
     
     return df
 
@@ -132,33 +136,3 @@ def check_for_numeric_cols(data, cols):
     for col in cols:
         if not pd.core.dtypes.common.is_numeric_dtype(data[col]):
             raise TypeError(f"Column `{col}` is not numeric. Please convert to a numeric dtype before proceeding.")
-
-# def stats(data, **kwargs):
-#     """[summary]
-#     Calculate MAE, RMSE, MBE, R2 Score
-
-#     Accepts either a dataframe and column names, or two arrays
-
-#     Args:
-#         data ([type]): [description]
-#     """
-#     x = kwargs.pop("x")
-#     y = kwargs.pop("y")
-
-#     return
-
-# def epastats(data, **kwargs):
-#     """[summary]
-
-#     Args:
-#         data ([type]): [description]
-#     """
-#     return
-
-# def statstable(stats, **kwargs):
-#     """[summary]
-
-#     Args:
-#         stats ([type]): [description]
-#     """
-#     return
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 0ed01f3..7ea4857 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "atmospy"
-version = "0.1.0a2"
+version = "0.1.0a3"
 description = "Data analysis and visualization for air quality data."
 authors = ["David H Hagan <david.hagan@quant-aq.com>"]
 license = "Apache-2.0"
diff --git a/tests/test_rose.py b/tests/test_rose.py
new file mode 100644
index 0000000..8529960
--- /dev/null
+++ b/tests/test_rose.py
@@ -0,0 +1,17 @@
+"""Test the relational plots."""
+import pytest
+from atmospy import pollutionroseplot
+import pandas as pd
+import matplotlib as mpl
+from atmospy import load_dataset
+import seaborn as sns
+
+def test_scatter_basics():
+    df = load_dataset("air-sensors-met")
+    
+    ax = pollutionroseplot(
+        data=df, ws="ws", wd='wd', pollutant="pm1", suffix="ppb", 
+        segments=30, calm=0.1
+    )
+    
+    assert isinstance(ax, mpl.axes._axes.Axes)
\ No newline at end of file