diff --git a/.github/workflows/nbval.yaml b/.github/workflows/nbval.yaml index abc712ae..e014b494 100644 --- a/.github/workflows/nbval.yaml +++ b/.github/workflows/nbval.yaml @@ -29,7 +29,7 @@ jobs: - name: Run notebook and check output run: | # --sanitize-with: pre-process text to remove irrelevant differences (e.g. warning filepaths) - pytest --nbval --sanitize-with docs/nbval_sanitization_rules.cfg docs/${{ matrix.notebook-file }} + pytest --nbval docs/${{ matrix.notebook-file }} --sanitize-with docs/nbval_sanitization_rules.cfg - name: Run notebooks again, save files run: | pip install nbconvert[webpdf] diff --git a/.gitignore b/.gitignore index a289ac22..95b56ca8 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ docs/sphinx/source/generated .eggs/ build/ dist/ +tmp/ rdtools.egg-info* # emacs temp files diff --git a/docs/sphinx/source/changelog/pending.rst b/docs/sphinx/source/changelog/pending.rst index 2d4ce63d..14d7a61a 100644 --- a/docs/sphinx/source/changelog/pending.rst +++ b/docs/sphinx/source/changelog/pending.rst @@ -12,7 +12,8 @@ Bug fixes --------- * Set marker linewidth to zero in `rdtools.plotting.degradation_summary_plots` (:pull:`433`) * Fix :py:func:`~rdtools.normalization.energy_from_power` returns incorrect index for shifted hourly data (:issue:`370`, :pull:`437`) -* Add warning to clearsky workflow when power_expected is passed by user (:pull:`439`) +* Add warning to clearsky workflow when ``power_expected`` is passed by user (:pull:`439`) +* Fix different results with Nan's and Zeros in power series (:issue:`313`, :pull:`442`) Requirements diff --git a/rdtools/__init__.py b/rdtools/__init__.py index 03ea0288..2ef39307 100644 --- a/rdtools/__init__.py +++ b/rdtools/__init__.py @@ -36,6 +36,9 @@ # from rdtools.plotting import soiling_rate_histogram # from rdtools.plotting import availability_summary_plots # from rdtools.availability import AvailabilityAnalysis +from rdtools.utilities import robust_quantile +from rdtools.utilities import robust_median +from rdtools.utilities import robust_mean from . import _version __version__ = _version.get_versions()['version'] diff --git a/rdtools/analysis_chains.py b/rdtools/analysis_chains.py index a3cfcc57..28179eab 100644 --- a/rdtools/analysis_chains.py +++ b/rdtools/analysis_chains.py @@ -8,7 +8,7 @@ import numpy as np import matplotlib.pyplot as plt from rdtools import normalization, filtering, aggregation, degradation -from rdtools import clearsky_temperature, plotting +from rdtools import clearsky_temperature, plotting, utilities import warnings @@ -493,8 +493,9 @@ def _pvwatts_norm(self, poa_global, temperature_cell): if renorm: # Normalize to the 95th percentile for convenience, this is renormalized out # in the calculations but is relevant to normalized_filter() - x = energy_normalized[np.isfinite(energy_normalized)] - energy_normalized = energy_normalized / x.quantile(0.95) + q = utilities.robust_quantile(energy_normalized[np.isfinite(energy_normalized)], 0.95) + + energy_normalized = energy_normalized / q return energy_normalized, insolation @@ -1012,7 +1013,6 @@ def sensor_analysis( ------- None """ - self._sensor_preprocess() sensor_results = {} diff --git a/rdtools/degradation.py b/rdtools/degradation.py index ba50476e..4bd9ba57 100644 --- a/rdtools/degradation.py +++ b/rdtools/degradation.py @@ -5,6 +5,7 @@ import statsmodels.api as sm from rdtools.bootstrap import _make_time_series_bootstrap_samples, \ _construct_confidence_intervals +from rdtools import utilities def degradation_ols(energy_normalized, confidence_level=68.2): @@ -259,7 +260,7 @@ def degradation_year_on_year(energy_normalized, recenter=True, if recenter: start = energy_normalized.index[0] oneyear = start + pd.Timedelta('364d') - renorm = energy_normalized[start:oneyear].median() + renorm = utilities.robust_median(energy_normalized[start:oneyear]) else: renorm = 1.0 diff --git a/rdtools/filtering.py b/rdtools/filtering.py index 6a065bae..7c3759d4 100644 --- a/rdtools/filtering.py +++ b/rdtools/filtering.py @@ -8,6 +8,7 @@ from scipy.interpolate import interp1d import rdtools import xgboost as xgb +from rdtools import utilities # Load in the XGBoost clipping model using joblib. xgboost_clipping_model = None @@ -294,7 +295,7 @@ def pvlib_clearsky_filter( def clip_filter(power_ac, model="logic", **kwargs): """ Master wrapper for running one of the desired clipping filters. - The default filter run is the quantile clipping filter. + The default filter run is the logic clipping filter. Parameters ---------- @@ -350,7 +351,7 @@ def quantile_clip_filter(power_ac, quantile=0.98): Boolean Series of whether the given measurement is below 99% of the quantile filter. """ - v = power_ac.quantile(quantile) + v = utilities.robust_quantile(power_ac, quantile) return power_ac < v * 0.99 @@ -510,13 +511,15 @@ def _apply_overall_clipping_threshold(power_ac, clipping_mask, clipped_power_ac) periods are labeled as True and non-clipping periods are labeled as False. Has a pandas datetime index. """ + q_power_ac = utilities.robust_quantile(power_ac, 0.99) + q_clipped_power_ac = utilities.robust_quantile(clipped_power_ac, 0.99) + upper_bound_pdiff = abs( - (power_ac.quantile(0.99) - clipped_power_ac.quantile(0.99)) - / ((power_ac.quantile(0.99) + clipped_power_ac.quantile(0.99)) / 2) + (q_power_ac - q_clipped_power_ac) / ((q_power_ac + q_clipped_power_ac) / 2) ) percent_clipped = len(clipped_power_ac) / len(power_ac) * 100 if (upper_bound_pdiff < 0.005) & (percent_clipped > 4): - max_clip = power_ac >= power_ac.quantile(0.99) + max_clip = power_ac >= q_power_ac clipping_mask = clipping_mask | max_clip return clipping_mask @@ -644,9 +647,11 @@ def logic_clip_filter( # for high frequency data sets. daily_mean = clip_pwr.resample("D").mean() df_daily = daily_mean.to_frame(name="mean") - df_daily["clipping_max"] = clip_pwr.groupby(pd.Grouper(freq="D")).quantile(0.99) - df_daily["clipping_min"] = clip_pwr.groupby(pd.Grouper(freq="D")).quantile( - 0.075 + df_daily["clipping_max"] = clip_pwr.groupby(pd.Grouper(freq="D")).agg( + utilities.robust_quantile, q=0.99 + ) + df_daily["clipping_min"] = clip_pwr.groupby(pd.Grouper(freq="D")).agg( + utilities.robust_quantile, q=0.075 ) daily_clipping_max = df_daily["clipping_max"].reindex( index=power_ac_copy.index, method="ffill" diff --git a/rdtools/test/analysis_chains_test.py b/rdtools/test/analysis_chains_test.py index 27305343..95caac68 100644 --- a/rdtools/test/analysis_chains_test.py +++ b/rdtools/test/analysis_chains_test.py @@ -68,6 +68,48 @@ def sensor_analysis(sensor_parameters): return rd_analysis +@pytest.fixture +def sensor_analysis_nans(sensor_parameters): + def randomly_replace_with(series, replace_with=0, fraction=0.1, seed=None): + """ + Randomly replace a fraction of entries in a pandas Series with input value `replace_with`. + + Parameters: + series (pd.Series): The input pandas Series. + fraction (float): The fraction of entries to replace with 0. Default is 0.1 (10%). + seed (int, optional): Seed for the random number generator for reproducibility. + + Returns: + pd.Series: The modified pandas Series with some entries replaced by 0. + """ + if seed is not None: + np.random.seed(seed) + + # Determine the number of entries to replace + n_replace = int(len(series) * fraction) + + # Randomly select indices to replace + replace_indices = np.random.choice(series.index, size=n_replace, replace=False) + + # Replace selected entries with + series.loc[replace_indices] = replace_with + + return series + + sensor_parameters_zeros = sensor_parameters.copy() + sensor_parameters_nans = sensor_parameters.copy() + + sensor_parameters_zeros["pv"] = randomly_replace_with(sensor_parameters["pv"], seed=0) + sensor_parameters_nans["pv"] = sensor_parameters_zeros["pv"].replace(0, np.nan) + + rd_analysis_zeros = TrendAnalysis(**sensor_parameters_zeros) + rd_analysis_zeros.sensor_analysis(analyses=["yoy_degradation"]) + + rd_analysis_nans = TrendAnalysis(**sensor_parameters_nans) + rd_analysis_nans.sensor_analysis(analyses=["yoy_degradation"]) + return rd_analysis_zeros, rd_analysis_nans + + @pytest.fixture def sensor_analysis_exp_power(sensor_parameters): power_expected = normalization.pvwatts_dc_power( @@ -209,6 +251,21 @@ def test_sensor_analysis(sensor_analysis): assert [-1, -1] == pytest.approx(ci, abs=1e-2) +def test_sensor_analysis_nans(sensor_analysis_nans): + rd_analysis_zeros, rd_analysis_nans = sensor_analysis_nans + + yoy_results_zeros = rd_analysis_zeros.results["sensor"]["yoy_degradation"] + rd_zeros = yoy_results_zeros["p50_rd"] + ci_zeros = yoy_results_zeros["rd_confidence_interval"] + + yoy_results_nans = rd_analysis_nans.results["sensor"]["yoy_degradation"] + rd_nans = yoy_results_nans["p50_rd"] + ci_nans = yoy_results_nans["rd_confidence_interval"] + + assert rd_zeros == pytest.approx(rd_nans, abs=1e-2) + assert ci_zeros == pytest.approx(ci_nans, abs=1e-1) + + def test_sensor_analysis_filter_components(sensor_analysis): columns = sensor_analysis.sensor_filter_components_aggregated.columns assert {'two_way_window_filter'} == set(columns) diff --git a/rdtools/test/utilities_test.py b/rdtools/test/utilities_test.py new file mode 100644 index 00000000..b20e7255 --- /dev/null +++ b/rdtools/test/utilities_test.py @@ -0,0 +1,43 @@ +import pandas as pd +import numpy as np +import pytest +from rdtools.utilities import robust_quantile, robust_median, robust_mean + + +@pytest.fixture +def data(): + data_zeros = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + data_nan = pd.Series([np.nan, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + return data_zeros, data_nan + + +def test_robust_quantile(data): + data_zeros, data_nan = data + quantile = 0.5 + expected_result = 5.5 + assert expected_result == robust_quantile(data_zeros, quantile) + assert expected_result == robust_quantile(data_nan, quantile) + + quantile = 0.25 + expected_result = 3.25 + assert expected_result == robust_quantile(data_zeros, quantile) + assert expected_result == robust_quantile(data_nan, quantile) + + quantile = 0.75 + expected_result = 7.75 + assert expected_result == robust_quantile(data_zeros, quantile) + assert expected_result == robust_quantile(data_nan, quantile) + + +def test_robust_median(data): + data_zeros, data_nan = data + expected_result = 5.5 + assert expected_result == robust_median(data_zeros) + assert expected_result == robust_median(data_nan) + + +def test_robust_mean(data): + data_zeros, data_nan = data + expected_result = 5.5 + assert expected_result == robust_mean(data_zeros) + assert expected_result == robust_mean(data_nan) diff --git a/rdtools/utilities.py b/rdtools/utilities.py new file mode 100644 index 00000000..c72e4174 --- /dev/null +++ b/rdtools/utilities.py @@ -0,0 +1,79 @@ +"""Utility functions for rdtools.""" + + +def robust_quantile(x, q): + """ + Compute the q-th quantile of a time series (x), ignoring small values and NaN's. + NaN's and small values [x < Q(x,q)/1000] are removed before calculating the quantile. + This function ensures that time series with NaN's and distributions without + NaN's return the same results. Should only be used if x is expected to be ≥0. + + Parameters + ---------- + x : pandas.Series + Input time series. + q : float + Probability value. + + Returns + ------- + quantile : float + The q-th quantile of x, ignoring small values and NaN's. + """ + + small = x.astype(float).fillna(0).quantile(q) / 1000 + q = x[x > small].quantile(q) + + return q + + +def robust_median(x, q=0.99): + """ + Compute the median of a time series (x), ignoring small values and NaN's. + NaN's and small values [Q(x,q)/1000] are removed before calculating the mean. + This function ensures that time series with NaN's and distributions without + NaN's return the same results. Should only be used if x is expected to be ≥0. + + Parameters + ---------- + x : pandas.Series + Input time series. + q : float, default 0.99 + Probability value to use for the small values threshold calculation [Q(x,q)/1000]. + + Returns + ------- + quantile : float + The q-th quantile of x, ignoring small values and NaN's. + """ + + small = x.astype(float).fillna(0).quantile(q) / 1000 + mdn = x[x > small].median() + + return mdn + + +def robust_mean(x, q=0.99): + """ + Compute the mean of a time series (x), ignoring small values and NaN's. + NaN's and small values [x < Q(x,q)/1000] are removed before calculating the mean. + This function ensures that time series with NaN's and distributions without + NaN's return the same results. Should only be used if x is expected to be ≥0. + + Parameters + ---------- + x : pandas.Series + Input time series. + q : float, default 0.99 + Probability value to use for the small values threshold calculation. + + Returns + ------- + quantile : float + The q-th quantile of x, ignoring small values and NaN's. + """ + + small = x.astype(float).fillna(0).quantile(q) / 1000 + m = x[x > small].mean() + + return m