diff --git a/docs/sphinx/source/changelog/pending.rst b/docs/sphinx/source/changelog/pending.rst index 2d4adc83..c905c088 100644 --- a/docs/sphinx/source/changelog/pending.rst +++ b/docs/sphinx/source/changelog/pending.rst @@ -20,6 +20,7 @@ Enhancements * Added a new wrapper function for clearsky filters (:pull:`412`) * Improve test coverage, especially for the newly added filter capabilities (:pull:`413`) * Added codecov.yml configuration file (:pull:`420`) +* Added new methods ``perfect_clean_complex`` and ``inferred_clean_complex``, which detect negative shifts and piecewise changes in the slope for soiling detection in :py:func:`~rdtools.soiling.soiling_srr` (:pull:`426`) * Availability module no longer considered experimental (:pull:`429`) * Add capability to seed the CircularBlockBootstrap (:pull:`429`) * Allow sub-daily aggregation in :py:func:`~rdtools.degradation.degradation_year_on_year` (:pull:`390`) @@ -31,11 +32,15 @@ Bug fixes * Deploy workflow was replaced with trusted publisher workflow for pypi (:pull:`427`) * Fix pandas 2.0.0 deprecations and update syntax changes (:pull:`428`) * Fix numpy 2.0.0 deprecations and update syntax changes (:pull:`428`) +* Fixed pylint bare-except error for :py:func:`~rdtools.soiling.segmented_soiling_period` +  in ``soiling.py`` (:pull:`432`) Tests ----- * Testing matrix was updated to include python = [3.9, 3.10, 3.11, 3.12] (:pull:`428`) * nbval sanitization rules were added for date and time stamp (:pull:`428`) +* Added pytest tests to cover invalid segmentations for +  :py:func:`~rdtools.soiling.segmented_soiling_period` in ``soiling_cods_test.py`` (:pull:`432`) Documentation ------------- @@ -184,3 +189,4 @@ Contributors * Martin Springer (:ghuser:`martin-springer`) * Michael Deceglie (:ghuser:`mdeceglie`) * Kirsten Perry (:ghuser:`kperrynrel`) +* Quyen Nguyen (:ghuser:`qnguyen345`) diff --git a/docs/sphinx/source/changelog/v2.2.0-beta.2.rst b/docs/sphinx/source/changelog/v2.2.0-beta.2.rst index 32fa3da2..6f0dab31 100644 --- a/docs/sphinx/source/changelog/v2.2.0-beta.2.rst +++ b/docs/sphinx/source/changelog/v2.2.0-beta.2.rst @@ -20,4 +20,4 @@ Requirements Contributors ------------ * Martin Springer (:ghuser:`martin-springer`) -* Michael Deceglie (:ghuser:`mdeceglie`) \ No newline at end of file +* Michael Deceglie (:ghuser:`mdeceglie`) diff --git a/rdtools/soiling.py b/rdtools/soiling.py index 92a3bfb8..c09cc039 100644 --- a/rdtools/soiling.py +++ b/rdtools/soiling.py @@ -1,44 +1,47 @@ -''' +""" Functions for calculating soiling metrics from photovoltaic system data. The soiling module is currently experimental. The API, results, and default behaviors may change in future releases (including MINOR and PATCH releases) as the code matures.
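A minimal sketch (outside this patch) of silencing the experimental-status warning quoted above when first importing the module, assuming filtering on the message text since no dedicated warning category is defined:

import warnings

with warnings.catch_warnings():
    # Filter by message prefix; rdtools does not define a custom warning class here.
    warnings.filterwarnings("ignore",
                            message="The soiling module is currently experimental")
    from rdtools import soiling  # import triggers the warning on first import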
-''' -from rdtools import degradation as RdToolsDeg -from rdtools.bootstrap import _make_time_series_bootstrap_samples +""" +import bisect +import itertools +import sys +import time import warnings -import pandas as pd import numpy as np -from scipy.stats.mstats import theilslopes -from filterpy.kalman import KalmanFilter +import pandas as pd +import scipy.stats as st +import statsmodels.api as sm from filterpy.common import Q_discrete_white_noise -import itertools -import bisect -import time -import sys +from filterpy.kalman import KalmanFilter +from scipy.optimize import curve_fit +from scipy.stats.mstats import theilslopes from statsmodels.tsa.seasonal import STL from statsmodels.tsa.stattools import adfuller -import statsmodels.api as sm -lowess = sm.nonparametric.lowess -warnings.warn( - 'The soiling module is currently experimental. The API, results, ' - 'and default behaviors may change in future releases (including MINOR ' - 'and PATCH releases) as the code matures.' -) +from rdtools import degradation as RdToolsDeg +from rdtools.bootstrap import _make_time_series_bootstrap_samples + +lowess = sm.nonparametric.lowess # Used in CODSAnalysis/Matt + +warnings.warn("The soiling module is currently experimental. The API, results, " + "and default behaviors may change in future releases (including MINOR " + "and PATCH releases) as the code matures.") # Custom exception class NoValidIntervalError(Exception): - '''raised when no valid rows appear in the result dataframe''' + """raised when no valid rows appear in the result dataframe""" + pass -class SRRAnalysis(): - ''' +class SRRAnalysis: + """ Class for running the stochastic rate and recovery (SRR) photovoltaic soiling loss analysis presented in Deceglie et al. JPV 8(2) p547 2018 @@ -55,10 +58,9 @@ class SRRAnalysis(): precipitation_daily : pandas.Series, default None Daily total precipitation. (Ignored if ``clean_criterion='shift'`` in subsequent calculations.) 
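A minimal construction sketch for the class, using hypothetical daily series; the constructor below raises ValueError unless both series have an inferable daily frequency. The :py:func:`soiling_srr` wrapper is the more common entry point.

import numpy as np
import pandas as pd
from rdtools.soiling import SRRAnalysis

idx = pd.date_range("2019-01-01", "2020-12-31", freq="D")
# Hypothetical inputs: a sawtooth performance index (gradual loss, periodic recovery)
# and a flat insolation series; real data would be normalized energy and POA insolation.
pi = pd.Series(1 - (0.0005 * np.arange(len(idx))) % 0.03, index=idx)
insolation = pd.Series(6000.0, index=idx)

srr_analysis = SRRAnalysis(pi, insolation)  # precipitation_daily is optional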
- ''' + """ - def __init__(self, energy_normalized_daily, insolation_daily, - precipitation_daily=None): + def __init__(self, energy_normalized_daily, insolation_daily, precipitation_daily=None): self.pm = energy_normalized_daily # daily performance metric self.insolation_daily = insolation_daily self.precipitation_daily = precipitation_daily # daily precipitation @@ -66,23 +68,22 @@ def __init__(self, energy_normalized_daily, insolation_daily, # insolation-weighted soiling ratios in _calc_monte: self.monte_losses = [] - if pd.infer_freq(self.pm.index) != 'D': - raise ValueError('Daily performance metric series must have ' - 'daily frequency') + if pd.infer_freq(self.pm.index) != "D": + raise ValueError("Daily performance metric series must have " "daily frequency") - if pd.infer_freq(self.insolation_daily.index) != 'D': - raise ValueError('Daily insolation series must have ' - 'daily frequency') + if pd.infer_freq(self.insolation_daily.index) != "D": + raise ValueError("Daily insolation series must have " "daily frequency") if self.precipitation_daily is not None: - if pd.infer_freq(self.precipitation_daily.index) != 'D': - raise ValueError('Precipitation series must have ' - 'daily frequency') - - def _calc_daily_df(self, day_scale=13, clean_threshold='infer', - recenter=True, clean_criterion='shift', precip_threshold=0.01, - outlier_factor=1.5): - ''' + if pd.infer_freq(self.precipitation_daily.index) != "D": + raise ValueError("Precipitation series must have " "daily frequency") + + ############################################################################### + # add neg_shift and piecewise into parameters/Matt + def _calc_daily_df(self, day_scale=13, clean_threshold="infer", recenter=True, + clean_criterion="shift", precip_threshold=0.01, outlier_factor=1.5, + neg_shift=False, piecewise=False): + """ Calculates self.daily_df, a pandas dataframe prepared for SRR analysis, and self.renorm_factor, the renormalization factor for the daily performance @@ -118,26 +119,39 @@ def _calc_daily_df(self, day_scale=13, clean_threshold='infer', The factor used in the Tukey fence definition of outliers for flagging positive shifts in the rolling median used for cleaning detection. A smaller value will cause more and smaller shifts to be classified as cleaning events. - ''' - if (day_scale % 2 == 0) and ('shift' in clean_criterion): - warnings.warn('An even value of day_scale was passed. An odd value is ' - 'recommended, otherwise, consecutive days may be erroneously ' - 'flagged as cleaning events. ' - 'See https://github.com/NREL/rdtools/issues/189') + neg_shift : bool, default True + where True results in additional subdividing of soiling intervals + when negative shifts are found in the rolling median of the performance + metric. Inferred corrections in the soiling fit are made at these + negative shifts. False results in no additional subdivides of the + data where excessive negative shifts can invalidate a soiling interval. + piecewise : bool, default True + where True results in each soiling interval of sufficient length + being tested for significant fit improvement with 2 piecewise linear + fits. If the criteria of significance is met the soiling interval is + subdivided into the 2 separate intervals. False results in no + piecewise fit being tested. + """ + if (day_scale % 2 == 0) and ("shift" in clean_criterion): + warnings.warn("An even value of day_scale was passed. An odd value is " + "recommended, otherwise, consecutive days may be erroneously " + "flagged as cleaning events. 
" + "See https://github.com/NREL/rdtools/issues/189") df = self.pm.to_frame() - df.columns = ['pi'] + df.columns = ["pi"] df_insol = self.insolation_daily.to_frame() - df_insol.columns = ['insol'] + df_insol.columns = ["insol"] df = df.join(df_insol) precip = self.precipitation_daily + if precip is not None: df_precip = precip.to_frame() - df_precip.columns = ['precip'] + df_precip.columns = ["precip"] df = df.join(df_precip) else: - df['precip'] = 0 + df["precip"] = 0 # find first and last valid data point start = df[~df.pi.isnull()].index[0] @@ -145,16 +159,16 @@ def _calc_daily_df(self, day_scale=13, clean_threshold='infer', df = df[start:end] # create a day count column - df['day'] = range(len(df)) + df["day"] = range(len(df)) # Recenter to median of first year, as in YoY degradation if recenter: - oneyear = start + pd.Timedelta('364d') - renorm = df.loc[start:oneyear, 'pi'].median() + oneyear = start + pd.Timedelta("364d") + renorm = df.loc[start:oneyear, "pi"].median() else: renorm = 1 - df['pi_norm'] = df['pi'] / renorm + df["pi_norm"] = df["pi"] / renorm # Find the beginning and ends of outages longer than dayscale bfill = df['pi_norm'].bfill(limit=day_scale) @@ -171,50 +185,110 @@ def _calc_daily_df(self, day_scale=13, clean_threshold='infer', df_ffill = df.ffill(limit=day_scale).copy() # Calculate rolling median - df['pi_roll_med'] = \ - df_ffill.pi_norm.rolling(day_scale, center=True).median() + df["pi_roll_med"] = df_ffill.pi_norm.rolling(day_scale, center=True).median() # Detect steps in rolling median - df['delta'] = df.pi_roll_med.diff() - if clean_threshold == 'infer': + df["delta"] = df.pi_roll_med.diff() + if clean_threshold == "infer": deltas = abs(df.delta) - clean_threshold = deltas.quantile(0.75) + \ - outlier_factor * (deltas.quantile(0.75) - deltas.quantile(0.25)) + clean_threshold = deltas.quantile(0.75) + outlier_factor * ( + deltas.quantile(0.75) - deltas.quantile(0.25) + ) + + df["clean_event_detected"] = df.delta > clean_threshold - df['clean_event_detected'] = (df.delta > clean_threshold) - precip_event = (df['precip'] > precip_threshold) + ########################################################################## + # Matt added these lines but the function "_collapse_cleaning_events" + # was written by Asmund, it reduces multiple days of cleaning events + # in a row to a single event + if piecewise is True: + reduced_cleaning_events = _collapse_cleaning_events( + df.clean_event_detected, df.delta.values, 5 + ) + df["clean_event_detected"] = reduced_cleaning_events + + ########################################################################## + precip_event = df["precip"] > precip_threshold - if clean_criterion == 'precip_and_shift': + if clean_criterion == "precip_and_shift": # Detect which cleaning events are associated with rain # within a 3 day window - precip_event = precip_event.rolling( - 3, center=True, min_periods=1).apply(any).astype(bool) - df['clean_event'] = (df['clean_event_detected'] & precip_event) - elif clean_criterion == 'precip_or_shift': - df['clean_event'] = (df['clean_event_detected'] | precip_event) - elif clean_criterion == 'precip': - df['clean_event'] = precip_event - elif clean_criterion == 'shift': - df['clean_event'] = df['clean_event_detected'] + precip_event = ( + precip_event.rolling(3, center=True, min_periods=1).apply(any).astype(bool)) + df["clean_event"] = df["clean_event_detected"] & precip_event + elif clean_criterion == "precip_or_shift": + df["clean_event"] = df["clean_event_detected"] | precip_event + elif 
clean_criterion == "precip": + df["clean_event"] = precip_event + elif clean_criterion == "shift": + df["clean_event"] = df["clean_event_detected"] else: - raise ValueError('clean_criterion must be one of ' - '{"precip_and_shift", "precip_or_shift", ' - '"precip", "shift"}') + raise ValueError("clean_criterion must be one of " + '{"precip_and_shift", "precip_or_shift", "precip", "shift"}') + + df["clean_event"] = df.clean_event | out_start | out_end - df['clean_event'] = df.clean_event | out_start | out_end + ####################################################################### + # add negative shifts, which allow further segmentation of the soiling + # intervals and handle correction for data outages/Matt + df.delta = df.delta.fillna(0) # to avoid NA corrupting calculation + if neg_shift is True: + df["drop_event"] = df.delta < -2.5 * clean_threshold + df["break_event"] = df.clean_event | df.drop_event + else: + df["break_event"] = df.clean_event.copy() - df = df.fillna(0) + ####################################################################### + # This happens earlier than in the original code but is necessary + # for adding piecewise breakpoints/Matt # Give an index to each soiling interval/run - df['run'] = df.clean_event.cumsum() - df.index.name = 'date' # this gets used by name + df["run"] = df.break_event.cumsum() + df.index.name = "date" # this gets used by name + ####################################################################### + # df.fillna(0) removed, as the zeros introduced in pi_norm negatively + # impact various fits in the code; I haven't yet found the original purpose + # or a failure due to removing it/Matt + + ##################################################################### + # piecewise=True enables adding a single breakpoint per soiling interval + # if statistical criteria are met with the piecewise linear fit + # compared to a single linear fit.
Intervals <45 days require more + stringent statistical improvements/Matt + if piecewise is True: + warnings.warn("piecewise=True was passed. For both piecewise=True " "and neg_shift=True, the method choice should " "be perfect_clean_complex or inferred_clean_complex") + min_soil_length = 27 # minimum number of days necessary for a piecewise fit + piecewise_loop = sorted(list(set(df["run"]))) + cp_dates = [] + for r in piecewise_loop: + run = df[df["run"] == r] + pr = run.pi_norm.copy() + pr = pr.ffill() # linear fitting can't handle NaNs + pr = pr.bfill() # catch a NaN in the first position + if len(run) > min_soil_length and run.pi_norm.sum() > 0: + sr, cp_date = segmented_soiling_period(pr, days_clean_vs_cp=13) + if cp_date is not None: + cp_dates.append(pr.index[cp_date]) + # save changes to df; note I would like to rename "clean_event" from + # the original code to something like "break_event"/Matt + df["slope_change_event"] = df.index.isin(cp_dates) + df["break_event"] = df.break_event | df.slope_change_event + df["run"] = df.break_event.cumsum() + else: + df["slope_change_event"] = False + + ###################################################################### self.renorm_factor = renorm self.daily_df = df - def _calc_result_df(self, trim=False, max_relative_slope_error=500.0, - max_negative_step=0.05, min_interval_length=7): - ''' + ###################################################################### + # added neg_shift into parameters in the following def/Matt + def _calc_result_df(self, trim=False, max_relative_slope_error=500.0, max_negative_step=0.05, + min_interval_length=7, neg_shift=False, piecewise=False): + """ Calculates self.result_df, a pandas dataframe summarizing the soiling intervals identified and self.analyzed_daily_df, a version of self.daily_df with additional columns calculated during analysis. @@ -234,13 +308,19 @@ def _calc_result_df(self, trim=False, max_relative_slope_error=500.0, min_interval_length : int, default 7 The minimum duration for an interval to be considered valid. Cannot be less than 2 (days). - ''' + neg_shift : bool, default False + where True results in additional subdividing of soiling intervals + when negative shifts are found in the rolling median of the performance + metric. Inferred corrections in the soiling fit are made at these + negative shifts. False results in no additional subdivides of the + data where excessive negative shifts can invalidate a soiling interval.
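To make the ``neg_shift`` behavior described above concrete, a toy sketch mirroring the break-event logic added in ``_calc_daily_df``, with a hard-coded threshold that would normally be inferred:

import pandas as pd

delta = pd.Series([0.0, 0.001, -0.04, 0.0, 0.03, 0.0])  # rolling-median day-to-day steps
clean_threshold = 0.01

clean_event = delta > clean_threshold        # upward shifts flag cleaning events
drop_event = delta < -2.5 * clean_threshold  # large downward shifts (neg_shift=True)
break_event = clean_event | drop_event       # either one ends the current soiling interval
print(break_event.cumsum().tolist())         # interval index per day: [0, 0, 1, 1, 2, 2]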
+ """ daily_df = self.daily_df result_list = [] if trim: # ignore first and last interval - res_loop = sorted(list(set(daily_df['run'])))[1:-1] + res_loop = sorted(list(set(daily_df["run"])))[1:-1] else: res_loop = sorted(list(set(daily_df['run']))) @@ -257,98 +337,269 @@ def _calc_result_df(self, trim=False, max_relative_slope_error=500.0, # valid=False row if not run_filtered.empty: run = run_filtered + #################################################################### + # see commented changes result_dict = { - 'start': start, - 'end': end, - 'length': length, - 'run': r, - 'run_slope': 0, - 'run_slope_low': 0, - 'run_slope_high': 0, - 'max_neg_step': min(run.delta), - 'start_loss': 1, - 'inferred_start_loss': run.pi_norm.mean(), - 'inferred_end_loss': run.pi_norm.mean(), - 'valid': False + "start": start, + "end": end, + "length": length, + "run": r, + "run_slope": 0, + "run_slope_low": 0, + "run_slope_high": 0, + "max_neg_step": min(run.delta), + "start_loss": 1, + "inferred_start_loss": run.pi_norm.median(), # changed from mean/Matt + "inferred_end_loss": run.pi_norm.median(), # changed from mean/Matt + "slope_err": 10000, # added high dummy start value for later logic/Matt + "valid": False, + "clean_event": run.clean_event.iloc[0], # record of clean events to distiguisih + # from other breaks/Matt + "run_loss_baseline": 0.0, # loss from the polyfit over the soiling intercal/Matt + ############################################################## } if len(run) > min_interval_length and run.pi_norm.sum() > 0: fit = theilslopes(run.pi_norm, run.day) fit_poly = np.poly1d(fit[0:2]) - result_dict['run_slope'] = fit[0] - result_dict['run_slope_low'] = fit[2] - result_dict['run_slope_high'] = min([0.0, fit[3]]) - result_dict['inferred_start_loss'] = fit_poly(start_day) - result_dict['inferred_end_loss'] = fit_poly(end_day) - result_dict['valid'] = True + result_dict["run_slope"] = fit[0] + result_dict["run_slope_low"] = fit[2] + result_dict["run_slope_high"] = min([0.0, fit[3]]) + result_dict["valid"] = True + ######################################################## + # moved the following 2 line to the next section within conditional statement/Matt + # result_dict['inferred_start_loss'] = fit_poly(start_day) + # result_dict['inferred_end_loss'] = fit_poly(end_day) + + #################################################### + # the following is moved here so median values are retained/Matt + # for soiling inferrences when rejected fits occur + + result_dict["slope_err"] = ( + result_dict["run_slope_high"] - result_dict["run_slope_low"] + ) / abs(result_dict["run_slope"]) + + if (result_dict["slope_err"] <= (max_relative_slope_error / 100.0)) & ( + result_dict["run_slope"] < 0): + result_dict["inferred_start_loss"] = fit_poly(start_day) + result_dict["inferred_end_loss"] = fit_poly(end_day) + ############################################# + # calculate loss over soiling interval per polyfit/matt + result_dict["run_loss_baseline"] = ( + result_dict["inferred_start_loss"] - result_dict["inferred_end_loss"]) + + ############################################### + result_list.append(result_dict) results = pd.DataFrame(result_list) if results.empty: - raise NoValidIntervalError('No valid soiling intervals were found') + raise NoValidIntervalError("No valid soiling intervals were found") + """ # Filter results for each interval, # setting invalid interval to slope of 0 + #moved above to line 356/Matt results['slope_err'] = ( results.run_slope_high - results.run_slope_low)\ / abs(results.run_slope) - # 
critera for exclusions - filt = ( - (results.run_slope > 0) | - (results.slope_err >= max_relative_slope_error / 100.0) | - (results.max_neg_step <= -1.0 * max_negative_step) - ) - - results.loc[filt, 'run_slope'] = 0 - results.loc[filt, 'run_slope_low'] = 0 - results.loc[filt, 'run_slope_high'] = 0 - results.loc[filt, 'valid'] = False + """ + ############################################################### + # negative shifts are now used as breaks for soiling intervals/Matt + # so new criteria for final filter to modify dataframe + if neg_shift is True: + warnings.warn("neg_shift = True was passed, for both Piecewise=True" + "and neg_shift=True cleaning_method choices should" + "be perfect_clean_complex or inferred_clean_complex") + filt = ((results.run_slope > 0) + | (results.slope_err >= max_relative_slope_error / 100.0) + # |(results.max_neg_step <= -1.0 * max_negative_step) + ) + + results.loc[filt, "run_slope"] = 0 + results.loc[filt, "run_slope_low"] = 0 + results.loc[filt, "run_slope_high"] = 0 + # only intervals that are now not valid are those that dont meet + # the minimum inteval length or have no data + # results.loc[filt, 'valid'] = False + ################################################################## + # original code below setting soiling intervals with extreme negative + # shift to zero slopes, /Matt + if neg_shift is False: + filt = ((results.run_slope > 0) + | (results.slope_err >= max_relative_slope_error / 100.0) + | (results.max_neg_step <= -1.0 * max_negative_step) + # remove line 389, want to store data for inferred values + # for calculations below + # |results.loc[filt, 'valid'] = False + ) + results.loc[filt, "run_slope"] = 0 + results.loc[filt, "run_slope_low"] = 0 + results.loc[filt, "run_slope_high"] = 0 + # results.loc[filt, "valid"] = False # Calculate the next inferred start loss from next valid interval - results['next_inferred_start_loss'] = np.clip( - results[results.valid].inferred_start_loss.shift(-1), - 0, 1) + results["next_inferred_start_loss"] = np.clip( + results[results.valid].inferred_start_loss.shift(-1), 0, 1) + # Calculate the inferred recovery at the end of each interval - results['inferred_recovery'] = np.clip( - results.next_inferred_start_loss - results.inferred_end_loss, - 0, 1) + ######################################################################## + # remove clipping on 'inferred_recovery' so absolute recovery can be + # used in later step where clipping can be considered/Matt + results["inferred_recovery"] = results.next_inferred_start_loss - results.inferred_end_loss + + ######################################################################## + # calculate beginning inferred shift (end of previous soiling period + # to start of current period)/Matt + results["prev_end"] = results[results.valid].inferred_end_loss.shift(1) + # if the current interval starts with a clean event, the previous end + # is a nan, and the current interval is valid then set prev_end=1 + results.loc[ + (results.clean_event is True) & (np.isnan(results.prev_end) & (results.valid is True)), + "prev_end"] = 1 # clean_event or clean_event_detected + results["inferred_begin_shift"] = results.inferred_start_loss - results.prev_end + # if orginal shift detection was positive the shift should not be + # negative due to fitting results + results.loc[results.clean_event, "inferred_begin_shift"] = np.clip( + results.inferred_begin_shift, 0, 1) + ####################################################################### + + if neg_shift is False: + results.loc[filt, 
"valid"] = False if len(results[results.valid]) == 0: - raise NoValidIntervalError('No valid soiling intervals were found') + raise NoValidIntervalError("No valid soiling intervals were found") new_start = results.start.iloc[0] new_end = results.end.iloc[-1] pm_frame_out = daily_df[new_start:new_end] - pm_frame_out = pm_frame_out.reset_index() \ - .merge(results, how='left', on='run') \ - .set_index('date') - - pm_frame_out['loss_perfect_clean'] = np.nan - pm_frame_out['loss_inferred_clean'] = np.nan - pm_frame_out['days_since_clean'] = \ - (pm_frame_out.index - pm_frame_out.start).dt.days - - # Calculate the daily derate - pm_frame_out['loss_perfect_clean'] = \ - pm_frame_out.start_loss + \ - pm_frame_out.days_since_clean * pm_frame_out.run_slope - # filling the flat intervals may need to be recalculated - # for different assumptions - pm_frame_out.loss_perfect_clean = \ - pm_frame_out.loss_perfect_clean.fillna(1) - - pm_frame_out['loss_inferred_clean'] = \ - pm_frame_out.inferred_start_loss + \ - pm_frame_out.days_since_clean * pm_frame_out.run_slope - # filling the flat intervals may need to be recalculated - # for different assumptions - pm_frame_out.loss_inferred_clean = \ - pm_frame_out.loss_inferred_clean.fillna(1) + pm_frame_out = ( + pm_frame_out.reset_index().merge(results, how="left", on="run").set_index("date")) + pm_frame_out["loss_perfect_clean"] = np.nan + pm_frame_out["loss_inferred_clean"] = np.nan + pm_frame_out["days_since_clean"] = (pm_frame_out.index - pm_frame_out.start).dt.days + + ####################################################################### + # new code for perfect and inferred clean with handling of/Matt + # negative shifts and changepoints within soiling intervals + # goes to line 563 + if (piecewise) | (neg_shift): + ################################################################### + pm_frame_out.inferred_begin_shift.bfill(inplace=True) + pm_frame_out["forward_median"] = ( + pm_frame_out.pi.iloc[::-1].rolling(10, min_periods=5).median()) + prev_shift = 1 + soil_inferred_clean = [] + soil_perfect_clean = [] + day_start = -1 + start_infer = 1 + start_perfect = 1 + soil_infer = 1 + soil_perfect = 1 + total_down = 0 + shift = 0 + shift_perfect = 0 + begin_perfect_shifts = [0] + begin_infer_shifts = [0] + + for date, rs, d, start_shift, changepoint, forward_median in zip( + pm_frame_out.index, pm_frame_out.run_slope, pm_frame_out.days_since_clean, + pm_frame_out.inferred_begin_shift, pm_frame_out.slope_change_event, + pm_frame_out.forward_median): + new_soil = d - day_start + day_start = d + + if new_soil <= 0: # begin new soil period + if (start_shift == prev_shift) | (changepoint): # no shift at + # a slope changepoint + shift = 0 + shift_perfect = 0 + else: + if (start_shift < 0) & (prev_shift < 0): # (both negative) or + # downward shifts to start last 2 intervals + shift = 0 + shift_perfect = 0 + total_down = total_down + start_shift # adding total downshifts + # to subtract from an eventual cleaning event + elif (start_shift > 0) & (prev_shift >= 0): # (both positive) or + # cleanings start the last 2 intervals + shift = start_shift + shift_perfect = 1 + total_down = 0 + # add #####################3/27/24 + elif (start_shift == 0) & (prev_shift >= 0): + shift = start_shift + shift_perfect = start_shift + total_down = 0 + ############################################################# + elif (start_shift >= 0) & (prev_shift < 0): # cleaning starts the current + # interval but there was a previous downshift + shift = start_shift + total_down # correct 
for the negative shifts + shift_perfect = shift # dont set to one 1 if correcting for a + # downshift (debateable alternative set to 1) + total_down = 0 + elif (start_shift < 0) & (prev_shift >= 0): + # negative shift starts the interval, previous shift was cleaning + shift = 0 + shift_perfect = 0 + total_down = start_shift + # check that shifts results in being at or above the median of + # the next 10 days of data + # this catches places where start points of polyfits were + # skewed below where data start + if (soil_infer + shift) < forward_median: + shift = forward_median - soil_infer + if (soil_perfect + shift_perfect) < forward_median: + shift_perfect = forward_median - soil_perfect + + # append the daily soiling ratio to each modeled fit + begin_perfect_shifts.append(shift_perfect) + begin_infer_shifts.append(shift) + # clip to last value in case shift ends up negative + soil_infer = np.clip((soil_infer + shift), soil_infer, 1) + start_infer = soil_infer # make next start value the last inferred value + soil_inferred_clean.append(soil_infer) + # clip to last value in case shift ends up negative + soil_perfect = np.clip((soil_perfect + shift_perfect), soil_perfect, 1) + start_perfect = soil_perfect + soil_perfect_clean.append(soil_perfect) + if changepoint is False: + prev_shift = start_shift # assigned at new soil period + + elif new_soil > 0: # within soiling period + # append the daily soiling ratio to each modeled fit + soil_infer = start_infer + rs * d + soil_inferred_clean.append(soil_infer) + + soil_perfect = start_perfect + rs * d + soil_perfect_clean.append(soil_perfect) + + pm_frame_out["loss_inferred_clean"] = pd.Series( + soil_inferred_clean, index=pm_frame_out.index) + pm_frame_out["loss_perfect_clean"] = pd.Series( + soil_perfect_clean, index=pm_frame_out.index) + + results["begin_perfect_shift"] = pd.Series(begin_perfect_shifts) + results["begin_infer_shift"] = pd.Series(begin_infer_shifts) + else: + pm_frame_out['loss_perfect_clean'] = pm_frame_out.start_loss + \ + pm_frame_out.days_since_clean * pm_frame_out.run_slope + # filling the flat intervals may need to be recalculated + # for different assumptions + pm_frame_out.loss_perfect_clean = pm_frame_out.loss_perfect_clean.fillna(1) + # inferred_start_loss was set to the value from poly fit at the beginning of the + # soiling interval + pm_frame_out['loss_inferred_clean'] = pm_frame_out.inferred_start_loss + \ + pm_frame_out.days_since_clean * pm_frame_out.run_slope + # filling the flat intervals may need to be recalculated + # for different assumptions + pm_frame_out.loss_inferred_clean = pm_frame_out.loss_inferred_clean.fillna(1) + ####################################################################### self.result_df = results self.analyzed_daily_df = pm_frame_out - def _calc_monte(self, monte, method='half_norm_clean'): - ''' + def _calc_monte(self, monte, method="half_norm_clean"): + """ Runs the Monte Carlo step of the SRR method. 
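For intuition on the 'half_norm_clean' option described below, a standalone sketch of the recovery draw it performs at each cleaning (the same expression appears later in this method); ``end`` and ``inferred_recovery`` are hypothetical values here:

import numpy as np

end = 0.93                # modeled soiling ratio just before the cleaning
inferred_recovery = 0.05  # recovery inferred from the interval fits

x = np.clip(end + inferred_recovery, 0, 1)                  # inferred clean level
inter_start = 1 - abs(np.random.normal(0.0, (1 - x) / 3))   # half-normal draw with mode at 1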
Calculates self.random_profiles, a list of the random soiling profiles realized in the calculation, and self.monte_losses, a list of the @@ -358,47 +609,60 @@ def _calc_monte(self, monte, method='half_norm_clean'): ---------- monte : int number of Monte Carlo simulations to run - method : str, {'half_norm_clean', 'random_clean', 'perfect_clean'} \ - default 'half_norm_clean' + method : str, {'half_norm_clean', 'random_clean', 'perfect_clean', + perfect_clean_complex,inferred_clean_complex} \ + default 'half_norm_clean' + How to treat the recovery of each cleaning event - * 'random_clean' - a random recovery between 0-100% + * 'random_clean' - a random recovery between 0-100%, + pair with piecewise=False and neg_shift=False * 'perfect_clean' - each cleaning event returns the performance - metric to 1 + metric to 1, + pair with piecewise=False and neg_shift=False * 'half_norm_clean' - The starting point of each interval is taken randomly from a half normal distribution with its mode (mu) at 1 and - its sigma equal to 1/3 * (1-b) where b is the intercept - of the fit to the interval. - ''' + its sigma equal to 1/3 * (1-b) where b is the intercept of the fit to + the interval, + pair with piecewise=False and neg_shift=False + *'perfect_clean_complex' - each detected clean event returns the + performance metric to 1 while negative shifts in the data or + piecewise linear fits result in no cleaning, + pair with piecewise=True and neg_shift=True + *'inferred_clean_complex' - at each detected clean event the + performance metric increases based on fits to the data while + negative shifts in the data or piecewise linear fits result in no + cleaning, + pair with piecewise=True and neg_shift=True + """ # Raise a warning if there is >20% invalid data - if (method == 'half_norm_clean') or (method == 'random_clean'): - valid_fraction = self.analyzed_daily_df['valid'].mean() + if ((method == "half_norm_clean") or (method == "random_clean") + or (method == "perfect_clean_complex") or (method == "inferred_clean_complex")): + valid_fraction = self.analyzed_daily_df["valid"].mean() if valid_fraction <= 0.8: warnings.warn('20% or more of the daily data is assigned to invalid soiling ' 'intervals. This can be problematic with the "half_norm_clean" ' 'and "random_clean" cleaning assumptions. Consider more permissive ' 'validity criteria such as increasing "max_relative_slope_error" ' - 'and/or "max_negative_step" and/or decreasing "min_interval_length".' - ' Alternatively, consider using method="perfect_clean". For more' - ' info see https://github.com/NREL/rdtools/issues/272' - ) + 'and/or "max_negative_step" and/or decreasing ' + '"min_interval_length". Alternatively, consider using ' + 'method="perfect_clean". 
For more info see ' + 'https://github.com/NREL/rdtools/issues/272') monte_losses = [] random_profiles = [] for _ in range(monte): results_rand = self.result_df.copy() df_rand = self.analyzed_daily_df.copy() # only really need this column from the original frame: - df_rand = df_rand[['insol', 'run']] - results_rand['run_slope'] = \ - np.random.uniform(results_rand.run_slope_low, - results_rand.run_slope_high) - results_rand['run_loss'] = \ - results_rand.run_slope * results_rand.length + df_rand = df_rand[["insol", "run"]] + results_rand["run_slope"] = np.random.uniform( + results_rand.run_slope_low, results_rand.run_slope_high) + results_rand["run_loss"] = results_rand.run_slope * results_rand.length - results_rand['end_loss'] = np.nan - results_rand['start_loss'] = np.nan + results_rand["end_loss"] = np.nan + results_rand["start_loss"] = np.nan # Make groups that start with a valid interval and contain # subsequent invalid intervals @@ -409,16 +673,19 @@ def _calc_monte(self, monte, method='half_norm_clean'): group += 1 group_list.append(group) - results_rand['group'] = group_list + results_rand["group"] = group_list # randomize the extent of the cleaning inter_start = 1.0 + delta_previous_run_loss = 0 start_list = [] - if (method == 'half_norm_clean') or (method == 'random_clean'): + if (method == "half_norm_clean") or (method == "random_clean"): # Randomize recovery of valid intervals only valid_intervals = results_rand[results_rand.valid].copy() - valid_intervals['inferred_recovery'] = \ - valid_intervals.inferred_recovery.fillna(1.0) + valid_intervals["inferred_recovery"] = np.clip( + valid_intervals.inferred_recovery, 0, 1) + valid_intervals["inferred_recovery"] = valid_intervals.inferred_recovery.fillna( + 1.0) end_list = [] for i, row in valid_intervals.iterrows(): @@ -426,18 +693,18 @@ def _calc_monte(self, monte, method='half_norm_clean'): end = inter_start + row.run_loss end_list.append(end) - if method == 'half_norm_clean': + if method == "half_norm_clean": # Use a half normal with the inferred clean at the # 3sigma point x = np.clip(end + row.inferred_recovery, 0, 1) inter_start = 1 - abs(np.random.normal(0.0, (1 - x) / 3)) - elif method == 'random_clean': + elif method == "random_clean": inter_start = np.random.uniform(end, 1) # Update the valid rows in results_rand valid_update = pd.DataFrame() - valid_update['start_loss'] = start_list - valid_update['end_loss'] = end_list + valid_update["start_loss"] = start_list + valid_update["end_loss"] = end_list valid_update.index = valid_intervals.index results_rand.update(valid_update) @@ -471,49 +738,83 @@ def _calc_monte(self, monte, method='half_norm_clean'): # Update results rand with the invalid rows replace_levels = np.concatenate(replace_levels) invalid_update = pd.DataFrame() - invalid_update['start_loss'] = replace_levels + invalid_update["start_loss"] = replace_levels invalid_update.index = invalid_intervals.index results_rand.update(invalid_update) - elif method == 'perfect_clean': + elif method == "perfect_clean": for i, row in results_rand.iterrows(): start_list.append(inter_start) end = inter_start + row.run_loss inter_start = 1 - results_rand['start_loss'] = start_list + results_rand["start_loss"] = start_list + ################################################################## + # matt additions + + elif method == "perfect_clean_complex": + for i, row in results_rand.iterrows(): + if row.begin_perfect_shift > 0: + inter_start = np.clip( + (inter_start + row.begin_perfect_shift + delta_previous_run_loss), + end, 1) + 
delta_previous_run_loss = -1 * row.run_loss - row.run_loss_baseline + else: + delta_previous_run_loss = ( + delta_previous_run_loss - 1 * row.run_loss - row.run_loss_baseline) + # inter_start=np.clip((inter_start+row.begin_shift+delta_previous_run_loss),0,1) + start_list.append(inter_start) + end = inter_start + row.run_loss + + inter_start = end + results_rand["start_loss"] = start_list + + elif method == "inferred_clean_complex": + for i, row in results_rand.iterrows(): + if row.begin_infer_shift > 0: + inter_start = np.clip( + (inter_start + row.begin_infer_shift + delta_previous_run_loss), + end, 1) + delta_previous_run_loss = -1 * row.run_loss - row.run_loss_baseline + else: + delta_previous_run_loss = ( + delta_previous_run_loss - 1 * row.run_loss - row.run_loss_baseline) + # inter_start=np.clip((inter_start+row.begin_shift+delta_previous_run_loss),0,1) + start_list.append(inter_start) + end = inter_start + row.run_loss + + inter_start = end + results_rand["start_loss"] = start_list + ############################################### else: raise ValueError("Invalid method specification") - df_rand = df_rand.reset_index() \ - .merge(results_rand, how='left', on='run') \ - .set_index('date') - df_rand['loss'] = np.nan - df_rand['days_since_clean'] = \ - (df_rand.index - df_rand.start).dt.days - df_rand['loss'] = df_rand.start_loss + \ - df_rand.days_since_clean * df_rand.run_slope + df_rand = ( + df_rand.reset_index().merge(results_rand, how="left", on="run").set_index("date")) + df_rand["loss"] = np.nan + df_rand["days_since_clean"] = (df_rand.index - df_rand.start).dt.days + df_rand["loss"] = df_rand.start_loss + df_rand.days_since_clean * df_rand.run_slope - df_rand['soil_insol'] = df_rand.loss * df_rand.insol + df_rand["soil_insol"] = df_rand.loss * df_rand.insol soiling_ratio = ( - df_rand.soil_insol.sum() / df_rand.insol[ - ~df_rand.soil_insol.isnull()].sum() - ) + df_rand.soil_insol.sum() / df_rand.insol[~df_rand.soil_insol.isnull()].sum()) monte_losses.append(soiling_ratio) - random_profile = df_rand['loss'].copy() - random_profile.name = 'stochastic_soiling_profile' + random_profile = df_rand["loss"].copy() + random_profile.name = "stochastic_soiling_profile" random_profiles.append(random_profile) self.random_profiles = random_profiles self.monte_losses = monte_losses - def run(self, reps=1000, day_scale=13, clean_threshold='infer', - trim=False, method='half_norm_clean', - clean_criterion='shift', precip_threshold=0.01, min_interval_length=7, - exceedance_prob=95.0, confidence_level=68.2, recenter=True, - max_relative_slope_error=500.0, max_negative_step=0.05, outlier_factor=1.5): - ''' + ####################################################################### + # add neg_shift and piecewise to the following def/Matt + def run(self, reps=1000, day_scale=13, clean_threshold="infer", trim=False, + method="half_norm_clean", clean_criterion="shift", precip_threshold=0.01, + min_interval_length=7, exceedance_prob=95.0, confidence_level=68.2, recenter=True, + max_relative_slope_error=500.0, max_negative_step=0.05, outlier_factor=1.5, + neg_shift=False, piecewise=False): + """ Run the SRR method from beginning to end. Perform the stochastic rate and recovery soiling loss calculation. Based on the methods presented in Deceglie et al. 
"Quantifying Soiling Loss Directly From PV Yield" @@ -534,17 +835,31 @@ def run(self, reps=1000, day_scale=13, clean_threshold='infer', trim : bool, default False Whether to trim (remove) the first and last soiling intervals to avoid inclusion of partial intervals - method : str, {'half_norm_clean', 'random_clean', 'perfect_clean'} \ - default 'half_norm_clean' + method : str, {'half_norm_clean', 'random_clean', 'perfect_clean', + perfect_clean_complex,inferred_clean_complex} \ + default 'perfect_clean_complex' + How to treat the recovery of each cleaning event - * 'random_clean' - a random recovery between 0-100% + * 'random_clean' - a random recovery between 0-100%, + pair with piecewise=False and neg_shift=False * 'perfect_clean' - each cleaning event returns the performance - metric to 1 + metric to 1, + pair with piecewise=False and neg_shift=False * 'half_norm_clean' - The starting point of each interval is taken randomly from a half normal distribution with its mode (mu) at 1 and its sigma equal to 1/3 * (1-b) where b is the intercept of the fit to - the interval. + the interval, + pair with piecewise=False and neg_shift=False + * 'perfect_clean_complex' - each detected clean event returns the + performance metric to 1 while negative shifts in the data or + piecewise linear fits result in no cleaning, + pair with piecewise=True and neg_shift=True + * 'inferred_clean_complex' - at each detected clean event the + performance metric increases based on fits to the data while + negative shifts in the data or piecewise linear fits result in no + cleaning, + pair with piecewise=True and neg_shift=True clean_criterion : str, {'shift', 'precip_and_shift', 'precip_or_shift', 'precip'} \ default 'shift' The method of partitioning the dataset into soiling intervals @@ -581,6 +896,18 @@ def run(self, reps=1000, day_scale=13, clean_threshold='infer', The factor used in the Tukey fence definition of outliers for flagging positive shifts in the rolling median used for cleaning detection. A smaller value will cause more and smaller shifts to be classified as cleaning events. + neg_shift : bool, default True + where True results in additional subdividing of soiling intervals + when negative shifts are found in the rolling median of the performance + metric. Inferred corrections in the soiling fit are made at these + negative shifts. False results in no additional subdivides of the + data where excessive negative shifts can invalidate a soiling interval. + piecewise : bool, default True + where True results in each soiling interval of sufficient length + being tested for significant fit improvement with 2 piecewise linear + fits. If the criteria of significance is met the soiling interval is + subdivided into the 2 separate intervals. False results in no + piecewise fit being tested. 
Returns ------- @@ -634,59 +961,58 @@ def run(self, reps=1000, day_scale=13, clean_threshold='infer', | | be treated as a valid soiling interval | +------------------------+----------------------------------------------+ - ''' - self._calc_daily_df(day_scale=day_scale, - clean_threshold=clean_threshold, - recenter=recenter, - clean_criterion=clean_criterion, - precip_threshold=precip_threshold, - outlier_factor=outlier_factor) - self._calc_result_df(trim=trim, - max_relative_slope_error=max_relative_slope_error, + """ + self._calc_daily_df(day_scale=day_scale, clean_threshold=clean_threshold, + recenter=recenter, clean_criterion=clean_criterion, + precip_threshold=precip_threshold, outlier_factor=outlier_factor, + neg_shift=neg_shift, piecewise=piecewise) + + self._calc_result_df(trim=trim, max_relative_slope_error=max_relative_slope_error, max_negative_step=max_negative_step, - min_interval_length=min_interval_length) + min_interval_length=min_interval_length, neg_shift=neg_shift, + piecewise=piecewise) + self._calc_monte(reps, method=method) # Calculate the P50 and confidence interval half_ci = confidence_level / 2.0 result = np.percentile(self.monte_losses, - [50, - 50.0 - half_ci, - 50.0 + half_ci, - 100 - exceedance_prob]) + [50, 50.0 - half_ci, 50.0 + half_ci, 100 - exceedance_prob]) P_level = result[3] # Construct calc_info output - + ############################################### + # add inferred_recovery, inferred_begin_shift /Matt + ############################################### intervals_out = self.result_df[ - ['start', 'end', 'run_slope', 'run_slope_low', - 'run_slope_high', 'inferred_start_loss', 'inferred_end_loss', - 'length', 'valid']].copy() - intervals_out.rename(columns={'run_slope': 'soiling_rate', - 'run_slope_high': 'soiling_rate_high', - 'run_slope_low': 'soiling_rate_low', - }, inplace=True) + ["start", "end", "run_slope", "run_slope_low", "run_slope_high", "inferred_start_loss", + "inferred_end_loss", "inferred_recovery", "inferred_begin_shift", "length", "valid"] + ].copy() + intervals_out.rename(columns={"run_slope": "soiling_rate", + "run_slope_high": "soiling_rate_high", + "run_slope_low": "soiling_rate_low"}, inplace=True) df_d = self.analyzed_daily_df - sr_perfect = df_d[df_d['valid']]['loss_perfect_clean'] - calc_info = { - 'exceedance_level': P_level, - 'renormalizing_factor': self.renorm_factor, - 'stochastic_soiling_profiles': self.random_profiles, - 'soiling_interval_summary': intervals_out, - 'soiling_ratio_perfect_clean': sr_perfect - } + # sr_perfect = df_d[df_d['valid']]['loss_perfect_clean'] + sr_perfect = df_d.loss_perfect_clean + + calc_info = {"exceedance_level": P_level, "renormalizing_factor": self.renorm_factor, + "stochastic_soiling_profiles": self.random_profiles, + "soiling_interval_summary": intervals_out, + "soiling_ratio_perfect_clean": sr_perfect} return (result[0], result[1:3], calc_info) -def soiling_srr(energy_normalized_daily, insolation_daily, reps=1000, - precipitation_daily=None, day_scale=13, clean_threshold='infer', - trim=False, method='half_norm_clean', - clean_criterion='shift', precip_threshold=0.01, min_interval_length=7, +# more updates are needed for documentation but added additional inputs +# that are in srr.run /Matt +def soiling_srr(energy_normalized_daily, insolation_daily, reps=1000, precipitation_daily=None, + day_scale=13, clean_threshold="infer", trim=False, method="half_norm_clean", + clean_criterion="shift", precip_threshold=0.01, min_interval_length=7, exceedance_prob=95.0, confidence_level=68.2, 
recenter=True, - max_relative_slope_error=500.0, max_negative_step=0.05, outlier_factor=1.5): - ''' + max_relative_slope_error=500.0, max_negative_step=0.05, outlier_factor=1.5, + neg_shift=False, piecewise=False): + """ Functional wrapper for :py:class:`~rdtools.soiling.SRRAnalysis`. Perform the stochastic rate and recovery soiling loss calculation. Based on the methods presented in Deceglie et al. JPV 8(2) p547 2018. @@ -718,17 +1044,31 @@ def soiling_srr(energy_normalized_daily, insolation_daily, reps=1000, trim : bool, default False Whether to trim (remove) the first and last soiling intervals to avoid inclusion of partial intervals - method : str, {'half_norm_clean', 'random_clean', 'perfect_clean'} \ + method : str, {'half_norm_clean', 'random_clean', 'perfect_clean', + perfect_clean_complex,inferred_clean_complex} \ default 'half_norm_clean' + How to treat the recovery of each cleaning event - * 'random_clean' - a random recovery between 0-100% + * 'random_clean' - a random recovery between 0-100%, + pair with piecewise=False and neg_shift=False * 'perfect_clean' - each cleaning event returns the performance - metric to 1 + metric to 1, + pair with piecewise=False and neg_shift=False * 'half_norm_clean' - The starting point of each interval is taken randomly from a half normal distribution with its mode (mu) at 1 and its sigma equal to 1/3 * (1-b) where b is the intercept of the fit to - the interval. + the interval, + pair with piecewise=False and neg_shift=False + *'perfect_clean_complex' - each detected clean event returns the + performance metric to 1 while negative shifts in the data or + piecewise linear fits result in no cleaning, + pair with piecewise=True and neg_shift=True + *'inferred_clean_complex' - at each detected clean event the + performance metric increases based on fits to the data while + negative shifts in the data or piecewise linear fits result in no + cleaning, + pair with piecewise=True and neg_shift=True clean_criterion : str, {'shift', 'precip_and_shift', 'precip_or_shift', 'precip'} \ default 'shift' The method of partitioning the dataset into soiling intervals @@ -764,6 +1104,18 @@ def soiling_srr(energy_normalized_daily, insolation_daily, reps=1000, The factor used in the Tukey fence definition of outliers for flagging positive shifts in the rolling median used for cleaning detection. A smaller value will cause more and smaller shifts to be classified as cleaning events. + neg_shift : bool, default True + where True results in additional subdividing of soiling intervals + when negative shifts are found in the rolling median of the performance + metric. Inferred corrections in the soiling fit are made at these + negative shifts. False results in no additional subdivides of the + data where excessive negative shifts can invalidate a soiling interval. + piecewise : bool, default True + where True results in each soiling interval of sufficient length + being tested for significant fit improvement with 2 piecewise linear + fits. If the criteria of significance is met the soiling interval is + subdivided into the 2 separate intervals. False results in no + piecewise fit being tested. 
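Putting the new keyword arguments together, a usage sketch of the wrapper with the complex cleaning assumption; ``energy_normalized_daily`` and ``insolation_daily`` are assumed to be index-aligned daily series:

from rdtools.soiling import soiling_srr

sr, sr_ci, soiling_info = soiling_srr(
    energy_normalized_daily, insolation_daily,
    method="perfect_clean_complex",   # or "inferred_clean_complex"
    piecewise=True, neg_shift=True,   # the pairing recommended above
    reps=1000)

intervals = soiling_info["soiling_interval_summary"]   # now also carries inferred_recovery
profile = soiling_info["soiling_ratio_perfect_clean"]  # daily modeled soiling ratio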
Returns ------- @@ -816,34 +1168,24 @@ def soiling_srr(energy_normalized_daily, insolation_daily, reps=1000, | 'valid' | Whether the interval meets the criteria to | | | be treated as a valid soiling interval | +------------------------+----------------------------------------------+ - ''' + """ - srr = SRRAnalysis(energy_normalized_daily, - insolation_daily, + srr = SRRAnalysis(energy_normalized_daily, insolation_daily, precipitation_daily=precipitation_daily) sr, sr_ci, soiling_info = srr.run( - reps=reps, - day_scale=day_scale, - clean_threshold=clean_threshold, - trim=trim, - method=method, - clean_criterion=clean_criterion, - precip_threshold=precip_threshold, - min_interval_length=min_interval_length, - exceedance_prob=exceedance_prob, - confidence_level=confidence_level, - recenter=recenter, - max_relative_slope_error=max_relative_slope_error, - max_negative_step=max_negative_step, - outlier_factor=outlier_factor) - + reps=reps, day_scale=day_scale, clean_threshold=clean_threshold, trim=trim, + method=method, clean_criterion=clean_criterion, precip_threshold=precip_threshold, + min_interval_length=min_interval_length, exceedance_prob=exceedance_prob, + confidence_level=confidence_level, recenter=recenter, + max_relative_slope_error=max_relative_slope_error, max_negative_step=max_negative_step, + outlier_factor=outlier_factor, neg_shift=neg_shift, piecewise=piecewise) return sr, sr_ci, soiling_info def _count_month_days(start, end): - '''Return a dict of number of days between start and end - (inclusive) in each month''' + """Return a dict of number of days between start and end + (inclusive) in each month""" days = pd.date_range(start, end) months = [x.month for x in days] out_dict = {} @@ -853,9 +1195,8 @@ def _count_month_days(start, end): return out_dict -def annual_soiling_ratios(stochastic_soiling_profiles, - insolation_daily, confidence_level=68.2): - ''' +def annual_soiling_ratios(stochastic_soiling_profiles, insolation_daily, confidence_level=68.2): + """ Return annualized soiling ratios and associated confidence intervals based on stochastic soiling profiles from SRR. Note that each year may be affected by previous years' profiles for all SRR cleaning @@ -895,18 +1236,17 @@ def annual_soiling_ratios(stochastic_soiling_profiles, | | for insolation-weighted soiling ratio for | | | the year | +------------------------+-------------------------------------------+ - ''' + """ # Create a df with each realization as a column all_profiles = pd.concat(stochastic_soiling_profiles, axis=1) all_profiles = all_profiles.dropna() if not all_profiles.index.isin(insolation_daily.index).all(): - warnings.warn( - 'The indexes of stochastic_soiling_profiles are not entirely ' - 'contained within the index of insolation_daily. Every day in ' - 'stochastic_soiling_profiles should be represented in ' - 'insolation_daily. This may cause erroneous results.') + warnings.warn("The indexes of stochastic_soiling_profiles are not entirely " + "contained within the index of insolation_daily. Every day in " + "stochastic_soiling_profiles should be represented in " + "insolation_daily. 
This may cause erroneous results.") insolation_daily = insolation_daily.reindex(all_profiles.index) @@ -914,30 +1254,26 @@ def annual_soiling_ratios(stochastic_soiling_profiles, all_profiles_weighted = all_profiles.multiply(insolation_daily, axis=0) # Compute the insolation-weighted soiling ratio (IWSR) for each realization - annual_insolation = insolation_daily.groupby( - insolation_daily.index.year).sum() + annual_insolation = insolation_daily.groupby(insolation_daily.index.year).sum() all_annual_weighted_sums = all_profiles_weighted.groupby( - all_profiles_weighted.index.year).sum() - all_annual_iwsr = all_annual_weighted_sums.multiply( - 1/annual_insolation, axis=0) - - annual_soiling = pd.DataFrame({ - 'soiling_ratio_median': all_annual_iwsr.quantile(0.5, axis=1), - 'soiling_ratio_low': all_annual_iwsr.quantile( - 0.5 - confidence_level/2/100, axis=1), - 'soiling_ratio_high': all_annual_iwsr.quantile( - 0.5 + confidence_level/2/100, axis=1), - }) - annual_soiling.index.name = 'year' + all_profiles_weighted.index.year + ).sum() + all_annual_iwsr = all_annual_weighted_sums.multiply(1 / annual_insolation, axis=0) + + annual_soiling = pd.DataFrame( + {"soiling_ratio_median": all_annual_iwsr.quantile(0.5, axis=1), + "soiling_ratio_low": all_annual_iwsr.quantile(0.5 - confidence_level / 2 / 100, axis=1), + "soiling_ratio_high": all_annual_iwsr.quantile(0.5 + confidence_level / 2 / 100, axis=1), + }) + annual_soiling.index.name = "year" annual_soiling = annual_soiling.reset_index() return annual_soiling def monthly_soiling_rates(soiling_interval_summary, min_interval_length=14, - max_relative_slope_error=500.0, reps=100000, - confidence_level=68.2): - ''' + max_relative_slope_error=500.0, reps=100000, confidence_level=68.2): + """ Use Monte Carlo to calculate typical monthly soiling rates. Samples possible soiling rates from soiling rate confidence intervals associated with soiling intervals assuming a uniform @@ -1002,75 +1338,67 @@ def monthly_soiling_rates(soiling_interval_summary, min_interval_length=14, | | intervals contribute, the confidence interval | | | is likely to underestimate the true uncertainty. 
| +-----------------------+--------------------------------------------------+ - ''' + """ # filter to intervals of interest - high = soiling_interval_summary['soiling_rate_high'] - low = soiling_interval_summary['soiling_rate_low'] - rate = soiling_interval_summary['soiling_rate'] + high = soiling_interval_summary["soiling_rate_high"] + low = soiling_interval_summary["soiling_rate_low"] + rate = soiling_interval_summary["soiling_rate"] rel_error = 100 * abs((high - low) / rate) intervals = soiling_interval_summary[ - (soiling_interval_summary['length'] >= min_interval_length) & - (soiling_interval_summary['valid']) & - (rel_error <= max_relative_slope_error) - ].copy() + (soiling_interval_summary["length"] >= min_interval_length) + & (soiling_interval_summary["valid"]) & (rel_error <= max_relative_slope_error)].copy() # count the overlap of each interval with each month month_counts = [] for _, row in intervals.iterrows(): - month_counts.append(_count_month_days(row['start'], row['end'])) + month_counts.append(_count_month_days(row["start"], row["end"])) # divy up the monte carlo reps based on overlap for month in range(1, 13): days_in_month = np.array([x[month] for x in month_counts]) - sample_col = f'samples_for_month_{month}' + sample_col = f"samples_for_month_{month}" if days_in_month.sum() > 0: - intervals[sample_col] = np.ceil( - days_in_month / days_in_month.sum() * reps) + intervals[sample_col] = np.ceil(days_in_month / days_in_month.sum() * reps) else: intervals[sample_col] = 0 intervals[sample_col] = intervals[sample_col].astype(int) # perform the monte carlo month by month - ci_quantiles = [0.5 - confidence_level/2/100, 0.5 + confidence_level/2/100] + ci_quantiles = [0.5 - confidence_level / 2 / 100, 0.5 + confidence_level / 2 / 100] monthly_rate_data = [] relevant_interval_count = [] for month in range(1, 13): rates = [] - sample_col = f'samples_for_month_{month}' + sample_col = f"samples_for_month_{month}" relevant_intervals = intervals[intervals[sample_col] > 0] for _, row in relevant_intervals.iterrows(): rates.append(np.random.uniform( - row['soiling_rate_low'], - row['soiling_rate_high'], - row[sample_col])) + row["soiling_rate_low"], row["soiling_rate_high"], row[sample_col])) rates = [x for sublist in rates for x in sublist] if rates: - monthly_rate_data.append(np.quantile(rates, - [0.5, ci_quantiles[0], - ci_quantiles[1]])) + monthly_rate_data.append(np.quantile(rates, [0.5, ci_quantiles[0], ci_quantiles[1]])) else: - monthly_rate_data.append(np.array([np.nan]*3)) + monthly_rate_data.append(np.array([np.nan] * 3)) relevant_interval_count.append(len(relevant_intervals)) monthly_rate_data = np.array(monthly_rate_data) # make a dataframe out of the results - monthly_soiling_df = pd.DataFrame(data=monthly_rate_data, - columns=['soiling_rate_median', - 'soiling_rate_low', - 'soiling_rate_high']) - monthly_soiling_df.insert(0, 'month', range(1, 13)) - monthly_soiling_df['interval_count'] = relevant_interval_count + monthly_soiling_df = pd.DataFrame( + data=monthly_rate_data, + columns=["soiling_rate_median", "soiling_rate_low", "soiling_rate_high"]) + monthly_soiling_df.insert(0, "month", range(1, 13)) + monthly_soiling_df["interval_count"] = relevant_interval_count return monthly_soiling_df -class CODSAnalysis(): - ''' +class CODSAnalysis: + """ Container for the Combined Degradation and Soiling (CODS) algorithm for degradation and soiling loss analysis. Based on the method presented in [1]_. @@ -1166,7 +1494,7 @@ class CODSAnalysis(): ---------- .. [1] Skomedal, Å. 
and Deceglie, M. G., IEEE Journal of Photovoltaics, Sept. 2020. https://doi.org/10.1109/JPHOTOV.2020.3018219 - ''' + """ def __init__(self, energy_normalized_daily): self.pm = energy_normalized_daily # daily performance metric @@ -1175,18 +1503,17 @@ def __init__(self, energy_normalized_daily): first_keeper = self.pm.isna().idxmin() self.pm = self.pm.loc[first_keeper:] - if self.pm.index.freq != 'D': - raise ValueError('Daily performance metric series must have ' - 'daily frequency (missing dates should be ' - 'represented by NaNs)') + if self.pm.index.freq != "D": + raise ValueError("Daily performance metric series must have " + "daily frequency (missing dates should be " + "represented by NaNs)") def iterative_signal_decomposition( - self, order=('SR', 'SC', 'Rd'), degradation_method='YoY', - max_iterations=18, cleaning_sensitivity=.5, convergence_criterion=5e-3, - pruning_iterations=1, clean_pruning_sensitivity=.6, soiling_significance=.75, - process_noise=1e-4, renormalize_SR=None, ffill=True, clip_soiling=True, - verbose=False): - ''' + self, order=("SR", "SC", "Rd"), degradation_method="YoY", max_iterations=18, + cleaning_sensitivity=0.5, convergence_criterion=5e-3, pruning_iterations=1, + clean_pruning_sensitivity=0.6, soiling_significance=0.75, process_noise=1e-4, + renormalize_SR=None, ffill=True, clip_soiling=True, verbose=False): + """ Estimates the soiling losses and the degradation rate of a PV system based on its daily normalized energy, or daily Performance Index (PI). The underlying assumption is that the PI @@ -1325,14 +1652,14 @@ def iterative_signal_decomposition( .. [3] Skomedal, Å. and Deceglie, M. G., IEEE Journal of Photovoltaics, Sept. 2020. https://doi.org/10.1109/JPHOTOV.2020.3018219 - ''' + """ pi = self.pm.copy() - if degradation_method == 'STL' and 'Rd' in order: - order = tuple([c for c in order if c != 'Rd']) + if degradation_method == "STL" and "Rd" in order: + order = tuple([c for c in order if c != "Rd"]) - if 'SR' not in order: - raise ValueError('\'SR\' must be in argument \'order\' ' + - '(e.g. order=[\'SR\', \'SC\', \'Rd\']') + if "SR" not in order: + raise ValueError( + "'SR' must be in argument 'order' " + "(e.g. order=['SR', 'SC', 'Rd']") n_steps = len(order) day = np.arange(len(pi)) degradation_trend = [1] @@ -1345,143 +1672,125 @@ def iterative_signal_decomposition( convergence_metric = [_RMSE(pi, np.ones((len(pi),)))] # Find possible cleaning events based on the performance index - ce, rm9 = _rolling_median_ce_detection(pi.index, pi, ffill=ffill, - tuner=cleaning_sensitivity) + ce, rm9 = _rolling_median_ce_detection( + pi.index, pi, ffill=ffill, tuner=cleaning_sensitivity) pce = _collapse_cleaning_events(ce, rm9.diff().values, 5) small_soiling_signal, perfect_cleaning = False, True ic = 0 # iteration counter if verbose: - print('It. nr\tstep\tRMSE\ttimer') + print("It. 
nr\tstep\tRMSE\ttimer") if verbose: - print('{:}\t- \t{:.5f}'.format(ic, convergence_metric[ic])) + print("{:}\t- \t{:.5f}".format(ic, convergence_metric[ic])) while ic < max_iterations: t0 = time.time() ic += 1 # Find soiling component - if order[(ic-1) % n_steps] == 'SR': + if order[(ic - 1) % n_steps] == "SR": if ic > 2: # Add possible cleaning events found by considering # the residuals pce = soiling_dfs[-1].cleaning_events.copy() cleaning_sensitivity *= 1.2 # decrease sensitivity ce, rm9 = _rolling_median_ce_detection( - pi.index, residuals, ffill=ffill, - tuner=cleaning_sensitivity) + pi.index, residuals, ffill=ffill, tuner=cleaning_sensitivity) ce = _collapse_cleaning_events(ce, rm9.diff().values, 5) pce[ce] = True clean_pruning_sensitivity /= 1.1 # increase pruning sensitivity # Decompose input signal - soiling_dummy = (pi / - degradation_trend[-1] / - seasonal_component[-1] / - residual_shift) + soiling_dummy = ( + pi / degradation_trend[-1] / seasonal_component[-1] / residual_shift) # Run Kalman Filter for obtaining soiling component kdf, Ps = self._Kalman_filter_for_SR( - zs_series=soiling_dummy, - clip_soiling=clip_soiling, - prescient_cleaning_events=pce, - pruning_iterations=pruning_iterations, + zs_series=soiling_dummy, clip_soiling=clip_soiling, + prescient_cleaning_events=pce, pruning_iterations=pruning_iterations, clean_pruning_sensitivity=clean_pruning_sensitivity, - perfect_cleaning=perfect_cleaning, - process_noise=process_noise, + perfect_cleaning=perfect_cleaning, process_noise=process_noise, renormalize_SR=renormalize_SR) soiling_ratio.append(kdf.soiling_ratio) soiling_dfs.append(kdf) # Find seasonal component - if order[(ic-1) % n_steps] == 'SC': + if order[(ic - 1) % n_steps] == "SC": season_dummy = pi / soiling_ratio[-1] # Decompose signal if season_dummy.isna().sum() > 0: - season_dummy.interpolate('linear', inplace=True) + season_dummy.interpolate("linear", inplace=True) season_dummy = season_dummy.apply(np.log) # Log transform # Run STL model - STL_res = STL(season_dummy, period=365, seasonal=999999, - seasonal_deg=0, trend_deg=0, - robust=True, low_pass_jump=30, seasonal_jump=30, - trend_jump=365).fit() + STL_res = STL( + season_dummy, period=365, seasonal=999999, seasonal_deg=0, trend_deg=0, + robust=True, low_pass_jump=30, seasonal_jump=30, trend_jump=365).fit() # Smooth result - smooth_season = lowess(STL_res.seasonal.apply(np.exp), - pi.index, is_sorted=True, delta=30, - frac=180/len(pi), return_sorted=False) + smooth_season = lowess( + STL_res.seasonal.apply(np.exp), pi.index, is_sorted=True, delta=30, + frac=180 / len(pi), return_sorted=False) # Ensure periodic seaonal component - seasonal_comp = _force_periodicity(smooth_season, - season_dummy.index, - pi.index) + seasonal_comp = _force_periodicity(smooth_season, season_dummy.index, pi.index) seasonal_component.append(seasonal_comp) - if degradation_method == 'STL': # If not YoY - deg_trend = pd.Series(index=pi.index, - data=STL_res.trend.apply(np.exp)) + if degradation_method == "STL": # If not YoY + deg_trend = pd.Series(index=pi.index, data=STL_res.trend.apply(np.exp)) degradation_trend.append(deg_trend / deg_trend.iloc[0]) yoy_save.append(RdToolsDeg.degradation_year_on_year( - degradation_trend[-1], uncertainty_method=None)) + degradation_trend[-1], uncertainty_method=None)) # Find degradation component - if order[(ic-1) % n_steps] == 'Rd': + if order[(ic - 1) % n_steps] == "Rd": # Decompose signal - trend_dummy = (pi / - seasonal_component[-1] / - soiling_ratio[-1]) + trend_dummy = pi / 
seasonal_component[-1] / soiling_ratio[-1] # Run YoY - yoy = RdToolsDeg.degradation_year_on_year( - trend_dummy, uncertainty_method=None) + yoy = RdToolsDeg.degradation_year_on_year(trend_dummy, uncertainty_method=None) # Convert degradation rate to trend - degradation_trend.append(pd.Series( - index=pi.index, data=(1 + day * yoy / 100 / 365.0))) + degradation_trend.append( + pd.Series(index=pi.index, data=(1 + day * yoy / 100 / 365.0))) yoy_save.append(yoy) # Combine and calculate residual flatness - total_model = (degradation_trend[-1] * - seasonal_component[-1] * - soiling_ratio[-1]) + total_model = degradation_trend[-1] * seasonal_component[-1] * soiling_ratio[-1] residuals = pi / total_model residual_shift = residuals.mean() total_model *= residual_shift convergence_metric.append(_RMSE(pi, total_model)) if verbose: - print('{:}\t{:}\t{:.5f}\t\t\t{:.1f} s'.format( - ic, order[(ic-1) % n_steps], convergence_metric[-1], - time.time()-t0)) + print("{:}\t{:}\t{:.5f}\t\t\t{:.1f} s".format( + ic, order[(ic - 1) % n_steps], convergence_metric[-1], time.time() - t0)) # Convergence happens if there is no improvement in RMSE from one # step to the next if ic >= n_steps: - relative_improvement = ((convergence_metric[-n_steps-1] - - convergence_metric[-1]) / - convergence_metric[-n_steps-1]) - if perfect_cleaning and ( - ic >= max_iterations / 2 or - relative_improvement < convergence_criterion): + relative_improvement = (convergence_metric[-n_steps - 1] - convergence_metric[-1] + ) / convergence_metric[-n_steps - 1] + if perfect_cleaning and (ic >= max_iterations / 2 or + relative_improvement < convergence_criterion): # From now on, do not assume perfect cleaning perfect_cleaning = False # Reorder to ensure SR first - order = tuple([order[(i+n_steps-1-(ic-1) % n_steps) % n_steps] - for i in range(n_steps)]) + order = tuple( + [order[(i + n_steps - 1 - (ic - 1) % n_steps) % n_steps] + for i in range(n_steps)]) change_point = ic if verbose: - print('Now not assuming perfect cleaning') - elif (not perfect_cleaning and - (ic >= max_iterations or - (ic >= change_point + n_steps and - relative_improvement < - convergence_criterion))): + print("Now not assuming perfect cleaning") + elif not perfect_cleaning and ( + ic >= max_iterations + or (ic >= change_point + n_steps + and relative_improvement < convergence_criterion)): if verbose: if relative_improvement < convergence_criterion: - print('Convergence reached.') + print("Convergence reached.") else: - print('Max iterations reached.') + print("Max iterations reached.") ic = max_iterations # Initialize output DataFrame - df_out = pd.DataFrame(index=pi.index, - columns=['soiling_ratio', 'soiling_rates', - 'cleaning_events', 'seasonal_component', - 'degradation_trend', 'total_model', - 'residuals']) + df_out = pd.DataFrame( + index=pi.index, + columns=[ + "soiling_ratio", "soiling_rates", "cleaning_events", "seasonal_component", + "degradation_trend", "total_model", "residuals"]) # Save values df_out.seasonal_component = seasonal_component[-1] @@ -1496,39 +1805,33 @@ def iterative_signal_decomposition( soiling_loss = (1 - df_out.soiling_ratio).mean() * 100 # Total model - df_out.total_model = (df_out.soiling_ratio * - df_out.seasonal_component * - df_out.degradation_trend) + df_out.total_model = ( + df_out.soiling_ratio * df_out.seasonal_component * df_out.degradation_trend) df_out.residuals = pi / df_out.total_model residual_shift = df_out.residuals.mean() df_out.total_model *= residual_shift RMSE = _RMSE(pi, df_out.total_model) - adf_res = 
adfuller(df_out.residuals.dropna(), regression='ctt', autolag=None) + adf_res = adfuller(df_out.residuals.dropna(), regression="ctt", autolag=None) if verbose: - print('p-value for the H0 that there is a unit root in the' + - 'residuals (using the Augmented Dickey-fuller test):' + - '{:.3e}'.format(adf_res[1])) + print("p-value for the H0 that there is a unit root in the" + + "residuals (using the Augmented Dickey-fuller test):" + + "{:.3e}".format(adf_res[1])) # Check size of soiling signal vs residuals - SR_amp = float(np.diff(df_out.soiling_ratio.quantile([.1, .9]))) - residuals_amp = float(np.diff(df_out.residuals.quantile([.1, .9]))) + SR_amp = float(np.diff(df_out.soiling_ratio.quantile([0.1, 0.9]))) + residuals_amp = float(np.diff(df_out.residuals.quantile([0.1, 0.9]))) soiling_signal_strength = SR_amp / residuals_amp if soiling_signal_strength < soiling_significance: if verbose: - print('Soiling signal is small relative to the noise') + print("Soiling signal is small relative to the noise") small_soiling_signal = True df_out.SR_high = 1.0 df_out.SR_low = 1.0 - SR_amp # Set up results dictionary - results_dict = dict( - degradation=degradation, - soiling_loss=soiling_loss, - residual_shift=residual_shift, - RMSE=RMSE, - small_soiling_signal=small_soiling_signal, - adf_res=adf_res - ) + results_dict = dict(degradation=degradation, soiling_loss=soiling_loss, + residual_shift=residual_shift, RMSE=RMSE, + small_soiling_signal=small_soiling_signal, adf_res=adf_res) return df_out, results_dict @@ -1545,7 +1848,7 @@ def run_bootstrap(self, verbose=False, bootstrap_seed=None, **kwargs): - ''' + """ Bootstrapping of CODS algorithm for uncertainty analysis, inherently accounting for model and parameter choices. @@ -1668,7 +1971,7 @@ def run_bootstrap(self, ---------- .. [1] Skomedal, Å. and Deceglie, M. G., IEEE Journal of Photovoltaics, Sept. 2020. 
https://doi.org/10.1109/JPHOTOV.2020.3018219 - ''' + """ pi = self.pm.copy() # ###################### # @@ -1676,14 +1979,13 @@ def run_bootstrap(self, # ###################### # # Generate combinations of model parameter alternatives - parameter_alternatives = [order_alternatives, - cleaning_sensitivity_alternatives, - clean_pruning_sensitivity_alternatives, - forward_fill_alternatives] + parameter_alternatives = [ + order_alternatives, cleaning_sensitivity_alternatives, + clean_pruning_sensitivity_alternatives, forward_fill_alternatives] index_list = list(itertools.product([0, 1], repeat=len(parameter_alternatives))) - combination_of_parameters = [[parameter_alternatives[j][indexes[j]] - for j in range(len(parameter_alternatives))] - for indexes in index_list] + combination_of_parameters = [ + [parameter_alternatives[j][indexes[j]] for j in range(len(parameter_alternatives))] + for indexes in index_list] nr_models = len(index_list) bootstrap_samples_list, list_of_df_out, results = [], [], [] @@ -1692,24 +1994,24 @@ def run_bootstrap(self, reps += nr_models - reps % nr_models if verbose: - print('Initially fitting {:} models'.format(nr_models)) + print("Initially fitting {:} models".format(nr_models)) t00 = time.time() # For each combination of model parameter alternatives, fit one model: for c, (order, dt, pt, ff) in enumerate(combination_of_parameters): try: df_out, result_dict = self.iterative_signal_decomposition( - max_iterations=18, order=order, clip_soiling=True, - cleaning_sensitivity=dt, pruning_iterations=1, - clean_pruning_sensitivity=pt, process_noise=process_noise, ffill=ff, - degradation_method=degradation_method, **kwargs) + max_iterations=18, order=order, clip_soiling=True, cleaning_sensitivity=dt, + pruning_iterations=1, clean_pruning_sensitivity=pt, + process_noise=process_noise, ffill=ff, degradation_method=degradation_method, + **kwargs) # Save results list_of_df_out.append(df_out) results.append(result_dict) - adf = result_dict['adf_res'] + adf = result_dict["adf_res"] # If we can reject the null-hypothesis that there is a unit # root in the residuals: - if adf[1] < .05: + if adf[1] < 0.05: # ... 
generate bootstrap samples based on the fit: bootstrap_samples_list.append( _make_time_series_bootstrap_samples( @@ -1719,42 +2021,38 @@ def run_bootstrap(self, # Print progress if verbose: - _progressBarWithETA(c+1, nr_models, time.time()-t00, - bar_length=30) + _progressBarWithETA(c + 1, nr_models, time.time() - t00, bar_length=30) except ValueError as ex: print(ex) # Revive results - adfs = np.array([(r['adf_res'][0] if r['adf_res'][1] < 0.05 else 0) for r in results]) - RMSEs = np.array([r['RMSE'] for r in results]) - SR_is_one_fraction = np.array( - [(df.soiling_ratio == 1).mean() for df in list_of_df_out]) - small_soiling_signal = [r['small_soiling_signal'] for r in results] + adfs = np.array([(r["adf_res"][0] if r["adf_res"][1] < 0.05 else 0) for r in results]) + RMSEs = np.array([r["RMSE"] for r in results]) + SR_is_one_fraction = np.array([(df.soiling_ratio == 1).mean() for df in list_of_df_out]) + small_soiling_signal = [r["small_soiling_signal"] for r in results] # Calculate weights weights = 1 / RMSEs / (1 + SR_is_one_fraction) weights /= np.sum(weights) # Save sensitivities and weights for initial model fits - _parameters_n_weights = pd.concat([pd.DataFrame(combination_of_parameters), - pd.Series(RMSEs), - pd.Series(SR_is_one_fraction), - pd.Series(weights), - pd.Series(small_soiling_signal)], - axis=1, ignore_index=True) + _parameters_n_weights = pd.concat( + [pd.DataFrame(combination_of_parameters), pd.Series(RMSEs), + pd.Series(SR_is_one_fraction), pd.Series(weights), pd.Series(small_soiling_signal)], + axis=1, ignore_index=True) if verbose: # Print summary - _parameters_n_weights.columns = ['order', 'dt', 'pt', 'ff', 'RMSE', - 'SR==1', 'weights', 'small_soiling_signal'] + _parameters_n_weights.columns = [ + "order", "dt", "pt", "ff", "RMSE", "SR==1", "weights", "small_soiling_signal"] if verbose: - print('\n', _parameters_n_weights) + print("\n", _parameters_n_weights) # Check if data is decomposable if np.sum(adfs == 0) > nr_models / 2: raise RuntimeError( - 'Test for stationary residuals (Augmented Dickey-Fuller' - + ' test) not passed in half of the instances:\nData not' - + ' decomposable.') + "Test for stationary residuals (Augmented Dickey-Fuller" + + " test) not passed in half of the instances:\nData not" + + " decomposable.") # Save best model self.initial_fits = [df for df in list_of_df_out] @@ -1764,83 +2062,76 @@ def run_bootstrap(self, # don't do bootstrapping if np.sum(small_soiling_signal) > nr_models / 2: self.result_df = result_df - self.residual_shift = results[np.argmax(weights)]['residual_shift'] + self.residual_shift = results[np.argmax(weights)]["residual_shift"] YOY = RdToolsDeg.degradation_year_on_year(pi) self.degradation = [YOY[0], YOY[1][0], YOY[1][1]] self.soiling_loss = [0, 0, (1 - result_df.soiling_ratio).mean()] self.small_soiling_signal = True self.errors = ( - 'Soiling signal is small relative to the noise. ' - 'Iterative decomposition not possible. ' - 'Degradation found by RdTools YoY.') + "Soiling signal is small relative to the noise. " + "Iterative decomposition not possible. " + "Degradation found by RdTools YoY." 
+ ) warnings.warn(self.errors) return self.result_df, self.degradation, self.soiling_loss self.small_soiling_signal = False # Aggregate all bootstrap samples - all_bootstrap_samples = pd.concat(bootstrap_samples_list, axis=1, - ignore_index=True) + all_bootstrap_samples = pd.concat(bootstrap_samples_list, axis=1, ignore_index=True) # Seasonal samples are generated from previously fitted seasonal # components, by perturbing amplitude and phase shift # Number of samples per fit: sample_nr = int(reps / nr_models) - list_of_SCs = [list_of_df_out[m].seasonal_component - for m in range(nr_models) if weights[m] > 0] - seasonal_samples = _make_seasonal_samples(list_of_SCs, - sample_nr=sample_nr, - min_multiplier=.8, - max_multiplier=1.75, - max_shift=30) + list_of_SCs = [ + list_of_df_out[m].seasonal_component for m in range(nr_models) if weights[m] > 0] + seasonal_samples = _make_seasonal_samples( + list_of_SCs, sample_nr=sample_nr, min_multiplier=0.8, max_multiplier=1.75, + max_shift=30) # ###################### # # ###### STAGE 2 ####### # # ###################### # if verbose and reps > 0: - print('\nBootstrapping for uncertainty analysis', - '({:} realizations):'.format(reps)) - order = ('SR', 'SC' if degradation_method == 'STL' else 'Rd') + print("\nBootstrapping for uncertainty analysis", + "({:} realizations):".format(reps)) + order = ("SR", "SC" if degradation_method == "STL" else "Rd") t0 = time.time() - bt_kdfs, bt_SL, bt_deg, parameters, adfs, RMSEs, SR_is_1, rss, errors = \ - [], [], [], [], [], [], [], [], ['Bootstrapping errors'] + bt_kdfs, bt_SL, bt_deg, parameters, adfs, RMSEs, SR_is_1, rss, errors = ( + [], [], [], [], [], [], [], [], ["Bootstrapping errors"]) for b in range(reps): try: # randomly choose model sensitivities - dt = np.random.uniform(parameter_alternatives[1][0], - parameter_alternatives[1][-1]) - pt = np.random.uniform(parameter_alternatives[2][0], - parameter_alternatives[2][-1]) + dt = np.random.uniform(parameter_alternatives[1][0], parameter_alternatives[1][-1]) + pt = np.random.uniform(parameter_alternatives[2][0], parameter_alternatives[2][-1]) pn = np.random.uniform(process_noise / 1.5, process_noise * 1.5) - renormalize_SR = np.random.choice([None, - np.random.uniform(.5, .95)]) + renormalize_SR = np.random.choice([None, np.random.uniform(0.5, 0.95)]) ffill = np.random.choice([True, False]) parameters.append([dt, pt, pn, renormalize_SR, ffill]) # Sample to infer soiling from - bootstrap_sample = \ - all_bootstrap_samples[b] / seasonal_samples[b] + bootstrap_sample = all_bootstrap_samples[b] / seasonal_samples[b] # Set up a temprary instance of the CODSAnalysis object temporary_cods_instance = CODSAnalysis(bootstrap_sample) # Do Signal decomposition for soiling and degradation component kdf, results_dict = temporary_cods_instance.iterative_signal_decomposition( - max_iterations=4, order=order, clip_soiling=True, - cleaning_sensitivity=dt, pruning_iterations=1, - clean_pruning_sensitivity=pt, process_noise=pn, - renormalize_SR=renormalize_SR, ffill=ffill, - degradation_method=degradation_method, **kwargs) + max_iterations=4, order=order, clip_soiling=True, cleaning_sensitivity=dt, + pruning_iterations=1, clean_pruning_sensitivity=pt, process_noise=pn, + renormalize_SR=renormalize_SR, ffill=ffill, + degradation_method=degradation_method, **kwargs) # If we can reject the null-hypothesis that there is a unit # root in the residuals: - if results_dict['adf_res'][1] < .05: # Save the results + if results_dict["adf_res"][1] < 0.05: # Save the results 
bt_kdfs.append(kdf) - adfs.append(results_dict['adf_res'][0]) - RMSEs.append(results_dict['RMSE']) - bt_deg.append(results_dict['degradation']) - bt_SL.append(results_dict['soiling_loss']) - rss.append(results_dict['residual_shift']) + adfs.append(results_dict["adf_res"][0]) + RMSEs.append(results_dict["RMSE"]) + bt_deg.append(results_dict["degradation"]) + bt_SL.append(results_dict["soiling_loss"]) + rss.append(results_dict["residual_shift"]) SR_is_1.append((kdf.soiling_ratio == 1).mean()) else: seasonal_samples.drop(columns=[b], inplace=True) @@ -1851,20 +2142,16 @@ def run_bootstrap(self, # Print progress if verbose: - _progressBarWithETA(b+1, reps, time.time()-t0, bar_length=30) + _progressBarWithETA(b + 1, reps, time.time() - t0, bar_length=30) # Reweight and save weights weights = 1 / np.array(RMSEs) / (1 + np.array(SR_is_1)) weights /= np.sum(weights) self._parameters_n_weights = pd.concat( - [pd.DataFrame(parameters), - pd.Series(RMSEs), - pd.Series(adfs), - pd.Series(SR_is_1), - pd.Series(weights)], - axis=1, ignore_index=True) - self._parameters_n_weights.columns = ['dt', 'pt', 'pn', 'RSR', 'ffill', - 'RMSE', 'ADF', 'SR==1', 'weights'] + [pd.DataFrame(parameters), pd.Series(RMSEs), pd.Series(adfs), + pd.Series(SR_is_1), pd.Series(weights)], axis=1, ignore_index=True) + self._parameters_n_weights.columns = [ + "dt", "pt", "pn", "RSR", "ffill", "RMSE", "ADF", "SR==1", "weights"] # ###################### # # ###### STAGE 3 ####### # @@ -1881,68 +2168,64 @@ def run_bootstrap(self, concat_ce = pd.concat([kdf.cleaning_events for kdf in bt_kdfs], axis=1) # Find confidence intervals for SR and soiling rates - df_out['SR_low'] = concat_SR.quantile(ci_low_edge, 1) - df_out['SR_high'] = concat_SR.quantile(ci_high_edge, 1) - df_out['rates_low'] = concat_r_s.quantile(ci_low_edge, 1) - df_out['rates_high'] = concat_r_s.quantile(ci_high_edge, 1) + df_out["SR_low"] = concat_SR.quantile(ci_low_edge, 1) + df_out["SR_high"] = concat_SR.quantile(ci_high_edge, 1) + df_out["rates_low"] = concat_r_s.quantile(ci_low_edge, 1) + df_out["rates_high"] = concat_r_s.quantile(ci_high_edge, 1) # Save best estimate and bootstrapped estimates of SR and soiling rates df_out.soiling_ratio = df_out.soiling_ratio.clip(lower=0, upper=1) - df_out.loc[df_out.soiling_ratio.diff() == 0, 'soiling_rates'] = 0 - df_out['bt_soiling_ratio'] = (concat_SR * weights).sum(1) - df_out['bt_soiling_rates'] = (concat_r_s * weights).sum(1) + df_out.loc[df_out.soiling_ratio.diff() == 0, "soiling_rates"] = 0 + df_out["bt_soiling_ratio"] = (concat_SR * weights).sum(1) + df_out["bt_soiling_rates"] = (concat_r_s * weights).sum(1) # Set probability of cleaning events df_out.cleaning_events = (concat_ce * weights).sum(1) # Find degradation rates - self.degradation = [np.dot(bt_deg, weights), - np.quantile(bt_deg, ci_low_edge), - np.quantile(bt_deg, ci_high_edge)] - df_out.degradation_trend = 1 + np.arange(len(pi)) * \ - self.degradation[0] / 100 / 365.0 + self.degradation = [ + np.dot(bt_deg, weights), np.quantile(bt_deg, ci_low_edge), + np.quantile(bt_deg, ci_high_edge)] + df_out.degradation_trend = 1 + np.arange(len(pi)) * self.degradation[0] / 100 / 365.0 # Soiling losses - self.soiling_loss = [np.dot(bt_SL, weights), - np.quantile(bt_SL, ci_low_edge), - np.quantile(bt_SL, ci_high_edge)] + self.soiling_loss = [ + np.dot(bt_SL, weights), np.quantile(bt_SL, ci_low_edge), + np.quantile(bt_SL, ci_high_edge)] # Save "confidence intervals" for seasonal component df_out.seasonal_component = (seasonal_samples * weights).sum(1) - 
df_out['seasonal_low'] = seasonal_samples.quantile(ci_low_edge, 1) - df_out['seasonal_high'] = seasonal_samples.quantile(ci_high_edge, 1) + df_out["seasonal_low"] = seasonal_samples.quantile(ci_low_edge, 1) + df_out["seasonal_high"] = seasonal_samples.quantile(ci_high_edge, 1) # Total model with confidence intervals - df_out.total_model = (df_out.degradation_trend * - df_out.seasonal_component * - df_out.soiling_ratio) - df_out['model_low'] = concat_tot_mod.quantile(ci_low_edge, 1) - df_out['model_high'] = concat_tot_mod.quantile(ci_high_edge, 1) + df_out.total_model = ( + df_out.degradation_trend * df_out.seasonal_component * df_out.soiling_ratio) + df_out["model_low"] = concat_tot_mod.quantile(ci_low_edge, 1) + df_out["model_high"] = concat_tot_mod.quantile(ci_high_edge, 1) # Residuals and residual shift df_out.residuals = pi / df_out.total_model self.residual_shift = df_out.residuals.mean() df_out.total_model *= self.residual_shift self.RMSE = _RMSE(pi, df_out.total_model) - self.adf_results = adfuller(df_out.residuals.dropna(), - regression='ctt', autolag=None) + self.adf_results = adfuller(df_out.residuals.dropna(), regression="ctt", autolag=None) self.result_df = df_out self.errors = errors if verbose: - print('\nFinal RMSE: {:.5f}'.format(self.RMSE)) + print("\nFinal RMSE: {:.5f}".format(self.RMSE)) if len(self.errors) > 1: print(self.errors) return self.result_df, self.degradation, self.soiling_loss - def _Kalman_filter_for_SR(self, zs_series, process_noise=1e-4, zs_std=.05, - rate_std=.005, max_soiling_rates=.0005, - pruning_iterations=1, clean_pruning_sensitivity=.6, - renormalize_SR=None, perfect_cleaning=False, - prescient_cleaning_events=None, - clip_soiling=True, ffill=True): - ''' + def _Kalman_filter_for_SR( + self, zs_series, process_noise=1e-4, zs_std=0.05, rate_std=0.005, + max_soiling_rates=0.0005, pruning_iterations=1, clean_pruning_sensitivity=0.6, + renormalize_SR=None, perfect_cleaning=False, prescient_cleaning_events=None, + clip_soiling=True, ffill=True): + """ A function for estimating the underlying Soiling Ratio (SR) and the rate of change of the SR (the soiling rate), based on a noisy time series of daily (corrected) normalized energy using a Kalman Filter (KF). See @@ -2013,39 +2296,36 @@ def _Kalman_filter_for_SR(self, zs_series, process_noise=1e-4, zs_std=.05, References ---------- .. [1] R. R. Labbe, Kalman and Bayesian Filters in Python. 2016. - ''' + """ # Ensure numeric index zs_series = zs_series.copy() # Make copy, so as not to change input zs_series = zs_series.astype(float) original_index = zs_series.index.copy() - if (original_index.dtype not in [int, 'int64']): + if original_index.dtype not in [int, "int64"]: zs_series.index = range(len(zs_series)) # Check prescient_cleaning_events. 
If not present, find cleaning events if isinstance(prescient_cleaning_events, list): cleaning_events = prescient_cleaning_events else: - if (isinstance(prescient_cleaning_events, type(zs_series)) and - (prescient_cleaning_events.sum() > 4)): + if isinstance(prescient_cleaning_events, type(zs_series)) and ( + prescient_cleaning_events.sum() > 4): if len(prescient_cleaning_events) == len(zs_series): prescient_cleaning_events = prescient_cleaning_events.copy() prescient_cleaning_events.index = zs_series.index else: raise ValueError( - "The indices of prescient_cleaning_events must correspond to the" + - " indices of zs_series; they must be of the same length") + "The indices of prescient_cleaning_events must correspond to the" + + " indices of zs_series; they must be of the same length") else: # If no prescient cleaning events, detect cleaning events - ce, rm9 = _rolling_median_ce_detection( - zs_series.index, zs_series, tuner=0.5) - prescient_cleaning_events = \ - _collapse_cleaning_events(ce, rm9.diff().values, 5) + ce, rm9 = _rolling_median_ce_detection(zs_series.index, zs_series, tuner=0.5) + prescient_cleaning_events = _collapse_cleaning_events(ce, rm9.diff().values, 5) cleaning_events = prescient_cleaning_events[prescient_cleaning_events].index.tolist() # Find soiling events (e.g. dust storms) - soiling_events = _soiling_event_detection( - zs_series.index, zs_series, ffill=ffill, tuner=5) + soiling_events = _soiling_event_detection(zs_series.index, zs_series, ffill=ffill, tuner=5) soiling_events = soiling_events[soiling_events].index.tolist() # Initialize various parameters @@ -2066,17 +2346,17 @@ def _Kalman_filter_for_SR(self, zs_series, process_noise=1e-4, zs_std=.05, dt = 1 # All time stemps are one day # Initialize Kalman filter - f = self._initialize_univariate_model(zs_series, dt, process_noise, - measurement_noise, rate_std, - zs_std, initial_slope) + f = self._initialize_univariate_model( + zs_series, dt, process_noise, measurement_noise, rate_std, zs_std, initial_slope) # Initialize miscallenous variables - dfk = pd.DataFrame(index=zs_series.index, dtype=float, - columns=['raw_pi', 'raw_rates', 'smooth_pi', - 'smooth_rates', 'soiling_ratio', - 'soiling_rates', 'cleaning_events', - 'days_since_ce']) - dfk['cleaning_events'] = False + dfk = pd.DataFrame( + index=zs_series.index, + dtype=float, + columns=[ + "raw_pi", "raw_rates", "smooth_pi", "smooth_rates", "soiling_ratio", + "soiling_rates", "cleaning_events", "days_since_ce"]) + dfk["cleaning_events"] = False # Kalman Filter part: ####################################################################### @@ -2086,8 +2366,7 @@ def _Kalman_filter_for_SR(self, zs_series, process_noise=1e-4, zs_std=.05, # Save results and smooth with rts smoother dfk, Xs, Ps = self._smooth_results( - dfk, f, Xs, Ps, zs_series, cleaning_events, soiling_events, - perfect_cleaning) + dfk, f, Xs, Ps, zs_series, cleaning_events, soiling_events, perfect_cleaning) ####################################################################### # Some steps to clean up the soiling data: @@ -2100,34 +2379,28 @@ def _Kalman_filter_for_SR(self, zs_series, process_noise=1e-4, zs_std=.05, rm_smooth_pi = dfk.smooth_pi.rolling(7).median().shift(-6) pi_after_cleaning = rm_smooth_pi.loc[cleaning_events] # Detect outiers/false positives - false_positives = _find_numeric_outliers(pi_after_cleaning, - clean_pruning_sensitivity, 'lower') - cleaning_events = \ - false_positives[~false_positives].index.tolist() + false_positives = _find_numeric_outliers( + pi_after_cleaning, 
clean_pruning_sensitivity, "lower") + cleaning_events = false_positives[~false_positives].index.tolist() # 2: Remove longer periods with positive (soiling) rates if (dfk.smooth_rates > max_soiling_rates).sum() > 1: exceeding_rates = dfk.smooth_rates > max_soiling_rates new_cleaning_events = _collapse_cleaning_events( exceeding_rates, dfk.smooth_rates, 4) - cleaning_events.extend( - new_cleaning_events[new_cleaning_events].index) + cleaning_events.extend(new_cleaning_events[new_cleaning_events].index) cleaning_events.sort() # 3: If the list of cleaning events has changed, run the Kalman # Filter and smoother again if not ce_0 == cleaning_events: - f = self._initialize_univariate_model(zs_series, dt, - process_noise, - measurement_noise, - rate_std, zs_std, - initial_slope) + f = self._initialize_univariate_model( + zs_series, dt, process_noise, measurement_noise, rate_std, zs_std, + initial_slope) Xs, Ps, rate_std, zs_std = self._forward_pass( - f, zs_series, rolling_median_7, cleaning_events, - soiling_events) + f, zs_series, rolling_median_7, cleaning_events, soiling_events) dfk, Xs, Ps = self._smooth_results( - dfk, f, Xs, Ps, zs_series, cleaning_events, - soiling_events, perfect_cleaning) + dfk, f, Xs, Ps, zs_series, cleaning_events, soiling_events, perfect_cleaning) else: counter = 100 # Make sure the while loop stops @@ -2136,14 +2409,13 @@ def _Kalman_filter_for_SR(self, zs_series, process_noise=1e-4, zs_std=.05, if perfect_cleaning: # SR = 1 after cleaning events if len(cleaning_events) > 0: pi_dummy = pd.Series(index=dfk.index, data=np.nan) - pi_dummy.loc[cleaning_events] = \ - dfk.smooth_pi.loc[cleaning_events] + pi_dummy.loc[cleaning_events] = dfk.smooth_pi.loc[cleaning_events] dfk.soiling_ratio = 1 / pi_dummy.ffill() * dfk.smooth_pi # Set the SR in the first soiling period based on the mean # ratio of the Kalman estimate (smooth_pi) and the SR - dfk.loc[:cleaning_events[0], 'soiling_ratio'] = \ - dfk.loc[:cleaning_events[0], 'smooth_pi'] \ - * (dfk.soiling_ratio / dfk.smooth_pi).mean() + dfk.loc[: cleaning_events[0], "soiling_ratio"] = ( + dfk.loc[: cleaning_events[0], "smooth_pi"] + * (dfk.soiling_ratio / dfk.smooth_pi).mean()) else: # If no cleaning events dfk.soiling_ratio = 1 else: # Otherwise, if the inut signal has been decomposed, and @@ -2151,40 +2423,37 @@ def _Kalman_filter_for_SR(self, zs_series, process_noise=1e-4, zs_std=.05, dfk.soiling_ratio = dfk.smooth_pi # 5: Renormalize Soiling Ratio if renormalize_SR is not None: - dfk.soiling_ratio /= dfk.loc[cleaning_events, 'soiling_ratio' - ].quantile(renormalize_SR) + dfk.soiling_ratio /= dfk.loc[cleaning_events, "soiling_ratio"].quantile( + renormalize_SR) # 6: Force soiling ratio to not exceed 1: if clip_soiling: dfk['soiling_ratio'] = dfk['soiling_ratio'].clip(upper=1) dfk.soiling_rates = dfk.smooth_rates - dfk.loc[dfk.soiling_ratio.diff() == 0, 'soiling_rates'] = 0 + dfk.loc[dfk.soiling_ratio.diff() == 0, "soiling_rates"] = 0 # Set number of days since cleaning event nr_days_dummy = pd.Series(index=dfk.index, data=np.nan) - nr_days_dummy.loc[cleaning_events] = [int(date-dfk.index[0]) - for date in cleaning_events] + nr_days_dummy.loc[cleaning_events] = [int(date - dfk.index[0]) for date in cleaning_events] nr_days_dummy.iloc[0] = 0 dfk.days_since_ce = range(len(zs_series)) - nr_days_dummy.ffill() # Save cleaning events and soiling events - dfk.loc[cleaning_events, 'cleaning_events'] = True + dfk.loc[cleaning_events, "cleaning_events"] = True dfk.index = original_index # Set index back to orignial index return dfk, Ps 
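For readers following the Kalman-filter plumbing above, the short, self-contained sketch below (illustrative only, not part of this patch) shows the same two-state idea on synthetic data: the state is [soiling_ratio, soiling_rate], the transition model is a constant-rate drift, and each noisy daily observation updates the estimate. The noise levels and the synthetic -0.2 %/day trend are assumptions chosen for the example, not values taken from rdtools.

import numpy as np
from filterpy.common import Q_discrete_white_noise
from filterpy.kalman import KalmanFilter

rng = np.random.default_rng(0)
days = np.arange(60)
observed = 1.0 - 0.002 * days + rng.normal(0, 0.01, len(days))  # synthetic noisy daily PI

kf = KalmanFilter(dim_x=2, dim_z=1)
kf.F = np.array([[1.0, 1.0], [0.0, 1.0]])  # SR_today = SR_yesterday + rate
kf.H = np.array([[1.0, 0.0]])              # only the (noisy) SR is observed
kf.x = np.array([1.0, 0.0])                # start clean, zero soiling rate
kf.P = np.diag([0.05 ** 2, 0.005 ** 2])    # initial state uncertainty
kf.R = 0.01 ** 2                           # measurement noise variance
kf.Q = Q_discrete_white_noise(dim=2, dt=1.0, var=(1e-4) ** 2)  # process noise

estimates = []
for z in observed:
    kf.predict()
    kf.update(z)
    estimates.append(kf.x.copy())  # [estimated SR, estimated daily soiling rate]

The actual method additionally feeds detected cleaning events in as control inputs, smooths between cleanings with an RTS smoother, and optionally renormalizes the result; this sketch covers only the filtering core.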
- def _forward_pass(self, f, zs_series, rolling_median_7, cleaning_events, - soiling_events): - ''' Run the forward pass of the Kalman Filter algortihm ''' + def _forward_pass(self, f, zs_series, rolling_median_7, cleaning_events, soiling_events): + """Run the forward pass of the Kalman Filter algortihm""" zs = zs_series.values N = len(zs) Xs, Ps = np.zeros((N, 2)), np.zeros((N, 2, 2)) # Enter forward pass of filtering algorithm for i, z in enumerate(zs): - if 7 < i < N-7 and (i in cleaning_events or i in soiling_events): - rolling_median_local = rolling_median_7.loc[i-5:i+5].values - u = self._set_control_input(f, rolling_median_local, i, - cleaning_events) + if 7 < i < N - 7 and (i in cleaning_events or i in soiling_events): + rolling_median_local = rolling_median_7.loc[i - 5 : i + 5].values + u = self._set_control_input(f, rolling_median_local, i, cleaning_events) f.predict(u=u) # Predict wth control input u else: # If no cleaning detection, predict without control input f.predict() @@ -2196,49 +2465,47 @@ def _forward_pass(self, f, zs_series, rolling_median_7, cleaning_events, rate_std, zs_std = Ps[-1, 1, 1], Ps[-1, 0, 0] return Xs, Ps, rate_std, zs_std # Convert to numpy and return - def _set_control_input(self, f, rolling_median_local, index, - cleaning_events): - ''' + def _set_control_input(self, f, rolling_median_local, index, cleaning_events): + """ For each cleaning event, sets control input u based on current Kalman Filter state estimate (f.x), and the median value for the following week. If the cleaning event seems to be misplaced, moves the cleaning event to a more sensible location. If the cleaning event seems to be correct, removes other cleaning events in the 10 days surrounding this day - ''' + """ u = np.zeros(f.x.shape) # u is the control input window_size = 11 # len of rolling_median_local HW = 5 # Half window moving_diff = np.diff(rolling_median_local) # Index of maximum change in rolling median max_diff_index = moving_diff.argmax() - if max_diff_index == HW-1 or index not in cleaning_events: + if max_diff_index == HW - 1 or index not in cleaning_events: # The median zs of the week after the cleaning event - z_med = rolling_median_local[HW+3] + z_med = rolling_median_local[HW + 3] # Set control input this future median u[0] = z_med - np.dot(f.H, np.dot(f.F, f.x)) # If the change is bigger than the measurement noise: - if np.abs(u[0]) > np.sqrt(f.R)/2: - index_dummy = [n+3 for n in range(window_size-HW-1) - if n+3 != HW] - cleaning_events = [ce for ce in cleaning_events - if ce-index+HW not in index_dummy] + if np.abs(u[0]) > np.sqrt(f.R) / 2: + index_dummy = [n + 3 for n in range(window_size - HW - 1) if n + 3 != HW] + cleaning_events = [ + ce for ce in cleaning_events if ce - index + HW not in index_dummy] else: # If the cleaning event is insignificant u[0] = 0 if index in cleaning_events: cleaning_events.remove(index) else: # If the index with the maximum difference is not today... cleaning_events.remove(index) # ...remove today from the list - if moving_diff[max_diff_index] > 0 \ - and index+max_diff_index-HW+1 not in cleaning_events: + if (moving_diff[max_diff_index] > 0 + and index + max_diff_index - HW + 1 not in cleaning_events): # ...and add the missing day - bisect.insort(cleaning_events, index+max_diff_index-HW+1) + bisect.insort(cleaning_events, index + max_diff_index - HW + 1) return u - def _smooth_results(self, dfk, f, Xs, Ps, zs_series, cleaning_events, - soiling_events, perfect_cleaning): - ''' Smoother for Kalman Filter estimates. 
Smooths the Kalaman estimate - between given cleaning events and saves all in DataFrame dfk''' + def _smooth_results( + self, dfk, f, Xs, Ps, zs_series, cleaning_events, soiling_events, perfect_cleaning): + """Smoother for Kalman Filter estimates. Smooths the Kalaman estimate + between given cleaning events and saves all in DataFrame dfk""" # Save unsmoothed estimates dfk.raw_pi = Xs[:, 0] dfk.raw_rates = Xs[:, 1] @@ -2253,8 +2520,7 @@ def _smooth_results(self, dfk, f, Xs, Ps, zs_series, cleaning_events, # Smooth between cleaning events for start, end in zip(ce_dummy[:-1], ce_dummy[1:]): num_ind = df_num_ind.loc[start:end].iloc[:-1] - Xs[num_ind], Ps[num_ind], _, _ = f.rts_smoother(Xs[num_ind], - Ps[num_ind]) + Xs[num_ind], Ps[num_ind], _, _ = f.rts_smoother(Xs[num_ind], Ps[num_ind]) # Save smoothed estimates dfk.smooth_pi = Xs[:, 0] @@ -2262,17 +2528,15 @@ def _smooth_results(self, dfk, f, Xs, Ps, zs_series, cleaning_events, return dfk, Xs, Ps - def _initialize_univariate_model(self, zs_series, dt, process_noise, - measurement_noise, rate_std, zs_std, - initial_slope): - ''' Initializes the univariate Kalman Filter model, using the filterpy - package ''' + def _initialize_univariate_model( + self, zs_series, dt, process_noise, measurement_noise, + rate_std, zs_std, initial_slope): + """Initializes the univariate Kalman Filter model, using the filterpy + package""" f = KalmanFilter(dim_x=2, dim_z=1) - f.F = np.array([[1., dt], - [0., 1.]]) - f.H = np.array([[1., 0.]]) - f.P = np.array([[zs_std**2, 0], - [0, rate_std**2]]) + f.F = np.array([[1.0, dt], [0.0, 1.0]]) + f.H = np.array([[1.0, 0.0]]) + f.P = np.array([[zs_std**2, 0], [0, rate_std**2]]) f.Q = Q_discrete_white_noise(dim=2, dt=dt, var=process_noise**2) f.x = np.array([initial_slope[1], initial_slope[0]]) f.B = np.zeros(f.F.shape) @@ -2281,19 +2545,14 @@ def _initialize_univariate_model(self, zs_series, dt, process_noise, return f -def soiling_cods(energy_normalized_daily, - reps=512, - confidence_level=68.2, - degradation_method='YoY', - process_noise=1e-4, - order_alternatives=(('SR', 'SC', 'Rd'), - ('SC', 'SR', 'Rd')), - cleaning_sensitivity_alternatives=(.25, .75), - clean_pruning_sensitivity_alternatives=(1/1.5, 1.5), - forward_fill_alternatives=(True, False), - verbose=False, - **kwargs): - ''' +def soiling_cods( + energy_normalized_daily, reps=512, confidence_level=68.2, degradation_method="YoY", + process_noise=1e-4, order_alternatives=( + ("SR", "SC", "Rd"), ("SC", "SR", "Rd")), + cleaning_sensitivity_alternatives=(0.25, 0.75), + clean_pruning_sensitivity_alternatives=(1 / 1.5, 1.5), + forward_fill_alternatives=(True, False), verbose=False, **kwargs): + """ Functional wrapper for :py:class:`~rdtools.soiling.CODSAnalysis` and its subroutine :py:func:`~rdtools.soiling.CODSAnalysis.run_bootstrap`. Runs the combined degradation and soiling (CODS) algorithm with bootstrapping. @@ -2406,31 +2665,26 @@ def soiling_cods(energy_normalized_daily, ---------- .. [1] Skomedal, Å. and Deceglie, M. G., IEEE Journal of Photovoltaics, Sept. 2020. 
https://doi.org/10.1109/JPHOTOV.2020.3018219 - ''' + """ CODS = CODSAnalysis(energy_normalized_daily) CODS.run_bootstrap( - reps=reps, - confidence_level=confidence_level, - verbose=verbose, - degradation_method=degradation_method, - process_noise=process_noise, + reps=reps, confidence_level=confidence_level, verbose=verbose, + degradation_method=degradation_method, process_noise=process_noise, order_alternatives=order_alternatives, cleaning_sensitivity_alternatives=cleaning_sensitivity_alternatives, clean_pruning_sensitivity_alternatives=clean_pruning_sensitivity_alternatives, - forward_fill_alternatives=forward_fill_alternatives, - **kwargs) + forward_fill_alternatives=forward_fill_alternatives, **kwargs) sr = 1 - CODS.soiling_loss[0] / 100 sr_ci = 1 - np.array(CODS.soiling_loss[1:3]) / 100 - return sr, sr_ci, CODS.degradation[0], np.array(CODS.degradation[1:3]), \ - CODS.result_df + return (sr, sr_ci, CODS.degradation[0], np.array(CODS.degradation[1:3]), CODS.result_df) def _collapse_cleaning_events(inferred_ce_in, metric, f=4): - ''' A function for replacing quick successive cleaning events with one + """A function for replacing quick successive cleaning events with one (most probable) cleaning event. Parameters @@ -2447,10 +2701,9 @@ def _collapse_cleaning_events(inferred_ce_in, metric, f=4): ------- inferred_ce : pandas.Series boolean values for cleaning events - ''' + """ # Ensure numeric index - if isinstance(inferred_ce_in.index, - pd.core.indexes.datetimes.DatetimeIndex): + if isinstance(inferred_ce_in.index, pd.core.indexes.datetimes.DatetimeIndex): saveindex = inferred_ce_in.copy().index inferred_ce_in.index = range(len(saveindex)) else: @@ -2470,11 +2723,10 @@ def _collapse_cleaning_events(inferred_ce_in, metric, f=4): end_true_vals = collapsed_ce_dummy.loc[start_true_vals:].idxmin() - 1 if end_true_vals >= start_true_vals: # If the island ends # Find the day with mac probability of being a cleaning event - max_diff_day = \ - metric.loc[start_true_vals-f:end_true_vals+f].idxmax() + max_diff_day = metric.loc[start_true_vals - f : end_true_vals + f].idxmax() # Set all days in this period as false - collapsed_ce.loc[start_true_vals-f:end_true_vals+f] = False - collapsed_ce_dummy.loc[start_true_vals-f:end_true_vals+f] = False + collapsed_ce.loc[start_true_vals - f : end_true_vals + f] = False + collapsed_ce_dummy.loc[start_true_vals - f : end_true_vals + f] = False # Set the max probability day as True (cleaning event) collapsed_ce.loc[max_diff_day] = True # Find the next island of true values @@ -2488,51 +2740,52 @@ def _collapse_cleaning_events(inferred_ce_in, metric, f=4): def _rolling_median_ce_detection(x, y, ffill=True, rolling_window=9, tuner=1.5): - ''' Finds cleaning events in a time series of performance index (y) ''' + """Finds cleaning events in a time series of performance index (y)""" y = pd.Series(index=x, data=y) y = y.astype(float) if ffill: # forward fill NaNs in y before running mean rm = y.ffill().rolling(rolling_window, center=True).median() else: # ... 
or backfill instead
         rm = y.bfill().rolling(rolling_window, center=True).median()
-    Q3 = rm.diff().abs().quantile(.75)
-    Q1 = rm.diff().abs().quantile(.25)
+    Q3 = rm.diff().abs().quantile(0.75)
+    Q1 = rm.diff().abs().quantile(0.25)
     limit = Q3 + tuner * (Q3 - Q1)
     cleaning_events = rm.diff() > limit
     return cleaning_events, rm
 
 
 def _soiling_event_detection(x, y, ffill=True, tuner=5):
-    ''' Finds cleaning events in a time series of performance index (y) '''
+    """Finds soiling events (e.g. dust storms) in a time series of performance index (y)"""
     y = pd.Series(index=x, data=y)
     y = y.astype(float)
     if ffill:  # forward fill NaNs in y before running mean
         rm = y.ffill().rolling(9, center=True).median()
     else:  # ... or backfill instead
         rm = y.bfill().rolling(9, center=True).median()
-    Q3 = rm.diff().abs().quantile(.99)
-    Q1 = rm.diff().abs().quantile(.01)
+    Q3 = rm.diff().abs().quantile(0.99)
+    Q1 = rm.diff().abs().quantile(0.01)
     limit = Q1 - tuner * (Q3 - Q1)
     soiling_events = rm.diff() < limit
     return soiling_events
 
 
-def _make_seasonal_samples(list_of_SCs, sample_nr=10, min_multiplier=0.5,
-                           max_multiplier=2, max_shift=20):
-    ''' Generate seasonal samples by perturbing the amplitude and the phase of
-    a seasonal components found with the fitted CODS model '''
-    samples = pd.DataFrame(index=list_of_SCs[0].index,
-                           columns=range(int(sample_nr*len(list_of_SCs))),
-                           dtype=float)
+def _make_seasonal_samples(
+        list_of_SCs, sample_nr=10, min_multiplier=0.5, max_multiplier=2, max_shift=20):
+    """Generate seasonal samples by perturbing the amplitude and the phase of
+    the seasonal components found with the fitted CODS model"""
+    samples = pd.DataFrame(
+        index=list_of_SCs[0].index,
+        columns=range(int(sample_nr * len(list_of_SCs))),
+        dtype=float)
     # From each fitted signal, we will generate new seaonal components
     for i, signal in enumerate(list_of_SCs):
         # Remove beginning and end of signal
         signal_mean = signal.mean()
         # Make a signal matrix where each column is a year and each row a date
-        year_matrix = signal.rename('values').to_frame().assign(
-            doy=signal.index.dayofyear,
-            year=signal.index.year
-        ).pivot(index='doy', columns='year', values='values')
+        year_matrix = (
+            signal.rename("values").to_frame()
+            .assign(doy=signal.index.dayofyear, year=signal.index.year)
+            .pivot(index="doy", columns="year", values="values"))
         # We will use the median signal through all the years... 
median_signal = year_matrix.median(1) for j in range(sample_nr): @@ -2543,25 +2796,21 @@ def _make_seasonal_samples(list_of_SCs, sample_nr=10, min_multiplier=0.5, # constructing the new signal based on median_signal shifted_signal = pd.Series( index=signal.index, - data=median_signal.reindex( - (signal.index.dayofyear-shift) % 365 + 1).values) + data=median_signal.reindex((signal.index.dayofyear - shift) % 365 + 1).values) # Perturb amplitude by recentering to 0 multiplying by multiplier - samples.loc[:, i*sample_nr + j] = \ - multiplier * (shifted_signal - signal_mean) + 1 + samples.loc[:, i * sample_nr + j] = multiplier * (shifted_signal - signal_mean) + 1 return samples def _force_periodicity(in_signal, signal_index, out_index): - ''' Function for forcing periodicity in a seasonal component signal ''' + """Function for forcing periodicity in a seasonal component signal""" # Make sure the in_signal is a Series if isinstance(in_signal, np.ndarray): - signal = pd.Series(index=pd.DatetimeIndex(signal_index.date), - data=in_signal) + signal = pd.Series(index=pd.DatetimeIndex(signal_index.date), data=in_signal) elif isinstance(in_signal, pd.Series): - signal = pd.Series(index=pd.DatetimeIndex(signal_index.date), - data=in_signal.values) + signal = pd.Series(index=pd.DatetimeIndex(signal_index.date), data=in_signal.values) else: - raise ValueError('in_signal must be numpy array or pandas Series') + raise ValueError("in_signal must be numpy array or pandas Series") # Make sure that we don't remove too much of the data: remove_length = np.min([180, int((len(signal) - 365) / 2)]) @@ -2573,66 +2822,157 @@ def _force_periodicity(in_signal, signal_index, out_index): # Make a signal matrix where each column is a year and each row is a date year_matrix = pd.DataFrame(index=np.arange(0, 365), columns=unique_years) for year in unique_years: - dates_in_year = pd.date_range(str(year)+'-01-01', str(year)+'-12-31') + dates_in_year = pd.date_range(str(year) + "-01-01", str(year) + "-12-31") # We cut off the extra day(s) of leap years - year_matrix[year] = \ - signal.loc[str(year)].reindex(dates_in_year).values[:365] + year_matrix[year] = signal.loc[str(year)].reindex(dates_in_year).values[:365] # We will use the median signal through all the years... 
median_signal = year_matrix.median(1) # The output is the median signal broadcasted to the whole time series - output = pd.Series( - index=out_index, - data=median_signal.reindex(out_index.dayofyear - 1).values) + output = pd.Series(index=out_index, data=median_signal.reindex(out_index.dayofyear - 1).values) return output -def _find_numeric_outliers(x, multiplier=1.5, where='both', verbose=False): - ''' Function for finding numeric outliers ''' +def _find_numeric_outliers(x, multiplier=1.5, where="both", verbose=False): + """Function for finding numeric outliers""" try: # Calulate third and first quartile - Q3 = np.quantile(x, .75) - Q1 = np.quantile(x, .25) + Q3 = np.quantile(x, 0.75) + Q1 = np.quantile(x, 0.25) except IndexError as ie: print(ie, x) except RuntimeWarning as rw: print(rw, x) IQR = Q3 - Q1 # Interquartile range - if where == 'upper': # If detecting upper outliers + if where == "upper": # If detecting upper outliers if verbose: - print('Upper limit', Q3 + multiplier * IQR) - return (x > Q3 + multiplier * IQR) - elif where == 'lower': # If detecting lower outliers + print("Upper limit", Q3 + multiplier * IQR) + return x > Q3 + multiplier * IQR + elif where == "lower": # If detecting lower outliers if verbose: - print('Lower limit', Q1 - multiplier * IQR) - return (x < Q1 - multiplier * IQR) - elif where == 'both': # If detecting both lower and upper outliers + print("Lower limit", Q1 - multiplier * IQR) + return x < Q1 - multiplier * IQR + elif where == "both": # If detecting both lower and upper outliers if verbose: - print('Upper, lower limit', - Q3 + multiplier * IQR, - Q1 - multiplier * IQR) + print("Upper, lower limit", Q3 + multiplier * IQR, Q1 - multiplier * IQR) return (x > Q3 + multiplier * IQR), (x < Q1 - multiplier * IQR) def _RMSE(y_true, y_pred): - '''Calculates the Root Mean Squared Error for y_true and y_pred, where - y_pred is the "prediction", and y_true is the truth.''' - mask = ~np.isnan(y_pred) - return np.sqrt(np.mean((y_pred[mask]-y_true[mask])**2)) + """Calculates the Root Mean Squared Error for y_true and y_pred, where + y_pred is the "prediction", and y_true is the truth.""" + mask = ~pd.isnull(y_pred) + return np.sqrt(np.mean((y_pred[mask] - y_true[mask]) ** 2)) def _MSD(y_true, y_pred): - '''Calculates the Mean Signed Deviation for y_true and y_pred, where y_pred - is the "prediction", and y_true is the truth.''' + """Calculates the Mean Signed Deviation for y_true and y_pred, where y_pred + is the "prediction", and y_true is the truth.""" return np.mean(y_pred - y_true) def _progressBarWithETA(value, endvalue, time, bar_length=20): - ''' Prints a progressbar with an estimated time of "arrival" ''' + """Prints a progressbar with an estimated time of "arrival" """ percent = float(value) / endvalue * 100 - arrow = '-' * int(round(percent/100 * bar_length)-1) + '>' - spaces = ' ' * (bar_length - len(arrow)) + arrow = "-" * int(round(percent / 100 * bar_length) - 1) + ">" + spaces = " " * (bar_length - len(arrow)) used = time / 60 # Time Used - left = used / percent*(100-percent) # Estimated time left + left = used / percent * (100 - percent) # Estimated time left sys.stdout.write( - "\r# {:} | Used: {:.1f} min | Left: {:.1f}".format(value, used, left) + - " min | Progress: [{:}] {:.0f} %".format(arrow + spaces, percent)) + "\r# {:} | Used: {:.1f} min | Left: {:.1f}".format(value, used, left) + + " min | Progress: [{:}] {:.0f} %".format(arrow + spaces, percent)) sys.stdout.flush() + + 
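The Tukey-style IQR rule implemented in _find_numeric_outliers above is what prunes false-positive cleaning events (via the 'lower' fence on the post-cleaning performance level). A minimal, self-contained illustration follows; the numbers are made up for the example and the 0.6 multiplier simply mirrors the default clean_pruning_sensitivity used elsewhere in this module.

import pandas as pd

# Median performance level right after each candidate cleaning event; the
# last value is suspiciously low, so it should be flagged as a false positive.
pi_after_cleaning = pd.Series([0.99, 0.98, 1.00, 0.97, 0.99, 0.80])

q1, q3 = pi_after_cleaning.quantile([0.25, 0.75])
iqr = q3 - q1
multiplier = 0.6  # plays the role of clean_pruning_sensitivity
is_false_positive = pi_after_cleaning < q1 - multiplier * iqr
print(is_false_positive)  # only the 0.80 entry falls below the lower fence

In _Kalman_filter_for_SR the same rule is applied to the 7-day rolling median of the smoothed performance after each candidate cleaning, and flagged events are dropped from the cleaning-event list before the filter is rerun.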
+
+###############################################################################
+# all code below for new piecewise fitting in soiling intervals within srr/Matt
+###############################################################################
+def piecewise_linear(x, x0, b, k1, k2):
+    cond_list = [x < x0, x >= x0]
+    func_list = [lambda x: k1 * x + b, lambda x: k1 * x + b + k2 * (x - x0)]
+    return np.piecewise(x, cond_list, func_list)
+
+
+def segmented_soiling_period(
+        pr, fill_method="bfill", days_clean_vs_cp=7, initial_guesses=[13, 1, 0, 0],
+        bounds=None, min_r2=0.15):
+    # note: min_r2 was previously 0.6; a 10-day forward median may be worth
+    # testing as the initial guess for b
+    """
+    Applies segmented regression to a single deposition period
+    (the data points between two cleaning events).
+    Segmentation is neglected if the change point occurs within a number of
+    days (days_clean_vs_cp) of either cleaning.
+
+    Parameters
+    ----------
+    pr : pandas.Series
+        Series of daily performance ratios measured during the given deposition period.
+    fill_method : str (default='bfill')
+        Method to employ to fill any missing day.
+    days_clean_vs_cp : numeric (default=7)
+        Minimum number of days accepted between cleanings and change points.
+    initial_guesses : list (default=[13, 1, 0, 0])
+        List of initial guesses for the fitting function.
+    bounds : list (default=None)
+        List of bounds for the fitting function. If not specified, they are
+        defined in the function.
+    min_r2 : numeric (default=0.15)
+        Minimum R2 for the extracted soiling profile to be considered valid.
+
+    Returns
+    -------
+    sr : pandas.Series
+        Series containing the daily soiling ratio values after segmentation.
+        All NaN if segmentation was not possible.
+    cp_date : int or None
+        Integer day index (relative to the start of the period) at which the
+        change point occurs. None if segmentation was not possible.
+    """
+    # Check if the PR series has a datetime index
+    if not isinstance(pr.index, pd.DatetimeIndex):
+        raise ValueError("The time series does not have DatetimeIndex")
+
+    # Define bounds if not provided
+    if bounds is None:
+        # Lower bounds (first tuple) and upper bounds (second tuple),
+        # ordered as (x0, b, k1, k2), where x0 is the breakpoint and k1, k2 are the slopes
+        bounds = [(13, -5, -np.inf, -np.inf), ((len(pr) - 13), 5, +np.inf, +np.inf)]
+    y = pr.values
+    x = np.arange(0.0, len(y))
+
+    try:
+        # Fit soiling profile with segmentation
+        p, e = curve_fit(piecewise_linear, x, y, p0=initial_guesses, bounds=bounds)
+
+        # Ignore the change point if it is too close to a cleaning.
+        # The change point p[0] is converted to an integer day index.
+        # None if no valid change point is found.
+        if p[0] > days_clean_vs_cp and p[0] < len(y) - days_clean_vs_cp:
+            z = piecewise_linear(x, *p)
+            cp_date = int(p[0])
+        else:
+            z = [np.nan] * len(x)
+            cp_date = None
+        R2_original = st.linregress(y, x)[2] ** 2
+        R2_piecewise = st.linregress(y, z)[2] ** 2
+
+        R2_improve = R2_piecewise - R2_original
+        R2_percent_improve = (R2_piecewise / R2_original) - 1
+        R2_percent_of_possible_improve = R2_improve / (1 - R2_original)
+        # improvement relative to the possible improvement
+
+        if len(y) < 45:  # tighter requirements for shorter soiling periods
+            if (R2_piecewise < min_r2) | (
+                    (R2_percent_of_possible_improve < 0.5) & (R2_percent_improve < 0.5)):
+                z = [np.nan] * len(x)
+                cp_date = None
+        else:
+            if (R2_percent_improve < 0.01) | (R2_piecewise < 0.4):
+                z = [np.nan] * len(x)
+                cp_date = None
+    except ValueError as ex:
+        print(f"Segmentation was not possible. 
Error: {ex}") + z = [np.nan] * len(x) + cp_date = None + # Create Series from modelled profile + sr = pd.Series(z, index=pr.index) + + return sr, cp_date diff --git a/rdtools/test/conftest.py b/rdtools/test/conftest.py index b940f2df..4085b997 100644 --- a/rdtools/test/conftest.py +++ b/rdtools/test/conftest.py @@ -54,6 +54,41 @@ def soiling_normalized_daily(soiling_times): return normalized_daily +@pytest.fixture() +def soiling_normalized_daily_with_neg_shifts(soiling_times): + interval_1_v1 = 1 - 0.005 * np.arange(0, 15, 1) + interval_1_v2 = (0.9 - 0.005 * 15) - 0.005 * np.arange(0, 10, 1) + interval_2 = 1 - 0.002 * np.arange(0, 25, 1) + interval_3_v1 = 1 - 0.001 * np.arange(0, 10, 1) + interval_3_v2 = (0.95 - 0.001 * 10) - 0.001 * np.arange(0, 15, 1) + profile = np.concatenate( + (interval_1_v1, interval_1_v2, interval_2, interval_3_v1, interval_3_v2) + ) + np.random.seed(1977) + noise = 0.01 * np.random.rand(75) + normalized_daily = pd.Series(data=profile, index=soiling_times) + normalized_daily = normalized_daily + noise + + return normalized_daily + + +@pytest.fixture() +def soiling_normalized_daily_with_piecewise_slope(soiling_times): + interval_1_v1 = 1 - 0.002 * np.arange(0, 20, 1) + interval_1_v2 = (1 - 0.002 * 20) - 0.007 * np.arange(0, 20, 1) + interval_2_v1 = 1 - 0.01 * np.arange(0, 20, 1) + interval_2_v2 = (1 - 0.01 * 20) - 0.001 * np.arange(0, 15, 1) + profile = np.concatenate( + (interval_1_v1, interval_1_v2, interval_2_v1, interval_2_v2) + ) + np.random.seed(1977) + noise = 0.01 * np.random.rand(75) + normalized_daily = pd.Series(data=profile, index=soiling_times) + normalized_daily = normalized_daily + noise + + return normalized_daily + + @pytest.fixture() def soiling_insolation(soiling_times): insolation = np.empty((75,)) diff --git a/rdtools/test/soiling_test.py b/rdtools/test/soiling_test.py index a1a67837..7f87cf2d 100644 --- a/rdtools/test/soiling_test.py +++ b/rdtools/test/soiling_test.py @@ -5,6 +5,7 @@ from rdtools.soiling import annual_soiling_ratios from rdtools.soiling import monthly_soiling_rates from rdtools.soiling import NoValidIntervalError +from rdtools.soiling import segmented_soiling_period import pytest @@ -13,86 +14,94 @@ def test_soiling_srr(soiling_normalized_daily, soiling_insolation, soiling_times reps = 10 np.random.seed(1977) sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily, soiling_insolation, reps=reps) + assert 0.964369 == pytest.approx(sr, abs=1e-6), \ - 'Soiling ratio different from expected value' + "Soiling ratio different from expected value" assert np.array([0.962540, 0.965295]) == pytest.approx(sr_ci, abs=1e-6), \ - 'Confidence interval different from expected value' - assert 0.960205 == pytest.approx(soiling_info['exceedance_level'], abs=1e-6), \ - 'Exceedance level different from expected value' - assert 0.984079 == pytest.approx(soiling_info['renormalizing_factor'], abs=1e-6), \ - 'Renormalizing factor different from expected value' - assert len(soiling_info['stochastic_soiling_profiles']) == reps, \ + "Confidence interval different from expected value" + assert 0.960205 == pytest.approx(soiling_info["exceedance_level"], abs=1e-6), \ + "Exceedance level different from expected value" + assert 0.984079 == pytest.approx(soiling_info["renormalizing_factor"], abs=1e-6), \ + "Renormalizing factor different from expected value" + assert (len(soiling_info["stochastic_soiling_profiles"]) == reps), \ 'Length of soiling_info["stochastic_soiling_profiles"] different than expected' - assert 
isinstance(soiling_info['stochastic_soiling_profiles'], list), \ + assert isinstance(soiling_info["stochastic_soiling_profiles"], list), \ 'soiling_info["stochastic_soiling_profiles"] is not a list' # Check soiling_info['soiling_interval_summary'] - expected_summary_columns = ['start', 'end', 'soiling_rate', 'soiling_rate_low', - 'soiling_rate_high', 'inferred_start_loss', 'inferred_end_loss', - 'length', 'valid'] - actual_summary_columns = soiling_info['soiling_interval_summary'].columns.values + expected_summary_columns = [ + "start", "end", "soiling_rate", "soiling_rate_low", "soiling_rate_high", + "inferred_start_loss", "inferred_end_loss", "inferred_recovery", + "inferred_begin_shift", "length", "valid"] + actual_summary_columns = soiling_info["soiling_interval_summary"].columns.values for x in actual_summary_columns: - assert x in expected_summary_columns, \ + assert (x in expected_summary_columns), \ f"'{x}' not an expected column in soiling_info['soiling_interval_summary']" for x in expected_summary_columns: - assert x in actual_summary_columns, \ + assert (x in actual_summary_columns), \ f"'{x}' was expected as a column, but not in soiling_info['soiling_interval_summary']" - assert isinstance(soiling_info['soiling_interval_summary'], pd.DataFrame), \ + + assert isinstance(soiling_info["soiling_interval_summary"], pd.DataFrame), \ 'soiling_info["soiling_interval_summary"] not a dataframe' - expected_means = pd.Series({'soiling_rate': -0.002644544, - 'soiling_rate_low': -0.002847504, - 'soiling_rate_high': -0.002455915, - 'inferred_start_loss': 1.020124, - 'inferred_end_loss': 0.9566552, - 'length': 24.0, - 'valid': 1.0}) - expected_means = expected_means[['soiling_rate', 'soiling_rate_low', 'soiling_rate_high', - 'inferred_start_loss', 'inferred_end_loss', - 'length', 'valid']] - actual_means = soiling_info['soiling_interval_summary'][expected_means.index].mean() + + expected_means = pd.Series( + {"soiling_rate": -0.002644544, "soiling_rate_low": -0.002847504, + "soiling_rate_high": -0.002455915, "inferred_start_loss": 1.020124, + "inferred_end_loss": 0.9566552, "inferred_recovery": 0.065416, + "inferred_begin_shift": 0.084814, "length": 24.0, "valid": 1.0}) + expected_means = expected_means[ + ["soiling_rate", "soiling_rate_low", "soiling_rate_high", "inferred_start_loss", + "inferred_end_loss", "inferred_recovery", "inferred_begin_shift", "length", "valid"]] + actual_means = soiling_info["soiling_interval_summary"][expected_means.index].mean() pd.testing.assert_series_equal(expected_means, actual_means, check_exact=False) # Check soiling_info['soiling_ratio_perfect_clean'] - pd.testing.assert_index_equal(soiling_info['soiling_ratio_perfect_clean'].index, soiling_times, - check_names=False) - sr_mean = soiling_info['soiling_ratio_perfect_clean'].mean() - assert 0.968265 == pytest.approx(sr_mean, abs=1e-6), \ - "The mean of soiling_info['soiling_ratio_perfect_clean'] differs from expected" - assert isinstance(soiling_info['soiling_ratio_perfect_clean'], pd.Series), \ - 'soiling_info["soiling_ratio_perfect_clean"] not a pandas series' + pd.testing.assert_index_equal( + soiling_info["soiling_ratio_perfect_clean"].index, soiling_times, check_names=False) + sr_mean = soiling_info["soiling_ratio_perfect_clean"].mean() + assert 0.968265 == pytest.approx( + sr_mean, abs=1e-6 + ), "The mean of soiling_info['soiling_ratio_perfect_clean'] differs from expected" + assert isinstance( + soiling_info["soiling_ratio_perfect_clean"], pd.Series + ), 'soiling_info["soiling_ratio_perfect_clean"] not 
a pandas series' @pytest.mark.filterwarnings("ignore:.*20% or more of the daily data.*:UserWarning") -@pytest.mark.parametrize('method,expected_sr', - [('random_clean', 0.936177), - ('half_norm_clean', 0.915093), - ('perfect_clean', 0.977116)]) -def test_soiling_srr_consecutive_invalid(soiling_normalized_daily, soiling_insolation, - soiling_times, method, expected_sr): +@pytest.mark.parametrize( + "method,neg_shift,piecewise,expected_sr", + [("random_clean", False, False, 0.936177), + ("half_norm_clean", False, False, 0.915093), + ("perfect_clean", False, False, 0.977116), + ("perfect_clean_complex", True, True, 0.977116), + ("inferred_clean_complex", True, True, 0.975805)]) +def test_soiling_srr_consecutive_invalid( + soiling_normalized_daily, soiling_insolation, soiling_times, + method, neg_shift, piecewise, expected_sr): reps = 10 np.random.seed(1977) - sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily, soiling_insolation, reps=reps, - max_relative_slope_error=20.0, method=method) + sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily, soiling_insolation, + reps=reps, max_relative_slope_error=20.0, + method=method, piecewise=piecewise, + neg_shift=neg_shift) assert expected_sr == pytest.approx(sr, abs=1e-6), \ - f'Soiling ratio different from expected value for {method} with consecutive invalid intervals' # noqa: E501 + f"Soiling ratio different from expected value for {method} \ + with consecutive invalid intervals" -@pytest.mark.parametrize('clean_criterion,expected_sr', - [('precip_and_shift', 0.982546), - ('precip_or_shift', 0.973433), - ('precip', 0.976196), - ('shift', 0.964369)]) -def test_soiling_srr_with_precip(soiling_normalized_daily, soiling_insolation, soiling_times, - clean_criterion, expected_sr): +@pytest.mark.parametrize("clean_criterion,expected_sr", + [("precip_and_shift", 0.982546), + ("precip_or_shift", 0.973433), + ("precip", 0.976196), + ("shift", 0.964369)]) +def test_soiling_srr_with_precip(soiling_normalized_daily, soiling_insolation, + soiling_times, clean_criterion, expected_sr): precip = pd.Series(index=soiling_times, data=0) - precip['2019-01-18 00:00:00-07:00'] = 1 - precip['2019-02-20 00:00:00-07:00'] = 1 + precip["2019-01-18 00:00:00-07:00"] = 1 + precip["2019-02-20 00:00:00-07:00"] = 1 - kwargs = { - 'reps': 10, - 'precipitation_daily': precip - } + kwargs = {"reps": 10, "precipitation_daily": precip} np.random.seed(1977) sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily, soiling_insolation, clean_criterion=clean_criterion, **kwargs) @@ -101,18 +110,18 @@ def test_soiling_srr_with_precip(soiling_normalized_daily, soiling_insolation, s def test_soiling_srr_confidence_levels(soiling_normalized_daily, soiling_insolation): - 'Tests SRR with different confidence level settingsf from above' + "Tests SRR with different confidence level settings from above" np.random.seed(1977) sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily, soiling_insolation, confidence_level=95, reps=10, exceedance_prob=80.0) assert np.array([0.959322, 0.966066]) == pytest.approx(sr_ci, abs=1e-6), \ - 'Confidence interval with confidence_level=95 different than expected' - assert 0.962691 == pytest.approx(soiling_info['exceedance_level'], abs=1e-6), \ + "Confidence interval with confidence_level=95 different than expected" + assert 0.962691 == pytest.approx(soiling_info["exceedance_level"], abs=1e-6), \ 'soiling_info["exceedance_level"] different than expected when exceedance_prob=80' def test_soiling_srr_dayscale(soiling_normalized_daily, 
soiling_insolation): - 'Test that a long dayscale can prevent valid intervals from being found' + "Test that a long dayscale can prevent valid intervals from being found" with pytest.raises(NoValidIntervalError): np.random.seed(1977) sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily, soiling_insolation, @@ -120,45 +129,51 @@ def test_soiling_srr_dayscale(soiling_normalized_daily, soiling_insolation): def test_soiling_srr_clean_threshold(soiling_normalized_daily, soiling_insolation): - '''Test that clean test_soiling_srr_clean_threshold works with a float and - can cause no soiling intervals to be found''' + """Test that clean test_soiling_srr_clean_threshold works with a float and + can cause no soiling intervals to be found""" np.random.seed(1977) - sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily, soiling_insolation, reps=10, - clean_threshold=0.01) + sr, sr_ci, soiling_info = soiling_srr( + soiling_normalized_daily, soiling_insolation, reps=10, clean_threshold=0.01) assert 0.964369 == pytest.approx(sr, abs=1e-6), \ - 'Soiling ratio with specified clean_threshold different from expected value' + "Soiling ratio with specified clean_threshold different from expected value" with pytest.raises(NoValidIntervalError): np.random.seed(1977) - sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily, soiling_insolation, - reps=10, clean_threshold=0.1) + sr, sr_ci, soiling_info = soiling_srr( + soiling_normalized_daily, soiling_insolation, reps=10, clean_threshold=0.1) def test_soiling_srr_trim(soiling_normalized_daily, soiling_insolation): np.random.seed(1977) - sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily, soiling_insolation, reps=10, - trim=True) + sr, sr_ci, soiling_info = soiling_srr( + soiling_normalized_daily, soiling_insolation, reps=10, trim=True) assert 0.978093 == pytest.approx(sr, abs=1e-6), \ - 'Soiling ratio with trim=True different from expected value' - assert len(soiling_info['soiling_interval_summary']) == 1, \ - 'Wrong number of soiling intervals found with trim=True' - - -@pytest.mark.parametrize('method,expected_sr', - [('random_clean', 0.920444), - ('perfect_clean', 0.966912) - ]) -def test_soiling_srr_method(soiling_normalized_daily, soiling_insolation, method, expected_sr): + "Soiling ratio with trim=True different from expected value" + assert (len(soiling_info["soiling_interval_summary"]) == 1), \ + "Wrong number of soiling intervals found with trim=True" + + +@pytest.mark.parametrize( + "method,neg_shift,piecewise,expected_sr", + [("random_clean", False, False, 0.920444), + ("perfect_clean", False, False, 0.966912), + ("perfect_clean_complex", True, True, 0.966912), + ("inferred_clean_complex", True, True, 0.965565)]) +def test_soiling_srr_method( + soiling_normalized_daily, soiling_insolation, method, neg_shift, piecewise, expected_sr +): np.random.seed(1977) - sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily, soiling_insolation, reps=10, - method=method) + sr, sr_ci, soiling_info = soiling_srr( + soiling_normalized_daily, soiling_insolation, reps=10, method=method, + neg_shift=neg_shift, piecewise=piecewise + ) assert expected_sr == pytest.approx(sr, abs=1e-6), \ f'Soiling ratio with method="{method}" different from expected value' def test_soiling_srr_min_interval_length(soiling_normalized_daily, soiling_insolation): - 'Test that a long minimum interval length prevents finding shorter intervals' + "Test that a long minimum interval length prevents finding shorter intervals" with 
pytest.raises(NoValidIntervalError): np.random.seed(1977) # normalized_daily intervals are 25 days long, so min=26 should fail: @@ -172,12 +187,12 @@ def test_soiling_srr_min_interval_length(soiling_normalized_daily, soiling_insol def test_soiling_srr_recenter_false(soiling_normalized_daily, soiling_insolation): np.random.seed(1977) - sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily, soiling_insolation, reps=10, - recenter=False) - assert 1 == soiling_info['renormalizing_factor'], \ - 'Renormalizing factor != 1 with recenter=False' + sr, sr_ci, soiling_info = soiling_srr( + soiling_normalized_daily, soiling_insolation, reps=10, recenter=False) + assert (1 == soiling_info["renormalizing_factor"]), \ + "Renormalizing factor != 1 with recenter=False" assert 0.966387 == pytest.approx(sr, abs=1e-6), \ - 'Soiling ratio different than expected when recenter=False' + "Soiling ratio different than expected when recenter=False" def test_soiling_srr_negative_step(soiling_normalized_daily, soiling_insolation): @@ -185,97 +200,107 @@ def test_soiling_srr_negative_step(soiling_normalized_daily, soiling_insolation) stepped_daily.iloc[37:] = stepped_daily.iloc[37:] - 0.1 np.random.seed(1977) - with pytest.warns(UserWarning, match='20% or more of the daily data'): + with pytest.warns(UserWarning, match="20% or more of the daily data"): sr, sr_ci, soiling_info = soiling_srr(stepped_daily, soiling_insolation, reps=10) - assert list(soiling_info['soiling_interval_summary']['valid'].values) == [True, False, True], \ - 'Soiling interval validity differs from expected when a large negative step\ - is incorporated into the data' + assert list(soiling_info["soiling_interval_summary"]["valid"].values) == [ + True, False, True], \ + "Soiling interval validity differs from expected when a large negative step\ + is incorporated into the data" assert 0.936932 == pytest.approx(sr, abs=1e-6), \ - 'Soiling ratio different from expected when a large negative step is incorporated into the data' # noqa: E501 + "Soiling ratio different from expected when a large negative step is\ + incorporated into the data" def test_soiling_srr_max_negative_slope_error(soiling_normalized_daily, soiling_insolation): np.random.seed(1977) - with pytest.warns(UserWarning, match='20% or more of the daily data'): + with pytest.warns(UserWarning, match="20% or more of the daily data"): sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily, soiling_insolation, reps=10, max_relative_slope_error=45.0) - assert list(soiling_info['soiling_interval_summary']['valid'].values) == [True, True, False], \ - 'Soiling interval validity differs from expected when max_relative_slope_error=45.0' + assert list(soiling_info["soiling_interval_summary"]["valid"].values) == [ + True, True, False], \ + "Soiling interval validity differs from expected when max_relative_slope_error=45.0" assert 0.958761 == pytest.approx(sr, abs=1e-6), \ - 'Soiling ratio different from expected when max_relative_slope_error=45.0' + "Soiling ratio different from expected when max_relative_slope_error=45.0" def test_soiling_srr_with_nan_interval(soiling_normalized_daily, soiling_insolation): - ''' + """ Previous versions had a bug which would have raised an error when an entire interval was NaN. 
See https://github.com/NREL/rdtools/issues/129 - ''' + """ reps = 10 normalized_corrupt = soiling_normalized_daily.copy() normalized_corrupt[26:50] = np.nan np.random.seed(1977) - with pytest.warns(UserWarning, match='20% or more of the daily data'): + with pytest.warns(UserWarning, match="20% or more of the daily data"): sr, sr_ci, soiling_info = soiling_srr(normalized_corrupt, soiling_insolation, reps=reps) assert 0.948792 == pytest.approx(sr, abs=1e-6), \ - 'Soiling ratio different from expected value when an entire interval was NaN' + "Soiling ratio different from expected value when an entire interval was NaN" + + with pytest.warns(UserWarning, match="20% or more of the daily data"): + sr, sr_ci, soiling_info = soiling_srr(normalized_corrupt, soiling_insolation, + reps=reps, method="perfect_clean_complex", + piecewise=True, neg_shift=True) + assert 0.974225 == pytest.approx(sr, abs=1e-6), \ + "Soiling ratio different from expected value when an entire interval was NaN" def test_soiling_srr_outlier_factor(soiling_normalized_daily, soiling_insolation): - _, _, info = soiling_srr(soiling_normalized_daily, soiling_insolation, - reps=1, outlier_factor=8) - assert len(info['soiling_interval_summary']) == 2, \ - 'Increasing the outlier_factor did not result in the expected number of soiling intervals' + _, _, info = soiling_srr( + soiling_normalized_daily, soiling_insolation, reps=1, outlier_factor=8 + ) + assert (len(info["soiling_interval_summary"]) == 2), \ + "Increasing the outlier_factor did not result in the expected number of soiling intervals" def test_soiling_srr_kwargs(monkeypatch, soiling_normalized_daily, soiling_insolation): - ''' + """ Make sure that all soiling_srr parameters get passed on to SRRAnalysis and SRRAnalysis.run(), i.e. all necessary inputs to SRRAnalysis are provided by soiling_srr. Done by removing the SRRAnalysis default param values and making sure everything still runs. 
- ''' + """ # the __defaults__ attr is the tuple of default values in py3 monkeypatch.delattr(SRRAnalysis.__init__, "__defaults__") monkeypatch.delattr(SRRAnalysis.run, "__defaults__") _ = soiling_srr(soiling_normalized_daily, soiling_insolation, reps=10) -@pytest.mark.parametrize(('start,expected_sr'), - [(18, 0.984779), (17, 0.981258)]) -def test_soiling_srr_min_interval_length_default(soiling_normalized_daily, soiling_insolation, - start, expected_sr): - ''' +@pytest.mark.parametrize(("start,expected_sr"), [(18, 0.984779), (17, 0.981258)]) +def test_soiling_srr_min_interval_length_default( + soiling_normalized_daily, soiling_insolation, start, expected_sr): + """ Make sure that the default value of min_interval_length is 7 days by testing on a cropped version of the example data - ''' + """ reps = 10 np.random.seed(1977) - sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily[start:], - soiling_insolation[start:], reps=reps) + sr, sr_ci, soiling_info = soiling_srr( + soiling_normalized_daily[start:], soiling_insolation[start:], reps=reps + ) assert expected_sr == pytest.approx(sr, abs=1e-6), \ - 'Soiling ratio different from expected value' + "Soiling ratio different from expected value" -@pytest.mark.parametrize('test_param', ['energy_normalized_daily', - 'insolation_daily', - 'precipitation_daily']) +@pytest.mark.parametrize( + "test_param", ["energy_normalized_daily", "insolation_daily", "precipitation_daily"]) def test_soiling_srr_non_daily_inputs(test_param): - ''' + """ Validate the frequency check for input time series - ''' - dummy_daily_explicit = pd.Series(0, index=pd.date_range('2019-01-01', periods=10, freq='d')) - dummy_daily_implicit = pd.Series(0, index=pd.date_range('2019-01-01', periods=10, freq='d')) + """ + dummy_daily_explicit = pd.Series(0, index=pd.date_range("2019-01-01", periods=10, freq="d")) + dummy_daily_implicit = pd.Series(0, index=pd.date_range("2019-01-01", periods=10, freq="d")) dummy_daily_implicit.index.freq = None dummy_nondaily = pd.Series(0, index=dummy_daily_explicit.index[::2]) kwargs = { - 'energy_normalized_daily': dummy_daily_explicit, - 'insolation_daily': dummy_daily_explicit, - 'precipitation_daily': dummy_daily_explicit, + "energy_normalized_daily": dummy_daily_explicit, + "insolation_daily": dummy_daily_explicit, + "precipitation_daily": dummy_daily_explicit, } # no error for implicit daily inputs kwargs[test_param] = dummy_daily_implicit @@ -283,27 +308,104 @@ def test_soiling_srr_non_daily_inputs(test_param): # yes error for non-daily inputs kwargs[test_param] = dummy_nondaily - with pytest.raises(ValueError, match='must have daily frequency'): + with pytest.raises(ValueError, match="must have daily frequency"): _ = SRRAnalysis(**kwargs) def test_soiling_srr_argument_checks(soiling_normalized_daily, soiling_insolation): - ''' + """ Make sure various argument validation warnings and errors are raised - ''' + """ kwargs = { - 'energy_normalized_daily': soiling_normalized_daily, - 'insolation_daily': soiling_insolation, - 'reps': 10 + "energy_normalized_daily": soiling_normalized_daily, + "insolation_daily": soiling_insolation, + "reps": 10, } - with pytest.warns(UserWarning, match='An even value of day_scale was passed'): + with pytest.warns(UserWarning, match="An even value of day_scale was passed"): _ = soiling_srr(day_scale=12, **kwargs) - with pytest.raises(ValueError, match='clean_criterion must be one of'): - _ = soiling_srr(clean_criterion='bad', **kwargs) + with pytest.raises(ValueError, match="clean_criterion must be one 
of"): + _ = soiling_srr(clean_criterion="bad", **kwargs) - with pytest.raises(ValueError, match='Invalid method specification'): - _ = soiling_srr(method='bad', **kwargs) + with pytest.raises(ValueError, match="Invalid method specification"): + _ = soiling_srr(method="bad", **kwargs) + + +# ########################### +# negetive shift and piecewise tests +# ########################### +@pytest.mark.parametrize( + "method,neg_shift,expected_sr", + [("half_norm_clean", False, 0.980143), + ("half_norm_clean", True, 0.975057), + ("perfect_clean_complex", True, 0.964117), + ("inferred_clean_complex", True, 0.963585)]) +def test_negative_shifts( + soiling_normalized_daily_with_neg_shifts, soiling_insolation, soiling_times, + method, neg_shift, expected_sr): + reps = 10 + np.random.seed(1977) + sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily_with_neg_shifts, + soiling_insolation, reps=reps, + method=method, neg_shift=neg_shift) + assert expected_sr == pytest.approx(sr, abs=1e-6), \ + f'Soiling ratio with method="{method}" and neg_shift="{neg_shift}" \ + different from expected value' + + +@pytest.mark.parametrize( + "method,piecewise,expected_sr", + [("half_norm_clean", False, 0.8670264), + ("half_norm_clean", True, 0.927017), + ("perfect_clean_complex", True, 0.896936), + ("inferred_clean_complex", True, 0.896214)]) +def test_piecewise(soiling_normalized_daily_with_piecewise_slope, soiling_insolation, + soiling_times, method, piecewise, expected_sr): + reps = 10 + np.random.seed(1977) + sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily_with_piecewise_slope, + soiling_insolation, reps=reps, method=method, + piecewise=piecewise) + assert expected_sr == pytest.approx(sr, abs=1e-6), \ + f'Soiling ratio with method="{method}" and piecewise="{piecewise}" \ + different from expected value' + + +def test_piecewise_and_neg_shifts(soiling_normalized_daily_with_piecewise_slope, + soiling_normalized_daily_with_neg_shifts, + soiling_insolation, soiling_times): + reps = 10 + np.random.seed(1977) + sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily_with_piecewise_slope, + soiling_insolation, reps=reps, + method="perfect_clean_complex", piecewise=True, + neg_shift=True) + assert 0.896936 == pytest.approx(sr, abs=1e-6), \ + "Soiling ratio different from expected value for data with piecewise slopes" + np.random.seed(1977) + sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily_with_neg_shifts, + soiling_insolation, reps=reps, + method="perfect_clean_complex", piecewise=True, + neg_shift=True) + assert 0.964117 == pytest.approx(sr, abs=1e-6), \ + "Soiling ratio different from expected value for data with negative shifts" + + +def test_complex_sr_clean_threshold(soiling_normalized_daily_with_neg_shifts, soiling_insolation): + """Test that clean test_soiling_srr_clean_threshold works with a float and + can cause no soiling intervals to be found""" + np.random.seed(1977) + sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily_with_neg_shifts, + soiling_insolation, reps=10, clean_threshold=0.1, + method="perfect_clean_complex", piecewise=True, + neg_shift=True) + assert 0.934926 == pytest.approx(sr, abs=1e-6), \ + "Soiling ratio with specified clean_threshold different from expected value" + + with pytest.raises(NoValidIntervalError): + np.random.seed(1977) + sr, sr_ci, soiling_info = soiling_srr(soiling_normalized_daily_with_neg_shifts, + soiling_insolation, reps=10, clean_threshold=1) # ########################### @@ -313,25 +415,24 @@ def 
test_soiling_srr_argument_checks(soiling_normalized_daily, soiling_insolatio @pytest.fixture() def multi_year_profiles(): - times = pd.date_range('01-01-2018', '11-30-2019', freq='D') - data = np.array([0]*365 + [10]*334) + times = pd.date_range("01-01-2018", "11-30-2019", freq="D") + data = np.array([0] * 365 + [10] * 334) profiles = [pd.Series(x + data, times) for x in range(10)] # make insolation slighly longer to test for proper normalization - times = pd.date_range('01-01-2018', '12-31-2019', freq='D') - insolation = 350*[0.8] + (len(times)-350)*[1] + times = pd.date_range("01-01-2018", "12-31-2019", freq="D") + insolation = 350 * [0.8] + (len(times) - 350) * [1] insolation = pd.Series(insolation, index=times) return profiles, insolation def test_annual_soiling_ratios(multi_year_profiles): - expected_data = np.array([[2018, 4.5, 1.431, 7.569], - [2019, 14.5, 11.431, 17.569]]) - expected = pd.DataFrame(data=expected_data, - columns=['year', 'soiling_ratio_median', 'soiling_ratio_low', - 'soiling_ratio_high']) - expected['year'] = expected['year'].astype(int) + expected_data = np.array([[2018, 4.5, 1.431, 7.569], [2019, 14.5, 11.431, 17.569]]) + expected = pd.DataFrame( + data=expected_data, + columns=["year", "soiling_ratio_median", "soiling_ratio_low", "soiling_ratio_high"]) + expected["year"] = expected["year"].astype(int) srr_profiles, insolation = multi_year_profiles result = annual_soiling_ratios(srr_profiles, insolation) @@ -340,12 +441,11 @@ def test_annual_soiling_ratios(multi_year_profiles): def test_annual_soiling_ratios_confidence_interval(multi_year_profiles): - expected_data = np.array([[2018, 4.5, 0.225, 8.775], - [2019, 14.5, 10.225, 18.775]]) - expected = pd.DataFrame(data=expected_data, - columns=['year', 'soiling_ratio_median', 'soiling_ratio_low', - 'soiling_ratio_high']) - expected['year'] = expected['year'].astype(int) + expected_data = np.array([[2018, 4.5, 0.225, 8.775], [2019, 14.5, 10.225, 18.775]]) + expected = pd.DataFrame( + data=expected_data, + columns=["year", "soiling_ratio_median", "soiling_ratio_low", "soiling_ratio_high"]) + expected["year"] = expected["year"].astype(int) srr_profiles, insolation = multi_year_profiles result = annual_soiling_ratios(srr_profiles, insolation, confidence_level=95) @@ -356,9 +456,10 @@ def test_annual_soiling_ratios_confidence_interval(multi_year_profiles): def test_annual_soiling_ratios_warning(multi_year_profiles): srr_profiles, insolation = multi_year_profiles insolation = insolation.iloc[:-200] - match = ('The indexes of stochastic_soiling_profiles are not entirely contained ' - 'within the index of insolation_daily. Every day in stochastic_soiling_profiles ' - 'should be represented in insolation_daily. This may cause erroneous results.') + match = ( + "The indexes of stochastic_soiling_profiles are not entirely contained " + "within the index of insolation_daily. Every day in stochastic_soiling_profiles " + "should be represented in insolation_daily. 
This may cause erroneous results.") with pytest.warns(UserWarning, match=match): _ = annual_soiling_ratios(srr_profiles, insolation) @@ -370,41 +471,42 @@ def test_annual_soiling_ratios_warning(multi_year_profiles): @pytest.fixture() def soiling_interval_summary(): - starts = ['2019/01/01', '2019/01/16', '2019/02/08', '2019/03/06'] - starts = pd.to_datetime(starts).tz_localize('America/Denver') - ends = ['2019/01/15', '2019/02/07', '2019/03/05', '2019/04/07'] - ends = pd.to_datetime(ends).tz_localize('America/Denver') + starts = ["2019/01/01", "2019/01/16", "2019/02/08", "2019/03/06"] + starts = pd.to_datetime(starts).tz_localize("America/Denver") + ends = ["2019/01/15", "2019/02/07", "2019/03/05", "2019/04/07"] + ends = pd.to_datetime(ends).tz_localize("America/Denver") slopes = [-0.005, -0.002, -0.001, -0.002] slopes_low = [-0.0055, -0.0025, -0.0015, -0.003] slopes_high = [-0.004, 0, 0, -0.001] valids = [True, True, False, True] soiling_interval_summary = pd.DataFrame() - soiling_interval_summary['start'] = starts - soiling_interval_summary['end'] = ends - soiling_interval_summary['soiling_rate'] = slopes - soiling_interval_summary['soiling_rate_low'] = slopes_low - soiling_interval_summary['soiling_rate_high'] = slopes_high - soiling_interval_summary['inferred_start_loss'] = np.nan - soiling_interval_summary['inferred_end_loss'] = np.nan - soiling_interval_summary['length'] = (ends - starts).days - soiling_interval_summary['valid'] = valids + soiling_interval_summary["start"] = starts + soiling_interval_summary["end"] = ends + soiling_interval_summary["soiling_rate"] = slopes + soiling_interval_summary["soiling_rate_low"] = slopes_low + soiling_interval_summary["soiling_rate_high"] = slopes_high + soiling_interval_summary["inferred_start_loss"] = np.nan + soiling_interval_summary["inferred_end_loss"] = np.nan + soiling_interval_summary["length"] = (ends - starts).days + soiling_interval_summary["valid"] = valids return soiling_interval_summary def _build_monthly_summary(top_rows): - ''' + """ Convienience function to build a full monthly soiling summary dataframe from the expected_top_rows which summarize Jan-April - ''' + """ - all_rows = np.vstack((top_rows, [[1, np.nan, np.nan, np.nan, 0]]*8)) + all_rows = np.vstack((top_rows, [[1, np.nan, np.nan, np.nan, 0]] * 8)) - df = pd.DataFrame(data=all_rows, - columns=['month', 'soiling_rate_median', 'soiling_rate_low', - 'soiling_rate_high', 'interval_count']) - df['month'] = range(1, 13) + df = pd.DataFrame( + data=all_rows, + columns=["month", "soiling_rate_median", "soiling_rate_low", + "soiling_rate_high", "interval_count"]) + df["month"] = range(1, 13) return df @@ -413,11 +515,11 @@ def test_monthly_soiling_rates(soiling_interval_summary): np.random.seed(1977) result = monthly_soiling_rates(soiling_interval_summary) - expected = np.array([ - [1.00000000e+00, -2.42103810e-03, -5.00912766e-03, -7.68551806e-04, 2.00000000e+00], - [2.00000000e+00, -1.25092837e-03, -2.10091842e-03, -3.97354321e-04, 1.00000000e+00], - [3.00000000e+00, -2.00313359e-03, -2.68359541e-03, -1.31927678e-03, 1.00000000e+00], - [4.00000000e+00, -1.99729563e-03, -2.68067699e-03, -1.31667446e-03, 1.00000000e+00]]) + expected = np.array( + [[1.00000000e00, -2.42103810e-03, -5.00912766e-03, -7.68551806e-04, 2.00000000e00], + [2.00000000e00, -1.25092837e-03, -2.10091842e-03, -3.97354321e-04, 1.00000000e00], + [3.00000000e00, -2.00313359e-03, -2.68359541e-03, -1.31927678e-03, 1.00000000e00], + [4.00000000e00, -1.99729563e-03, -2.68067699e-03, -1.31667446e-03, 
1.00000000e00]]) expected = _build_monthly_summary(expected) pd.testing.assert_frame_equal(result, expected, check_dtype=False) @@ -427,11 +529,11 @@ def test_monthly_soiling_rates_min_interval_length(soiling_interval_summary): np.random.seed(1977) result = monthly_soiling_rates(soiling_interval_summary, min_interval_length=20) - expected = np.array([ - [1.00000000e+00, -1.24851539e-03, -2.10394564e-03, -3.98358211e-04, 1.00000000e+00], - [2.00000000e+00, -1.25092837e-03, -2.10091842e-03, -3.97330424e-04, 1.00000000e+00], - [3.00000000e+00, -2.00309454e-03, -2.68359541e-03, -1.31927678e-03, 1.00000000e+00], - [4.00000000e+00, -1.99729563e-03, -2.68067699e-03, -1.31667446e-03, 1.00000000e+00]]) + expected = np.array( + [[1.00000000e00, -1.24851539e-03, -2.10394564e-03, -3.98358211e-04, 1.00000000e00], + [2.00000000e00, -1.25092837e-03, -2.10091842e-03, -3.97330424e-04, 1.00000000e00], + [3.00000000e00, -2.00309454e-03, -2.68359541e-03, -1.31927678e-03, 1.00000000e00], + [4.00000000e00, -1.99729563e-03, -2.68067699e-03, -1.31667446e-03, 1.00000000e00]]) expected = _build_monthly_summary(expected) pd.testing.assert_frame_equal(result, expected, check_dtype=False) @@ -439,13 +541,15 @@ def test_monthly_soiling_rates_min_interval_length(soiling_interval_summary): def test_monthly_soiling_rates_max_slope_err(soiling_interval_summary): np.random.seed(1977) - result = monthly_soiling_rates(soiling_interval_summary, max_relative_slope_error=120) - - expected = np.array([ - [1.00000000e+00, -4.74910923e-03, -5.26236739e-03, -4.23901493e-03, 1.00000000e+00], - [2.00000000e+00, np.nan, np.nan, np.nan, 0.00000000e+00], - [3.00000000e+00, -2.00074270e-03, -2.68073474e-03, -1.31786434e-03, 1.00000000e+00], - [4.00000000e+00, -2.00309454e-03, -2.68359541e-03, -1.31927678e-03, 1.00000000e+00]]) + result = monthly_soiling_rates( + soiling_interval_summary, max_relative_slope_error=120 + ) + + expected = np.array( + [[1.00000000e00, -4.74910923e-03, -5.26236739e-03, -4.23901493e-03, 1.00000000e00], + [2.00000000e00, np.nan, np.nan, np.nan, 0.00000000e00], + [3.00000000e00, -2.00074270e-03, -2.68073474e-03, -1.31786434e-03, 1.00000000e00], + [4.00000000e00, -2.00309454e-03, -2.68359541e-03, -1.31927678e-03, 1.00000000e00]]) expected = _build_monthly_summary(expected) pd.testing.assert_frame_equal(result, expected, check_dtype=False) @@ -455,11 +559,11 @@ def test_monthly_soiling_rates_confidence_level(soiling_interval_summary): np.random.seed(1977) result = monthly_soiling_rates(soiling_interval_summary, confidence_level=95) - expected = np.array([ - [1.00000000e+00, -2.42103810e-03, -5.42313113e-03, -1.21156562e-04, 2.00000000e+00], - [2.00000000e+00, -1.25092837e-03, -2.43731574e-03, -6.23842627e-05, 1.00000000e+00], - [3.00000000e+00, -2.00313359e-03, -2.94998476e-03, -1.04988760e-03, 1.00000000e+00], - [4.00000000e+00, -1.99729563e-03, -2.95063841e-03, -1.04869949e-03, 1.00000000e+00]]) + expected = np.array( + [[1.00000000e00, -2.42103810e-03, -5.42313113e-03, -1.21156562e-04, 2.00000000e00], + [2.00000000e00, -1.25092837e-03, -2.43731574e-03, -6.23842627e-05, 1.00000000e00], + [3.00000000e00, -2.00313359e-03, -2.94998476e-03, -1.04988760e-03, 1.00000000e00], + [4.00000000e00, -1.99729563e-03, -2.95063841e-03, -1.04869949e-03, 1.00000000e00]]) expected = _build_monthly_summary(expected) @@ -470,12 +574,95 @@ def test_monthly_soiling_rates_reps(soiling_interval_summary): np.random.seed(1977) result = monthly_soiling_rates(soiling_interval_summary, reps=3) - expected = np.array([ - [1.00000000e+00, 
-2.88594088e-03, -5.03736679e-03, -6.47391131e-04, 2.00000000e+00],
-        [2.00000000e+00, -1.67359565e-03, -2.00504171e-03, -1.33240044e-03, 1.00000000e+00],
-        [3.00000000e+00, -1.22306993e-03, -2.19274892e-03, -1.11793240e-03, 1.00000000e+00],
-        [4.00000000e+00, -1.94675549e-03, -2.42574164e-03, -1.54850795e-03, 1.00000000e+00]])
+    expected = np.array(
+        [[1.00000000e00, -2.88594088e-03, -5.03736679e-03, -6.47391131e-04, 2.00000000e00],
+         [2.00000000e00, -1.67359565e-03, -2.00504171e-03, -1.33240044e-03, 1.00000000e00],
+         [3.00000000e00, -1.22306993e-03, -2.19274892e-03, -1.11793240e-03, 1.00000000e00],
+         [4.00000000e00, -1.94675549e-03, -2.42574164e-03, -1.54850795e-03, 1.00000000e00]])
     expected = _build_monthly_summary(expected)
     pd.testing.assert_frame_equal(result, expected, check_dtype=False)
+
+
+# ######################################
+# invalid segmented_soiling_period tests
+# ######################################
+
+
+@pytest.fixture
+def pr_series():
+    """
+    pandas.Series of daily performance ratios measured during a single deposition
+    period, with a DatetimeIndex and a length of 10.
+    """
+    pr_idx = pd.date_range(start="2022-01-01", periods=10, freq="D")
+    pr_series = pd.Series(np.random.rand(10), index=pr_idx)
+    return pr_series
+
+
+def test_no_datetime_index_pr(pr_series):
+    """
+    Tests that a ValueError is raised when pr_series does not have a DatetimeIndex.
+    """
+    pr = pr_series.reset_index()
+    with pytest.raises(ValueError, match="The time series does not have DatetimeIndex"):
+        _ = segmented_soiling_period(pr)
+
+
+def test_no_change_point(pr_series):
+    """
+    Tests that no change point is found when fitting the soiling profile with segmentation.
+    """
+    days_clean_vs_cp = 7
+    result_sr, result_cp_date = segmented_soiling_period(pr_series,
+                                                         days_clean_vs_cp=days_clean_vs_cp)
+    expected_sr = pd.Series([np.nan]*len(pr_series), index=pr_series.index)
+    expected_cp_date = None
+
+    pd.testing.assert_series_equal(result_sr, expected_sr)
+    assert result_cp_date == expected_cp_date
+
+
+def test_except_block():
+    """
+    Tests the except block for when segmentation was not possible.
+    """
+    pr_idx = pd.date_range(start="2022-01-01", periods=5, freq="D")
+    pr_series = pd.Series(np.array([1, 2, 3, 4, 5]), index=pr_idx)
+    result_sr, result_cp_date = segmented_soiling_period(pr_series)
+
+    expected_sr = pd.Series([np.nan]*len(pr_series), index=pr_series.index)
+    expected_cp_date = None
+
+    pd.testing.assert_series_equal(result_sr, expected_sr)
+    assert result_cp_date == expected_cp_date
+
+
+def test_short_segmentation_periods():
+    """
+    Tests that segmentation fails for short soiling periods.
+    """
+    pr_idx = pd.date_range(start="2022-01-01", periods=35, freq="D")
+    pr_series = pd.Series(np.random.normal(loc=5, scale=2, size=35), index=pr_idx)
+    result_sr, result_cp_date = segmented_soiling_period(pr_series)
+
+    expected_sr = pd.Series([np.nan]*len(pr_series), index=pr_series.index)
+    expected_cp_date = None
+
+    pd.testing.assert_series_equal(result_sr, expected_sr)
+    assert result_cp_date == expected_cp_date
+
+
+def test_long_segmentation_periods():
+    "Tests that segmentation fails for longer soiling periods."
+ pr_idx = pd.date_range(start="2022-01-01", periods=47, freq="D") + testing_list = list(np.arange(46)) + [50] + pr_series = pd.Series(testing_list, index=pr_idx) + result_sr, result_cp_date = segmented_soiling_period(pr_series) + + expected_sr = pd.Series([np.nan]*len(pr_series), index=pr_series.index) + expected_cp_date = None + + pd.testing.assert_series_equal(result_sr, expected_sr) + assert result_cp_date == expected_cp_date
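
A minimal usage sketch of the behavior exercised by the tests above, for reviewers. The series below is synthetic and illustrative only: it mirrors the soiling_normalized_daily_with_piecewise_slope fixture added in conftest.py, paired with a made-up constant insolation. The keyword arguments shown are the ones introduced by this patch, and the segmented fit may legitimately return NaNs and None if the piecewise model does not improve the fit enough.

import numpy as np
import pandas as pd

from rdtools.soiling import segmented_soiling_period, soiling_srr

# Synthetic 75-day normalized-energy series whose soiling rate steepens partway
# through each deposition period (same shape as the new piecewise fixture).
profile = np.concatenate((1 - 0.002 * np.arange(20),
                          (1 - 0.002 * 20) - 0.007 * np.arange(20),
                          1 - 0.01 * np.arange(20),
                          (1 - 0.01 * 20) - 0.001 * np.arange(15)))
np.random.seed(1977)
times = pd.date_range("2019-01-01", periods=75, freq="D")
normalized_daily = pd.Series(profile + 0.01 * np.random.rand(75), index=times)
insolation_daily = pd.Series(1000.0, index=times)  # assumed constant, for illustration

# SRR analysis using the new complex method with piecewise and negative-shift handling
sr, sr_ci, info = soiling_srr(normalized_daily, insolation_daily, reps=10,
                              method="perfect_clean_complex",
                              piecewise=True, neg_shift=True)

# Segmented fit of a single deposition period (the 40 days before the first cleaning);
# returns a NaN series and None if no acceptable change point is found.
sr_fit, cp_index = segmented_soiling_period(normalized_daily.iloc[:40])
print(sr, cp_index)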