From 13067ff3dcc34f67f66e6e6815818a67188896f0 Mon Sep 17 00:00:00 2001 From: martin-springer Date: Wed, 10 Apr 2024 09:42:06 -0400 Subject: [PATCH] fix linting --- rdtools/analysis_chains.py | 791 +++++++++++++++++++++---------------- rdtools/filtering.py | 668 +++++++++++++++++-------------- 2 files changed, 826 insertions(+), 633 deletions(-) diff --git a/rdtools/analysis_chains.py b/rdtools/analysis_chains.py index 8ffa750e6..7c613149f 100644 --- a/rdtools/analysis_chains.py +++ b/rdtools/analysis_chains.py @@ -1,7 +1,8 @@ -''' +""" This module contains functions and classes for object-oriented end-to-end analysis -''' +""" + import pvlib import pandas as pd import numpy as np @@ -11,8 +12,8 @@ import warnings -class TrendAnalysis(): - ''' +class TrendAnalysis: + """ Class for end-to-end degradation and soiling analysis using :py:meth:`~rdtools.TrendAnalysis.sensor_analysis` or :py:meth:`~rdtools.TrendAnalysis.clearsky_analysis` @@ -85,37 +86,51 @@ class TrendAnalysis(): defaults to None. See examples for more information. results : dict Nested dict used to store the results of methods ending with `_analysis` - ''' - - def __init__(self, pv, poa_global=None, temperature_cell=None, temperature_ambient=None, - gamma_pdc=None, aggregation_freq='D', pv_input='power', - windspeed=0, power_expected=None, temperature_model=None, - power_dc_rated=None, interp_freq=None, max_timedelta=None): + """ + + def __init__( + self, + pv, + poa_global=None, + temperature_cell=None, + temperature_ambient=None, + gamma_pdc=None, + aggregation_freq="D", + pv_input="power", + windspeed=0, + power_expected=None, + temperature_model=None, + power_dc_rated=None, + interp_freq=None, + max_timedelta=None, + ): if interp_freq is not None: pv = normalization.interpolate(pv, interp_freq, max_timedelta) if poa_global is not None: - poa_global = normalization.interpolate( - poa_global, pv.index, max_timedelta) + poa_global = normalization.interpolate(poa_global, pv.index, max_timedelta) if temperature_cell is not None: temperature_cell = normalization.interpolate( - temperature_cell, pv.index, max_timedelta) + temperature_cell, pv.index, max_timedelta + ) if temperature_ambient is not None: temperature_ambient = normalization.interpolate( - temperature_ambient, pv.index, max_timedelta) + temperature_ambient, pv.index, max_timedelta + ) if power_expected is not None: power_expected = normalization.interpolate( - power_expected, pv.index, max_timedelta) + power_expected, pv.index, max_timedelta + ) if isinstance(windspeed, pd.Series): - windspeed = normalization.interpolate( - windspeed, pv.index, max_timedelta) + windspeed = normalization.interpolate(windspeed, pv.index, max_timedelta) - if pv_input == 'power': + if pv_input == "power": self.pv_power = pv self.pv_energy = normalization.energy_from_power( - pv, max_timedelta=max_timedelta) - elif pv_input == 'energy': + pv, max_timedelta=max_timedelta + ) + elif pv_input == "energy": self.pv_power = None self.pv_energy = pv @@ -134,25 +149,30 @@ def __init__(self, pv, poa_global=None, temperature_cell=None, temperature_ambie # Initialize to use default filter parameters self.filter_params = { - 'normalized_filter': {}, - 'poa_filter': {}, - 'tcell_filter': {}, - 'clip_filter': {}, - 'pvlib_clearsky_filter': {}, - 'ad_hoc_filter': None # use this to include an explict filter - } - self.filter_params_aggregated = { - 'ad_hoc_filter': None + "normalized_filter": {}, + "poa_filter": {}, + "tcell_filter": {}, + "clip_filter": {}, + "pvlib_clearsky_filter": {}, + 
"ad_hoc_filter": None, # use this to include an explict filter } + self.filter_params_aggregated = {"ad_hoc_filter": None} # remove tcell_filter from list if power_expected is passed in if power_expected is not None and temperature_cell is None: - del self.filter_params['tcell_filter'] - - def set_clearsky(self, pvlib_location=None, pv_azimuth=None, pv_tilt=None, - poa_global_clearsky=None, temperature_cell_clearsky=None, - temperature_ambient_clearsky=None, albedo=0.25, - solar_position_method='nrel_numpy'): - ''' + del self.filter_params["tcell_filter"] + + def set_clearsky( + self, + pvlib_location=None, + pv_azimuth=None, + pv_tilt=None, + poa_global_clearsky=None, + temperature_cell_clearsky=None, + temperature_ambient_clearsky=None, + albedo=0.25, + solar_position_method="nrel_numpy", + ): + """ Initialize values for a clearsky analysis which requires configuration of location and orientation details. If optional parameters `poa_global_clearsky`, `temperature_ambient_clearsky` are not passed, they will be modeled @@ -183,24 +203,29 @@ def set_clearsky(self, pvlib_location=None, pv_azimuth=None, pv_tilt=None, solar_position_method : str, default 'nrel_numpy' Optional method name to pass to :py:func:`pvlib.solarposition.get_solarposition`. Switching methods may improve calculation time. - ''' + """ max_timedelta = self.max_timedelta if poa_global_clearsky is not None: poa_global_clearsky = normalization.interpolate( - poa_global_clearsky, self.pv_energy.index, max_timedelta) + poa_global_clearsky, self.pv_energy.index, max_timedelta + ) if temperature_cell_clearsky is not None: temperature_cell_clearsky = normalization.interpolate( - temperature_cell_clearsky, self.pv_energy.index, max_timedelta) + temperature_cell_clearsky, self.pv_energy.index, max_timedelta + ) if temperature_ambient_clearsky is not None: temperature_ambient_clearsky = normalization.interpolate( - temperature_ambient_clearsky, self.pv_energy.index, max_timedelta) + temperature_ambient_clearsky, self.pv_energy.index, max_timedelta + ) if isinstance(pv_azimuth, (pd.Series, pd.DataFrame)): pv_azimuth = normalization.interpolate( - pv_azimuth, self.pv_energy.index, max_timedelta) + pv_azimuth, self.pv_energy.index, max_timedelta + ) if isinstance(pv_tilt, (pd.Series, pd.DataFrame)): pv_tilt = normalization.interpolate( - pv_tilt, self.pv_energy.index, max_timedelta) + pv_tilt, self.pv_energy.index, max_timedelta + ) self.pvlib_location = pvlib_location self.pv_azimuth = pv_azimuth @@ -212,7 +237,7 @@ def set_clearsky(self, pvlib_location=None, pv_azimuth=None, pv_tilt=None, self.solar_position_method = solar_position_method def _calc_clearsky_poa(self, times=None, rescale=True, **kwargs): - ''' + """ Calculate clearsky plane-of-array irradiance and stores in self.poa_global_clearsky Parameters @@ -229,43 +254,46 @@ def _calc_clearsky_poa(self, times=None, rescale=True, **kwargs): Returns ------- None - ''' + """ aggregate = False if times is None: - times = pd.date_range(self.poa_global.index.min(), self.poa_global.index.max(), - freq='1min') + times = pd.date_range( + self.poa_global.index.min(), self.poa_global.index.max(), freq="1min" + ) aggregate = True - if not hasattr(self, 'pvlib_location'): + if not hasattr(self, "pvlib_location"): + raise ValueError("pvlib location must be provided using set_clearsky()") + if not hasattr(self, "pv_tilt") or not hasattr(self, "pv_azimuth"): raise ValueError( - 'pvlib location must be provided using set_clearsky()') - if not hasattr(self, 'pv_tilt') or not hasattr(self, 
'pv_azimuth'): - raise ValueError( - 'pv_tilt and pv_azimuth must be provided using set_clearsky()') + "pv_tilt and pv_azimuth must be provided using set_clearsky()" + ) loc = self.pvlib_location solar_position_kwargs = {} if self.solar_position_method: - solar_position_kwargs['method'] = self.solar_position_method + solar_position_kwargs["method"] = self.solar_position_method sun = loc.get_solarposition(times, **solar_position_kwargs) clearsky = loc.get_clearsky(times, solar_position=sun) clearsky_poa = pvlib.irradiance.get_total_irradiance( self.pv_tilt, self.pv_azimuth, - sun['apparent_zenith'], - sun['azimuth'], - clearsky['dni'], - clearsky['ghi'], - clearsky['dhi'], + sun["apparent_zenith"], + sun["azimuth"], + clearsky["dni"], + clearsky["ghi"], + clearsky["dhi"], albedo=self.albedo, - **kwargs) - clearsky_poa = clearsky_poa['poa_global'] + **kwargs, + ) + clearsky_poa = clearsky_poa["poa_global"] if aggregate: interval_id = pd.Series( - range(len(self.poa_global)), index=self.poa_global.index) - interval_id = interval_id.reindex(times, method='backfill') + range(len(self.poa_global)), index=self.poa_global.index + ) + interval_id = interval_id.reindex(times, method="backfill") clearsky_poa = clearsky_poa.groupby(interval_id).mean() clearsky_poa.index = self.poa_global.index clearsky_poa.iloc[0] = np.nan @@ -273,15 +301,17 @@ def _calc_clearsky_poa(self, times=None, rescale=True, **kwargs): if rescale is True: if not clearsky_poa.index.equals(self.poa_global.index): raise ValueError( - 'rescale=True can only be used when clearsky poa is on same index as poa') + "rescale=True can only be used when clearsky poa is on same index as poa" + ) clearsky_poa = normalization.irradiance_rescale( - self.poa_global, clearsky_poa, method='iterative') + self.poa_global, clearsky_poa, method="iterative" + ) self.poa_global_clearsky = clearsky_poa def _calc_cell_temperature(self, poa_global, temperature_ambient, windspeed): - ''' + """ Return cell temperature calculated from ambient conditions. Parameters @@ -297,7 +327,7 @@ def _calc_cell_temperature(self, poa_global, temperature_ambient, windspeed): ------- numeric calculated cell temperature - ''' + """ try: # workflow for pvlib >= 0.7 @@ -306,44 +336,50 @@ def _calc_cell_temperature(self, poa_global, temperature_ambient, windspeed): # check if self.temperature_model is a string or dict with keys 'a', 'b' and 'deltaT' if isinstance(self.temperature_model, str): - model_params = pvlib.temperature.TEMPERATURE_MODEL_PARAMETERS[ - 'sapm'][self.temperature_model] - elif (isinstance(self.temperature_model, dict) & - ('a' in self.temperature_model) & - ('b' in self.temperature_model) & - ('deltaT' in self.temperature_model)): + model_params = pvlib.temperature.TEMPERATURE_MODEL_PARAMETERS["sapm"][ + self.temperature_model + ] + elif ( + isinstance(self.temperature_model, dict) + & ("a" in self.temperature_model) + & ("b" in self.temperature_model) + & ("deltaT" in self.temperature_model) + ): model_params = self.temperature_model else: - raise ValueError('pvlib temperature_model entry is neither ' - 'a string nor a dictionary with correct ' - 'entries. Try "open_rack_glass_polymer"') - cell_temp = pvlib.temperature.sapm_cell(poa_global=poa_global, - temp_air=temperature_ambient, - wind_speed=windspeed, - **model_params - ) + raise ValueError( + "pvlib temperature_model entry is neither " + "a string nor a dictionary with correct " + 'entries. 
Try "open_rack_glass_polymer"' + ) + cell_temp = pvlib.temperature.sapm_cell( + poa_global=poa_global, + temp_air=temperature_ambient, + wind_speed=windspeed, + **model_params, + ) except AttributeError as e: - print('Error: PVLib > 0.7 required') + print("Error: PVLib > 0.7 required") raise e return cell_temp def _calc_clearsky_tamb(self): - ''' + """ Calculate clear-sky ambient temperature and store in self.temperature_ambient_clearsky - ''' + """ times = self.poa_global_clearsky.index - if not hasattr(self, 'pvlib_location'): - raise ValueError( - 'pvlib_location must be provided using set_clearsky()') + if not hasattr(self, "pvlib_location"): + raise ValueError("pvlib_location must be provided using set_clearsky()") loc = self.pvlib_location cs_amb_temp = clearsky_temperature.get_clearsky_tamb( - times, loc.latitude, loc.longitude) + times, loc.latitude, loc.longitude + ) self.temperature_ambient_clearsky = cs_amb_temp def _pvwatts_norm(self, poa_global, temperature_cell): - ''' + """ Normalize PV energy to that expected from a PVWatts model. Parameters @@ -359,7 +395,7 @@ def _pvwatts_norm(self, poa_global, temperature_cell): Normalized pv energy pandas.Series Associated insolation - ''' + """ if self.power_dc_rated is None: renorm = True @@ -369,17 +405,22 @@ def _pvwatts_norm(self, poa_global, temperature_cell): power_dc_rated = self.power_dc_rated if self.gamma_pdc is None: - warnings.warn('Temperature coefficient not passed in to TrendAnalysis' - '. No temperature correction will be conducted.') - pvwatts_kws = {"poa_global": poa_global, - "power_dc_rated": power_dc_rated, - "temperature_cell": temperature_cell, - "poa_global_ref": 1000, - "temperature_cell_ref": 25, - "gamma_pdc": self.gamma_pdc} + warnings.warn( + "Temperature coefficient not passed in to TrendAnalysis" + ". No temperature correction will be conducted." + ) + pvwatts_kws = { + "poa_global": poa_global, + "power_dc_rated": power_dc_rated, + "temperature_cell": temperature_cell, + "poa_global_ref": 1000, + "temperature_cell_ref": 25, + "gamma_pdc": self.gamma_pdc, + } energy_normalized, insolation = normalization.normalize_with_pvwatts( - self.pv_energy, pvwatts_kws) + self.pv_energy, pvwatts_kws + ) if renorm: # Normalize to the 95th percentile for convenience, this is renormalized out @@ -390,7 +431,7 @@ def _pvwatts_norm(self, poa_global, temperature_cell): return energy_normalized, insolation def _filter(self, energy_normalized, case): - ''' + """ Calculate filters based on those in rdtools.filtering. 
Uses self.filter_params, which is a dict, the keys of which are names of functions in rdtools.filtering, and the values of which are dicts @@ -410,17 +451,21 @@ def _filter(self, energy_normalized, case): Returns ------- None - ''' - + """ # Clearsky filtering subroutine, called either by clearsky analysis, # or sensor analysis using sensor_clearsky_filter def _call_clearsky_filter(filter_string): if self.poa_global is None or self.poa_global_clearsky is None: - raise ValueError('Both poa_global and poa_global_clearsky must be available to ' - f'do clearsky filtering with {filter_string}') + raise ValueError( + "Both poa_global and poa_global_clearsky must be available to " + f"do clearsky filtering with {filter_string}" + ) f = filtering.pvlib_clearsky_filter( - self.poa_global, self.poa_global_clearsky, **self.filter_params[filter_string]) + self.poa_global, + self.poa_global_clearsky, + **self.filter_params[filter_string], + ) return f # Combining filters is non-trivial because of the possibility of index @@ -430,85 +475,100 @@ def _call_clearsky_filter(filter_string): # at once. However, we add a default value of True, with the same index as # energy_normalized, so that the output is still correct even when all # filters have been disabled. - filter_components = {'default': pd.Series( - True, index=energy_normalized.index)} + filter_components = {"default": pd.Series(True, index=energy_normalized.index)} - if case == 'sensor': + if case == "sensor": poa = self.poa_global cell_temp = self.temperature_cell - if case == 'clearsky': + if case == "clearsky": poa = self.poa_global_clearsky cell_temp = self.temperature_cell_clearsky - if 'normalized_filter' in self.filter_params: + if "normalized_filter" in self.filter_params: f = filtering.normalized_filter( - energy_normalized, **self.filter_params['normalized_filter']) - filter_components['normalized_filter'] = f - if 'poa_filter' in self.filter_params: + energy_normalized, **self.filter_params["normalized_filter"] + ) + filter_components["normalized_filter"] = f + if "poa_filter" in self.filter_params: if poa is None: - raise ValueError('poa must be available to use poa_filter') - f = filtering.poa_filter(poa, **self.filter_params['poa_filter']) - filter_components['poa_filter'] = f - if 'tcell_filter' in self.filter_params: + raise ValueError("poa must be available to use poa_filter") + f = filtering.poa_filter(poa, **self.filter_params["poa_filter"]) + filter_components["poa_filter"] = f + if "tcell_filter" in self.filter_params: if cell_temp is None: raise ValueError( - 'Cell temperature must be available to use tcell_filter') - f = filtering.tcell_filter( - cell_temp, **self.filter_params['tcell_filter']) - filter_components['tcell_filter'] = f - if 'clip_filter' in self.filter_params: + "Cell temperature must be available to use tcell_filter" + ) + f = filtering.tcell_filter(cell_temp, **self.filter_params["tcell_filter"]) + filter_components["tcell_filter"] = f + if "clip_filter" in self.filter_params: if self.pv_power is None: - raise ValueError('PV power (not energy) is required for the clipping filter. ' - 'Either omit the clipping filter, provide PV power at ' - 'instantiation, or explicitly assign TrendAnalysis.pv_power.') + raise ValueError( + "PV power (not energy) is required for the clipping filter. " + "Either omit the clipping filter, provide PV power at " + "instantiation, or explicitly assign TrendAnalysis.pv_power." 
+ ) f = filtering.clip_filter( - self.pv_power, **self.filter_params['clip_filter']) - filter_components['clip_filter'] = f - if 'hour_angle_filter' in self.filter_params: - if not hasattr(self, 'pvlib_location'): + self.pv_power, **self.filter_params["clip_filter"] + ) + filter_components["clip_filter"] = f + if "hour_angle_filter" in self.filter_params: + if not hasattr(self, "pvlib_location"): raise ValueError( - 'The pvlib location must be provided using set_clearsky() ' - 'or by directly setting TrendAnalysis.pvlib_location ' - 'in order to use the hour_angle_filter') + "The pvlib location must be provided using set_clearsky() " + "or by directly setting TrendAnalysis.pvlib_location " + "in order to use the hour_angle_filter" + ) loc = self.pvlib_location f = filtering.hour_angle_filter( - energy_normalized, loc.latitude, loc.longitude, - **self.filter_params['hour_angle_filter']) - filter_components['hour_angle_filter'] = f - - if case == 'clearsky': - filter_components['pvlib_clearsky_filter'] = _call_clearsky_filter('pvlib_clearsky_filter') - if 'sensor_pvlib_clearsky_filter' in self.filter_params: - filter_components['sensor_pvlib_clearsky_filter'] = _call_clearsky_filter('sensor_pvlib_clearsky_filter') + energy_normalized, + loc.latitude, + loc.longitude, + **self.filter_params["hour_angle_filter"], + ) + filter_components["hour_angle_filter"] = f + + if case == "clearsky": + filter_components["pvlib_clearsky_filter"] = _call_clearsky_filter( + "pvlib_clearsky_filter" + ) + if "sensor_pvlib_clearsky_filter" in self.filter_params: + filter_components["sensor_pvlib_clearsky_filter"] = _call_clearsky_filter( + "sensor_pvlib_clearsky_filter" + ) # note: the previous implementation using the & operator treated NaN # filter values as False, so we do the same here for consistency: filter_components = pd.DataFrame(filter_components).fillna(False) # apply special checks to ad_hoc_filter, as it is likely more prone to user error - if self.filter_params.get('ad_hoc_filter', None) is not None: - ad_hoc_filter = self.filter_params['ad_hoc_filter'] + if self.filter_params.get("ad_hoc_filter", None) is not None: + ad_hoc_filter = self.filter_params["ad_hoc_filter"] if ad_hoc_filter.isnull().any(): warnings.warn( - 'ad_hoc_filter contains NaN values; setting to False (excluding)') + "ad_hoc_filter contains NaN values; setting to False (excluding)" + ) ad_hoc_filter = ad_hoc_filter.fillna(False) if not filter_components.index.equals(ad_hoc_filter.index): - warnings.warn('ad_hoc_filter index does not match index of other filters; missing ' - 'values will be set to True (kept). Align the index with the index ' - 'of the filter_components attribute to prevent this warning') - ad_hoc_filter = ad_hoc_filter.reindex( - filter_components.index).fillna(True) + warnings.warn( + "ad_hoc_filter index does not match index of other filters; missing " + "values will be set to True (kept). 
Align the index with the index " + "of the filter_components attribute to prevent this warning" + ) + ad_hoc_filter = ad_hoc_filter.reindex(filter_components.index).fillna( + True + ) - filter_components['ad_hoc_filter'] = ad_hoc_filter + filter_components["ad_hoc_filter"] = ad_hoc_filter bool_filter = filter_components.all(axis=1) - filter_components = filter_components.drop(columns=['default']) - if case == 'sensor': + filter_components = filter_components.drop(columns=["default"]) + if case == "sensor": self.sensor_filter = bool_filter self.sensor_filter_components = filter_components - elif case == 'clearsky': + elif case == "clearsky": self.clearsky_filter = bool_filter self.clearsky_filter_components = filter_components @@ -534,87 +594,99 @@ def _aggregated_filter(self, aggregated, case): ------- None """ - filter_components_aggregated = {'default': - pd.Series(True, index=aggregated.index)} - - if case == 'sensor': + filter_components_aggregated = { + "default": pd.Series(True, index=aggregated.index) + } + + if case == "sensor": insol = self.sensor_aggregated_insolation - if case == 'clearsky': + if case == "clearsky": insol = self.clearsky_aggregated_insolation - + # Add daily aggregate filters as they come online here. - if 'two_way_window_filter' in self.filter_params_aggregated: + if "two_way_window_filter" in self.filter_params_aggregated: f = filtering.two_way_window_filter( - aggregated, **self.filter_params_aggregated['two_way_window_filter']) - filter_components_aggregated['two_way_window_filter'] = f - - if 'insolation_filter' in self.filter_params_aggregated: - f = filtering.insolation_filter( - insol, **self.filter_params_aggregated['insolation_filter']) - filter_components_aggregated['insolation_filter'] = f + aggregated, **self.filter_params_aggregated["two_way_window_filter"] + ) + filter_components_aggregated["two_way_window_filter"] = f - if 'hampel_filter' in self.filter_params_aggregated: - hampelmask = filtering.hampel_filter(aggregated, - **self.filter_params_aggregated['hampel_filter']) - filter_components_aggregated['hampel_filter'] = hampelmask - - if 'directional_tukey_filter' in self.filter_params_aggregated: - f = filtering.directional_tukey_filter(aggregated, - **self.filter_params_aggregated['directional_tukey_filter']) - filter_components_aggregated['directional_tukey_filter'] = f + if "insolation_filter" in self.filter_params_aggregated: + f = filtering.insolation_filter( + insol, **self.filter_params_aggregated["insolation_filter"] + ) + filter_components_aggregated["insolation_filter"] = f + + if "hampel_filter" in self.filter_params_aggregated: + hampelmask = filtering.hampel_filter( + aggregated, **self.filter_params_aggregated["hampel_filter"] + ) + filter_components_aggregated["hampel_filter"] = hampelmask + + if "directional_tukey_filter" in self.filter_params_aggregated: + f = filtering.directional_tukey_filter( + aggregated, **self.filter_params_aggregated["directional_tukey_filter"] + ) + filter_components_aggregated["directional_tukey_filter"] = f # Convert the dictionary into a dataframe (after running filters) filter_components_aggregated = pd.DataFrame( - filter_components_aggregated).fillna(False) + filter_components_aggregated + ).fillna(False) # Run the ad-hoc filter from filter_params_aggregated, if available - if self.filter_params_aggregated.get('ad_hoc_filter', None) is not None: - ad_hoc_filter_aggregated = self.filter_params_aggregated['ad_hoc_filter'] + if self.filter_params_aggregated.get("ad_hoc_filter", None) is not None: + 
ad_hoc_filter_aggregated = self.filter_params_aggregated["ad_hoc_filter"] if ad_hoc_filter_aggregated.isnull().any(): warnings.warn( - 'aggregated ad_hoc_filter contains NaN values; setting to False (excluding)') + "aggregated ad_hoc_filter contains NaN values; setting to False (excluding)" + ) ad_hoc_filter_aggregated = ad_hoc_filter_aggregated.fillna(False) - if not filter_components_aggregated.index.equals(ad_hoc_filter_aggregated.index): - warnings.warn('Aggregated ad_hoc_filter index does not match index of other ' - 'filters; missing values will be set to True (kept). ' - 'Align the index with the index of the ' - 'filter_components_aggregated attribute to prevent this warning') + if not filter_components_aggregated.index.equals( + ad_hoc_filter_aggregated.index + ): + warnings.warn( + "Aggregated ad_hoc_filter index does not match index of other " + "filters; missing values will be set to True (kept). " + "Align the index with the index of the " + "filter_components_aggregated attribute to prevent this warning" + ) ad_hoc_filter_aggregated = ad_hoc_filter_aggregated.reindex( - filter_components_aggregated.index).fillna(True) + filter_components_aggregated.index + ).fillna(True) - filter_components_aggregated['ad_hoc_filter'] = ad_hoc_filter_aggregated + filter_components_aggregated["ad_hoc_filter"] = ad_hoc_filter_aggregated bool_filter_aggregated = filter_components_aggregated.all(axis=1) filter_components_aggregated = filter_components_aggregated.drop( - columns=['default']) - if case == 'sensor': + columns=["default"] + ) + if case == "sensor": self.sensor_filter_aggregated = bool_filter_aggregated self.sensor_filter_components_aggregated = filter_components_aggregated - elif case == 'clearsky': + elif case == "clearsky": self.clearsky_filter_aggregated = bool_filter_aggregated self.clearsky_filter_components_aggregated = filter_components_aggregated def _filter_check(self, post_filter): - ''' + """ post-filter check for requisite 730 days of data Parameters ---------- post_filter : pandas.Series Time series filtered by boolean output from self.filter - ''' + """ if post_filter.empty: - post_filter_length = pd.Timedelta('0d') + post_filter_length = pd.Timedelta("0d") else: post_filter_length = post_filter.index[-1] - post_filter.index[0] - if post_filter_length < pd.Timedelta('730d'): - raise ValueError( - "Less than two years of data left after filtering") + if post_filter_length < pd.Timedelta("730d"): + raise ValueError("Less than two years of data left after filtering") def _aggregate(self, energy_normalized, insolation): - ''' + """ Return insolation-weighted normalized PV energy and the associated aggregated insolation Parameters @@ -630,16 +702,18 @@ def _aggregate(self, energy_normalized, insolation): Insolation-weighted aggregated normalized PV energy pandas.Series Aggregated insolation - ''' + """ aggregated = aggregation.aggregation_insol( - energy_normalized, insolation, self.aggregation_freq) + energy_normalized, insolation, self.aggregation_freq + ) aggregated_insolation = insolation.resample( - self.aggregation_freq, origin='start_day').sum() + self.aggregation_freq, origin="start_day" + ).sum() return aggregated, aggregated_insolation def _yoy_degradation(self, energy_normalized, **kwargs): - ''' + """ Perform year-on-year degradation analysis on insolation-weighted aggregated energy yield. 
@@ -659,21 +733,22 @@ def _yoy_degradation(self, energy_normalized, **kwargs): rate confidence interval as a list 'calc_info': Dict of detailed results (see degradation.degradation_year_on_year() docs) - ''' + """ self._filter_check(energy_normalized) yoy_rd, yoy_ci, yoy_info = degradation.degradation_year_on_year( - energy_normalized, **kwargs) + energy_normalized, **kwargs + ) yoy_results = { - 'p50_rd': yoy_rd, - 'rd_confidence_interval': yoy_ci, - 'calc_info': yoy_info + "p50_rd": yoy_rd, + "rd_confidence_interval": yoy_ci, + "calc_info": yoy_info, } return yoy_results def _srr_soiling(self, energy_normalized_daily, insolation_daily, **kwargs): - ''' + """ Perform stochastic rate and recovery soiling analysis. Parameters @@ -694,120 +769,157 @@ def _srr_soiling(self, energy_normalized_daily, insolation_daily, **kwargs): insolation-weighted soiling ratio confidence interval 'calc_info' : Dict of detailed results (see soiling.soiling_srr() docs) - ''' + """ from rdtools import soiling daily_freq = pd.tseries.offsets.Day() - if (energy_normalized_daily.index.freq != daily_freq or - insolation_daily.index.freq != daily_freq): - raise ValueError( - 'Soiling SRR analysis requires daily aggregation.') + if ( + energy_normalized_daily.index.freq != daily_freq + or insolation_daily.index.freq != daily_freq + ): + raise ValueError("Soiling SRR analysis requires daily aggregation.") sr, sr_ci, soiling_info = soiling.soiling_srr( - energy_normalized_daily, insolation_daily, **kwargs) + energy_normalized_daily, insolation_daily, **kwargs + ) srr_results = { - 'p50_sratio': sr, - 'sratio_confidence_interval': sr_ci, - 'calc_info': soiling_info + "p50_sratio": sr, + "sratio_confidence_interval": sr_ci, + "calc_info": soiling_info, } return srr_results def _sensor_preprocess(self): - ''' + """ Perform sensor-based normalization, filtering, and aggregation. If optional parameter self.power_expected is passed in, normalize_with_expected_power will be used instead of pvwatts. - ''' + """ if self.poa_global is None: raise ValueError( - 'poa_global must be available to perform _sensor_preprocess') - - if 'sensor_pvlib_clearsky_filter' in self.filter_params: + "poa_global must be available to perform _sensor_preprocess" + ) + + if "sensor_pvlib_clearsky_filter" in self.filter_params: try: if self.poa_global_clearsky is None: - self._calc_clearsky_poa(model='isotropic') + self._calc_clearsky_poa(model="isotropic") except AttributeError: - raise AttributeError("No poa_global_clearsky. 'set_clearsky' must be run " + - "to allow filter_params['sensor_pvlib_clearsky_filter']. ") + raise AttributeError( + "No poa_global_clearsky. 'set_clearsky' must be run " + + "to allow filter_params['sensor_pvlib_clearsky_filter']. " + ) if self.power_expected is None: # Thermal details required if power_expected is not manually set. 
if self.temperature_cell is None and self.temperature_ambient is None: - raise ValueError('either cell or ambient temperature must be available ' - 'to perform _sensor_preprocess') + raise ValueError( + "either cell or ambient temperature must be available " + "to perform _sensor_preprocess" + ) if self.temperature_cell is None: self.temperature_cell = self._calc_cell_temperature( - self.poa_global, self.temperature_ambient, self.windspeed) + self.poa_global, self.temperature_ambient, self.windspeed + ) energy_normalized, insolation = self._pvwatts_norm( - self.poa_global, self.temperature_cell) + self.poa_global, self.temperature_cell + ) else: # self.power_expected passed in by user energy_normalized, insolation = normalization.normalize_with_expected_power( - self.pv_energy, self.power_expected, self.poa_global, pv_input='energy') - self._filter(energy_normalized, 'sensor') + self.pv_energy, self.power_expected, self.poa_global, pv_input="energy" + ) + self._filter(energy_normalized, "sensor") aggregated, aggregated_insolation = self._aggregate( - energy_normalized[self.sensor_filter], insolation[self.sensor_filter]) - + energy_normalized[self.sensor_filter], insolation[self.sensor_filter] + ) + # Run daily filters on aggregated data self.sensor_aggregated_insolation = aggregated_insolation - self._aggregated_filter(aggregated, 'sensor') - + self._aggregated_filter(aggregated, "sensor") + # Apply filter to aggregated data and store self.sensor_aggregated_performance = aggregated[self.sensor_filter_aggregated] - self.sensor_aggregated_insolation = aggregated_insolation[self.sensor_filter_aggregated] - + self.sensor_aggregated_insolation = aggregated_insolation[ + self.sensor_filter_aggregated + ] + # Reindex the data after the fact, so it's on the aggregated interval - self.sensor_aggregated_performance = self.sensor_aggregated_performance.resample( - self.aggregation_freq, origin='start_day').asfreq() + self.sensor_aggregated_performance = ( + self.sensor_aggregated_performance.resample( + self.aggregation_freq, origin="start_day" + ).asfreq() + ) self.sensor_aggregated_insolation = self.sensor_aggregated_insolation.resample( - self.aggregation_freq, origin='start_day').asfreq() + self.aggregation_freq, origin="start_day" + ).asfreq() def _clearsky_preprocess(self): - ''' + """ Perform clear-sky-based normalization, filtering, and aggregation. If optional parameter self.power_expected is passed in, normalize_with_expected_power will be used instead of pvwatts. - ''' + """ try: if self.poa_global_clearsky is None: - self._calc_clearsky_poa(model='isotropic') + self._calc_clearsky_poa(model="isotropic") except AttributeError: - raise AttributeError("No poa_global_clearsky. 'set_clearsky' must be run " + - "prior to 'clearsky_analysis'") + raise AttributeError( + "No poa_global_clearsky. 
'set_clearsky' must be run " + + "prior to 'clearsky_analysis'" + ) if self.temperature_cell_clearsky is None: if self.temperature_ambient_clearsky is None: self._calc_clearsky_tamb() self.temperature_cell_clearsky = self._calc_cell_temperature( - self.poa_global_clearsky, self.temperature_ambient_clearsky, 0) + self.poa_global_clearsky, self.temperature_ambient_clearsky, 0 + ) # Note example notebook uses windspeed=0 in the clearskybranch if self.power_expected is None: cs_normalized, cs_insolation = self._pvwatts_norm( - self.poa_global_clearsky, self.temperature_cell_clearsky) + self.poa_global_clearsky, self.temperature_cell_clearsky + ) else: # self.power_expected passed in by user cs_normalized, cs_insolation = normalization.normalize_with_expected_power( - self.pv_energy, self.power_expected, self.poa_global_clearsky, pv_input='energy') - self._filter(cs_normalized, 'clearsky') + self.pv_energy, + self.power_expected, + self.poa_global_clearsky, + pv_input="energy", + ) + self._filter(cs_normalized, "clearsky") cs_aggregated, cs_aggregated_insolation = self._aggregate( - cs_normalized[self.clearsky_filter], cs_insolation[self.clearsky_filter]) - + cs_normalized[self.clearsky_filter], cs_insolation[self.clearsky_filter] + ) + # Run daily filters on aggregated data self.clearsky_aggregated_insolation = cs_aggregated_insolation - self._aggregated_filter(cs_aggregated, 'clearsky') - + self._aggregated_filter(cs_aggregated, "clearsky") + # Apply daily filter to aggregated data and store - self.clearsky_aggregated_performance = cs_aggregated[self.clearsky_filter_aggregated] - self.clearsky_aggregated_insolation = \ - cs_aggregated_insolation[self.clearsky_filter_aggregated] - - # Reindex the data after the fact, so it's on the aggregated interval - self.clearsky_aggregated_performance = self.clearsky_aggregated_performance.resample( - self.aggregation_freq, origin='start_day').asfreq() - self.clearsky_aggregated_insolation = self.clearsky_aggregated_insolation.resample( - self.aggregation_freq, origin='start_day').asfreq() + self.clearsky_aggregated_performance = cs_aggregated[ + self.clearsky_filter_aggregated + ] + self.clearsky_aggregated_insolation = cs_aggregated_insolation[ + self.clearsky_filter_aggregated + ] - def sensor_analysis(self, analyses=['yoy_degradation'], yoy_kwargs={}, srr_kwargs={}): - ''' + # Reindex the data after the fact, so it's on the aggregated interval + self.clearsky_aggregated_performance = ( + self.clearsky_aggregated_performance.resample( + self.aggregation_freq, origin="start_day" + ).asfreq() + ) + self.clearsky_aggregated_insolation = ( + self.clearsky_aggregated_insolation.resample( + self.aggregation_freq, origin="start_day" + ).asfreq() + ) + + def sensor_analysis( + self, analyses=["yoy_degradation"], yoy_kwargs={}, srr_kwargs={} + ): + """ Perform entire sensor-based analysis workflow. 
Results are stored in self.results['sensor'] @@ -824,26 +936,31 @@ def sensor_analysis(self, analyses=['yoy_degradation'], yoy_kwargs={}, srr_kwarg Returns ------- None - ''' + """ self._sensor_preprocess() sensor_results = {} - if 'yoy_degradation' in analyses: + if "yoy_degradation" in analyses: yoy_results = self._yoy_degradation( - self.sensor_aggregated_performance, **yoy_kwargs) - sensor_results['yoy_degradation'] = yoy_results - - if 'srr_soiling' in analyses: - srr_results = self._srr_soiling(self.sensor_aggregated_performance, - self.sensor_aggregated_insolation, - **srr_kwargs) - sensor_results['srr_soiling'] = srr_results - - self.results['sensor'] = sensor_results - - def clearsky_analysis(self, analyses=['yoy_degradation'], yoy_kwargs={}, srr_kwargs={}): - ''' + self.sensor_aggregated_performance, **yoy_kwargs + ) + sensor_results["yoy_degradation"] = yoy_results + + if "srr_soiling" in analyses: + srr_results = self._srr_soiling( + self.sensor_aggregated_performance, + self.sensor_aggregated_insolation, + **srr_kwargs, + ) + sensor_results["srr_soiling"] = srr_results + + self.results["sensor"] = sensor_results + + def clearsky_analysis( + self, analyses=["yoy_degradation"], yoy_kwargs={}, srr_kwargs={} + ): + """ Perform entire clear-sky-based analysis workflow. Results are stored in self.results['clearsky'] @@ -860,26 +977,29 @@ def clearsky_analysis(self, analyses=['yoy_degradation'], yoy_kwargs={}, srr_kwa Returns ------- None - ''' + """ self._clearsky_preprocess() clearsky_results = {} - if 'yoy_degradation' in analyses: + if "yoy_degradation" in analyses: yoy_results = self._yoy_degradation( - self.clearsky_aggregated_performance, **yoy_kwargs) - clearsky_results['yoy_degradation'] = yoy_results + self.clearsky_aggregated_performance, **yoy_kwargs + ) + clearsky_results["yoy_degradation"] = yoy_results - if 'srr_soiling' in analyses: - srr_results = self._srr_soiling(self.clearsky_aggregated_performance, - self.clearsky_aggregated_insolation, - **srr_kwargs) - clearsky_results['srr_soiling'] = srr_results + if "srr_soiling" in analyses: + srr_results = self._srr_soiling( + self.clearsky_aggregated_performance, + self.clearsky_aggregated_insolation, + **srr_kwargs, + ) + clearsky_results["srr_soiling"] = srr_results - self.results['clearsky'] = clearsky_results + self.results["clearsky"] = clearsky_results def plot_degradation_summary(self, case, **kwargs): - ''' + """ Return a figure of a scatter plot and a histogram summarizing degradation rate analysis. 
Parameters @@ -892,25 +1012,28 @@ def plot_degradation_summary(self, case, **kwargs): Returns ------- matplotlib.figure.Figure - ''' + """ - if case == 'sensor': - results_dict = self.results['sensor']['yoy_degradation'] + if case == "sensor": + results_dict = self.results["sensor"]["yoy_degradation"] aggregated = self.sensor_aggregated_performance - elif case == 'clearsky': - results_dict = self.results['clearsky']['yoy_degradation'] + elif case == "clearsky": + results_dict = self.results["clearsky"]["yoy_degradation"] aggregated = self.clearsky_aggregated_performance else: raise ValueError("case must be either 'sensor' or 'clearsky'") fig = plotting.degradation_summary_plots( - results_dict['p50_rd'], - results_dict['rd_confidence_interval'], - results_dict['calc_info'], aggregated, **kwargs) + results_dict["p50_rd"], + results_dict["rd_confidence_interval"], + results_dict["calc_info"], + aggregated, + **kwargs, + ) return fig def plot_soiling_monte_carlo(self, case, **kwargs): - ''' + """ Return a figure visualizing the Monte Carlo of soiling profiles used in stochastic rate and recovery soiling analysis. @@ -924,24 +1047,25 @@ def plot_soiling_monte_carlo(self, case, **kwargs): Returns ------- matplotlib.figure.Figure - ''' + """ - if case == 'sensor': - results_dict = self.results['sensor']['srr_soiling'] + if case == "sensor": + results_dict = self.results["sensor"]["srr_soiling"] aggregated = self.sensor_aggregated_performance - elif case == 'clearsky': - results_dict = self.results['clearsky']['srr_soiling'] + elif case == "clearsky": + results_dict = self.results["clearsky"]["srr_soiling"] aggregated = self.clearsky_aggregated_performance else: raise ValueError("case must be either 'sensor' or 'clearsky'") fig = plotting.soiling_monte_carlo_plot( - results_dict['calc_info'], aggregated, **kwargs) + results_dict["calc_info"], aggregated, **kwargs + ) return fig def plot_soiling_interval(self, case, **kwargs): - ''' + """ Return a figure visualizing the valid soiling intervals used in stochastic rate and recovery soiling analysis. 
@@ -955,24 +1079,25 @@ def plot_soiling_interval(self, case, **kwargs): Returns ------- matplotlib.figure.Figure - ''' + """ - if case == 'sensor': - results_dict = self.results['sensor']['srr_soiling'] + if case == "sensor": + results_dict = self.results["sensor"]["srr_soiling"] aggregated = self.sensor_aggregated_performance - elif case == 'clearsky': - results_dict = self.results['clearsky']['srr_soiling'] + elif case == "clearsky": + results_dict = self.results["clearsky"]["srr_soiling"] aggregated = self.clearsky_aggregated_performance else: raise ValueError("case must be either 'sensor' or 'clearsky'") fig = plotting.soiling_interval_plot( - results_dict['calc_info'], aggregated, **kwargs) + results_dict["calc_info"], aggregated, **kwargs + ) return fig def plot_soiling_rate_histogram(self, case, **kwargs): - ''' + """ Return a histogram of soiling rates found in the stochastic rate and recovery soiling analysis @@ -986,22 +1111,21 @@ def plot_soiling_rate_histogram(self, case, **kwargs): Returns ------- matplotlib.figure.Figure - ''' + """ - if case == 'sensor': - results_dict = self.results['sensor']['srr_soiling'] - elif case == 'clearsky': - results_dict = self.results['clearsky']['srr_soiling'] + if case == "sensor": + results_dict = self.results["sensor"]["srr_soiling"] + elif case == "clearsky": + results_dict = self.results["clearsky"]["srr_soiling"] else: raise ValueError("case must be either 'sensor' or 'clearsky'") - fig = plotting.soiling_rate_histogram( - results_dict['calc_info'], **kwargs) + fig = plotting.soiling_rate_histogram(results_dict["calc_info"], **kwargs) return fig def plot_pv_vs_irradiance(self, case, alpha=0.01, **kwargs): - ''' + """ Plot PV energy vs irradiance, useful in diagnosing things like timezone problems or transposition errors. 
@@ -1018,28 +1142,31 @@ def plot_pv_vs_irradiance(self, case, alpha=0.01, **kwargs): Returns ------- matplotlib.figure.Figure - ''' + """ - if case == 'sensor': + if case == "sensor": poa = self.poa_global - elif case == 'clearsky': + elif case == "clearsky": poa = self.poa_global_clearsky else: raise ValueError("case must be either 'sensor' or 'clearsky'") - to_plot = pd.merge(pd.DataFrame(poa), pd.DataFrame( - self.pv_energy), left_index=True, right_index=True) + to_plot = pd.merge( + pd.DataFrame(poa), + pd.DataFrame(self.pv_energy), + left_index=True, + right_index=True, + ) fig, ax = plt.subplots() - ax.plot(to_plot.iloc[:, 0], to_plot.iloc[:, 1], - 'o', alpha=alpha, **kwargs) + ax.plot(to_plot.iloc[:, 0], to_plot.iloc[:, 1], "o", alpha=alpha, **kwargs) ax.set_xlim(0, 1500) - ax.set_xlabel('Irradiance (W/m$^2$)') - ax.set_ylabel('PV Energy (Wh/timestep)') + ax.set_xlabel("Irradiance (W/m$^2$)") + ax.set_ylabel("PV Energy (Wh/timestep)") return fig def plot_degradation_timeseries(self, case, rolling_days=365, **kwargs): - ''' + """ Plot resampled time series of degradation trend with time Parameters @@ -1055,12 +1182,12 @@ def plot_degradation_timeseries(self, case, rolling_days=365, **kwargs): Returns ------- matplotlib.figure.Figure - ''' + """ - if case == 'sensor': - yoy_info = self.results['sensor']['yoy_degradation']['calc_info'] - elif case == 'clearsky': - yoy_info = self.results['clearsky']['yoy_degradation']['calc_info'] + if case == "sensor": + yoy_info = self.results["sensor"]["yoy_degradation"]["calc_info"] + elif case == "clearsky": + yoy_info = self.results["clearsky"]["yoy_degradation"]["calc_info"] else: raise ValueError("case must be either 'sensor' or 'clearsky'") diff --git a/rdtools/filtering.py b/rdtools/filtering.py index 40ef08a30..08b9139db 100644 --- a/rdtools/filtering.py +++ b/rdtools/filtering.py @@ -1,4 +1,4 @@ -'''Functions for filtering and subsetting PV system data.''' +"""Functions for filtering and subsetting PV system data.""" import numpy as np import pandas as pd @@ -12,8 +12,9 @@ # Load in the XGBoost clipping model using joblib. xgboost_clipping_model = None -model_path = os.path.join(os.path.dirname(__file__), - "models", "xgboost_clipping_model.json") +model_path = os.path.join( + os.path.dirname(__file__), "models", "xgboost_clipping_model.json" +) def _load_xgboost_clipping_model(): @@ -24,9 +25,10 @@ def _load_xgboost_clipping_model(): return xgboost_clipping_model -def normalized_filter(energy_normalized, energy_normalized_low=0.01, - energy_normalized_high=None): - ''' +def normalized_filter( + energy_normalized, energy_normalized_low=0.01, energy_normalized_high=None +): + """ Select normalized yield between ``low_cutoff`` and ``high_cutoff`` Parameters @@ -43,19 +45,20 @@ def normalized_filter(energy_normalized, energy_normalized_low=0.01, pandas.Series Boolean Series of whether the given measurement is within acceptable bounds. - ''' + """ if energy_normalized_low is None: energy_normalized_low = -np.inf if energy_normalized_high is None: energy_normalized_high = np.inf - return ((energy_normalized > energy_normalized_low) & - (energy_normalized < energy_normalized_high)) + return (energy_normalized > energy_normalized_low) & ( + energy_normalized < energy_normalized_high + ) def poa_filter(poa_global, poa_global_low=200, poa_global_high=1200): - ''' + """ Filter POA irradiance readings outside acceptable measurement bounds. 
Parameters @@ -72,13 +75,12 @@ def poa_filter(poa_global, poa_global_low=200, poa_global_high=1200): pandas.Series Boolean Series of whether the given measurement is within acceptable bounds. - ''' + """ return (poa_global > poa_global_low) & (poa_global < poa_global_high) -def tcell_filter(temperature_cell, temperature_cell_low=-50, - temperature_cell_high=110): - ''' +def tcell_filter(temperature_cell, temperature_cell_low=-50, temperature_cell_high=110): + """ Filter temperature readings outside acceptable measurement bounds. Parameters @@ -95,13 +97,14 @@ def tcell_filter(temperature_cell, temperature_cell_low=-50, pandas.Series Boolean Series of whether the given measurement is within acceptable bounds. - ''' - return ((temperature_cell > temperature_cell_low) & - (temperature_cell < temperature_cell_high)) + """ + return (temperature_cell > temperature_cell_low) & ( + temperature_cell < temperature_cell_high + ) def csi_filter(poa_global_measured, poa_global_clearsky, threshold=0.15): - ''' + """ Filtering based on clear-sky index (csi) Parameters @@ -118,18 +121,26 @@ def csi_filter(poa_global_measured, poa_global_clearsky, threshold=0.15): pandas.Series Boolean Series of whether the clear-sky index is within the threshold around 1. - ''' + """ csi = poa_global_measured / poa_global_clearsky return (csi >= 1.0 - threshold) & (csi <= 1.0 + threshold) -def pvlib_clearsky_filter(poa_global_measured, poa_global_clearsky, - window_length=90, mean_diff=75, max_diff=75, - lower_line_length=-45, upper_line_length=80, - var_diff=0.032, slope_dev=75, - lookup_parameters=False, **kwargs): - ''' +def pvlib_clearsky_filter( + poa_global_measured, + poa_global_clearsky, + window_length=90, + mean_diff=75, + max_diff=75, + lower_line_length=-45, + upper_line_length=80, + var_diff=0.032, + slope_dev=75, + lookup_parameters=False, + **kwargs, +): + """ Filtering based on the Reno and Hansen method for clearsky filtering as implimented in pvlib. Requires a regular time series with uniform time steps. @@ -185,43 +196,62 @@ def pvlib_clearsky_filter(poa_global_measured, poa_global_clearsky, [2] D.C. Jordan and C.W. Hansen, Renewable Energy 209 pp. 
393-400 (2023) - ''' + """ if lookup_parameters and poa_global_measured.index.freq: - frequencies = np.array([1,5,15,30]) - windows = np.array([50,60,90,120]) - max_diffs = np.array([60,65,75,90]) + frequencies = np.array([1, 5, 15, 30]) + windows = np.array([50, 60, 90, 120]) + max_diffs = np.array([60, 65, 75, 90]) var_diffs = np.array([0.005, 0.01, 0.032, 0.07]) - slope_devs = np.array([50,60,75,96]) + slope_devs = np.array([50, 60, 75, 96]) - windows_interp = interp1d(frequencies, windows, + windows_interp = interp1d( + frequencies, + windows, fill_value=(windows[0], windows[-1]), - bounds_error=False) - max_diffs_interp = interp1d(frequencies, max_diffs, + bounds_error=False, + ) + max_diffs_interp = interp1d( + frequencies, + max_diffs, fill_value=(max_diffs[0], max_diffs[-1]), - bounds_error=False) - var_diffs_interp = interp1d(frequencies, var_diffs, + bounds_error=False, + ) + var_diffs_interp = interp1d( + frequencies, + var_diffs, fill_value=(var_diffs[0], var_diffs[-1]), - bounds_error=False) - slope_devs_interp = interp1d(frequencies, slope_devs, + bounds_error=False, + ) + slope_devs_interp = interp1d( + frequencies, + slope_devs, fill_value=(slope_devs[0], slope_devs[-1]), - bounds_error=False) + bounds_error=False, + ) - freq_minutes = poa_global_measured.index.freq.nanos/10**9/60 + freq_minutes = poa_global_measured.index.freq.nanos / 10**9 / 60 window_length = windows_interp(freq_minutes) max_diff = max_diffs_interp(freq_minutes) var_diff = var_diffs_interp(freq_minutes) slope_dev = slope_devs_interp(freq_minutes) - - df = pd.concat([poa_global_measured, poa_global_clearsky], axis=1, join='outer') - df.columns=['measured', 'clearsky'] - - kwargs['return_components'] = False - mask = pvlib.clearsky.detect_clearsky(df['measured'], df['clearsky'], - window_length=window_length, mean_diff=mean_diff, max_diff=max_diff, - lower_line_length=lower_line_length, upper_line_length=upper_line_length, - var_diff=var_diff, slope_dev=slope_dev, **kwargs) + df = pd.concat([poa_global_measured, poa_global_clearsky], axis=1, join="outer") + df.columns = ["measured", "clearsky"] + + kwargs["return_components"] = False + mask = pvlib.clearsky.detect_clearsky( + df["measured"], + df["clearsky"], + window_length=window_length, + mean_diff=mean_diff, + max_diff=max_diff, + lower_line_length=lower_line_length, + upper_line_length=upper_line_length, + var_diff=var_diff, + slope_dev=slope_dev, + **kwargs, + ) return mask @@ -255,30 +285,30 @@ def clip_filter(power_ac, model="quantile", **kwargs): """ if isinstance(model, Number): quantile = model - warnings.warn("Function clip_filter is now a wrapper for different " - "clipping filters. To reproduce prior behavior, " - "parameters have been interpreted as model= " - f"'quantile_clip_filter', quantile={quantile}. " - "This syntax will be removed in a future version.", - rdtools._deprecation.rdtoolsDeprecationWarning) - kwargs['quantile'] = quantile - model = 'quantile' - - if (model == 'quantile'): + warnings.warn( + "Function clip_filter is now a wrapper for different " + "clipping filters. To reproduce prior behavior, " + "parameters have been interpreted as model= " + f"'quantile_clip_filter', quantile={quantile}. 
" + "This syntax will be removed in a future version.", + rdtools._deprecation.rdtoolsDeprecationWarning, + ) + kwargs["quantile"] = quantile + model = "quantile" + + if model == "quantile": clip_mask = quantile_clip_filter(power_ac, **kwargs) - elif model == 'xgboost': + elif model == "xgboost": clip_mask = xgboost_clip_filter(power_ac, **kwargs) - elif model == 'logic': + elif model == "logic": clip_mask = logic_clip_filter(power_ac, **kwargs) else: - raise ValueError( - "Variable model must be 'quantile', " - "'xgboost', or 'logic'.") + raise ValueError("Variable model must be 'quantile', " "'xgboost', or 'logic'.") return clip_mask def quantile_clip_filter(power_ac, quantile=0.98): - ''' + """ Filter data points likely to be affected by clipping with power or energy greater than or equal to 99% of the `quant` quantile. @@ -295,9 +325,9 @@ def quantile_clip_filter(power_ac, quantile=0.98): pandas.Series Boolean Series of whether the given measurement is below 99% of the quantile filter. - ''' + """ v = power_ac.quantile(quantile) - return (power_ac < v * 0.99) + return power_ac < v * 0.99 def _format_clipping_time_series(power_ac, mounting_type): @@ -326,38 +356,42 @@ def _format_clipping_time_series(power_ac, mounting_type): # Check that it's a Pandas series with a datetime index. # If not, raise an error. if not isinstance(power_ac.index, pd.DatetimeIndex): - raise TypeError('Must be a Pandas series with a datetime index.') + raise TypeError("Must be a Pandas series with a datetime index.") # Check if the time series is tz-aware. If not, throw a # warning. has_timezone = pd.Series(power_ac.index).apply(lambda t: t.tzinfo is not None) # Throw a warning that we're expecting time zone-localized data, # if no time zone is specified. if not has_timezone.all(): - warnings.warn("Function expects timestamps in local time. " - "For best results pass a time-zone-localized " - "time series localized to the correct local time zone.") + warnings.warn( + "Function expects timestamps in local time. " + "For best results pass a time-zone-localized " + "time series localized to the correct local time zone." + ) # Check the other input variables to ensure that they are the # correct format if (mounting_type != "single_axis_tracking") & (mounting_type != "fixed"): raise ValueError( "Variable mounting_type must be string 'single_axis_tracking' or " - "'fixed'.") + "'fixed'." + ) # Check if the datetime index is out of order. If it is, throw an # error. if not all(power_ac.sort_index().index == power_ac.index): raise IndexError( "Time series index has not been sorted. Implement the " - "sort_index() method to the time series to rerun this function.") + "sort_index() method to the time series to rerun this function." + ) # Check that there is enough data in the dataframe. Must be greater than # 10 readings. if len(power_ac) <= 10: - raise Exception('<=10 readings in the time series, cannot run filter.') + raise Exception("<=10 readings in the time series, cannot run filter.") # Get the names of the series and the datetime index - column_name = 'value' + column_name = "value" power_ac = power_ac.rename(column_name) index_name = power_ac.index.name if index_name is None: - index_name = 'datetime' + index_name = "datetime" power_ac = power_ac.rename_axis(index_name) return power_ac, power_ac.index.name @@ -380,15 +414,17 @@ def _check_data_sampling_frequency(power_ac): """ # Get the sampling frequency counts--if the sampling frequency is not # consistently >=95% the same, then throw a warning. 
- sampling_frequency_df = pd.DataFrame(power_ac.index.to_series() - .diff().astype('timedelta64[s]') - .value_counts())/len(power_ac) + sampling_frequency_df = pd.DataFrame( + power_ac.index.to_series().diff().astype("timedelta64[s]").value_counts() + ) / len(power_ac) sampling_frequency_df.columns = ["count"] - if (sampling_frequency_df["count"] < .95).all(): - warnings.warn("Variable sampling frequency across time series. " - "Less than 95% of the time series is sampled at the " - "same interval. This function was not tested " - "on variable frequency data--use at your own risk!") + if (sampling_frequency_df["count"] < 0.95).all(): + warnings.warn( + "Variable sampling frequency across time series. " + "Less than 95% of the time series is sampled at the " + "same interval. This function was not tested " + "on variable frequency data--use at your own risk!" + ) return @@ -418,13 +454,11 @@ def _calculate_max_rolling_range(power_ac, roll_periods): min_roll = power_ac.iloc[::-1].rolling(roll_periods).min() min_roll = min_roll.reindex(power_ac.index) # Calculate the maximum rolling range within the foward-rolling window - rolling_range_max = (max_roll - min_roll)/((max_roll + min_roll)/2)*100 + rolling_range_max = (max_roll - min_roll) / ((max_roll + min_roll) / 2) * 100 return rolling_range_max -def _apply_overall_clipping_threshold(power_ac, - clipping_mask, - clipped_power_ac): +def _apply_overall_clipping_threshold(power_ac, clipping_mask, clipped_power_ac): """ Apply an overall clipping threshold to the data. This additional logic sets an overall threshold in the dataset @@ -452,22 +486,21 @@ def _apply_overall_clipping_threshold(power_ac, periods are labeled as True and non-clipping periods are labeled as False. Has a pandas datetime index. """ - upper_bound_pdiff = abs((power_ac.quantile(.99) - - clipped_power_ac.quantile(.99)) - / ((power_ac.quantile(.99) + - clipped_power_ac.quantile(.99))/2)) - percent_clipped = len(clipped_power_ac)/len(power_ac)*100 + upper_bound_pdiff = abs( + (power_ac.quantile(0.99) - clipped_power_ac.quantile(0.99)) + / ((power_ac.quantile(0.99) + clipped_power_ac.quantile(0.99)) / 2) + ) + percent_clipped = len(clipped_power_ac) / len(power_ac) * 100 if (upper_bound_pdiff < 0.005) & (percent_clipped > 4): - max_clip = (power_ac >= power_ac.quantile(0.99)) - clipping_mask = (clipping_mask | max_clip) + max_clip = power_ac >= power_ac.quantile(0.99) + clipping_mask = clipping_mask | max_clip return clipping_mask -def logic_clip_filter(power_ac, - mounting_type='fixed', - rolling_range_max_cutoff=0.2, - roll_periods=None): - ''' +def logic_clip_filter( + power_ac, mounting_type="fixed", rolling_range_max_cutoff=0.2, roll_periods=None +): + """ This filter is a logic-based filter that is used to filter out clipping periods in AC power or energy time series. It is based on the method presented in [1]. A boolean filter is returned @@ -514,134 +547,134 @@ def logic_clip_filter(power_ac, .. [1] Perry K., Muller, M., and Anderson K. "Performance comparison of clipping detection techniques in AC power time series", 2021 IEEE 48th Photovoltaic Specialists Conference (PVSC). DOI: 10.1109/PVSC43889.2021.9518733. - ''' + """ # Throw a warning that this is still an experimental filter. (Removed for 3.0.0) - #warnings.warn("The logic-based filter is an experimental clipping filter " + # warnings.warn("The logic-based filter is an experimental clipping filter " # "that is still under development. 
     #              "default behaviors may change in future releases (including "
     #              "MINOR and PATCH). Use at your own risk!")

     # Format the time series
-    power_ac, index_name = _format_clipping_time_series(power_ac,
-                                                        mounting_type)
+    power_ac, index_name = _format_clipping_time_series(power_ac, mounting_type)
     # Test if the data sampling frequency is variable, and flag it if the time
     # series sampling frequency is less than 95% consistent.
     _check_data_sampling_frequency(power_ac)
     # Get the sampling frequency of the time series
     time_series_sampling_frequency = (
-        power_ac.index.to_series().diff() / pd.Timedelta('60s')
+        power_ac.index.to_series().diff() / pd.Timedelta("60s")
     ).mode()[0]
     # Make copies of the original inputs for cases where the data is
     # changed for clipping evaluation
     original_time_series_sampling_frequency = time_series_sampling_frequency
     power_ac_copy = power_ac.copy()
     # Drop duplicate indices
-    power_ac = power_ac.reset_index().drop_duplicates(
-        subset=power_ac.index.name,
-        keep='first').set_index(power_ac.index.name)
-    freq_string = str(time_series_sampling_frequency) + 'T'
+    power_ac = (
+        power_ac.reset_index()
+        .drop_duplicates(subset=power_ac.index.name, keep="first")
+        .set_index(power_ac.index.name)
+    )
+    freq_string = str(time_series_sampling_frequency) + "T"
     # Set days with the majority of frozen data to null.
-    daily_std = power_ac.resample('D').std() / power_ac.resample('D').mean()
-    power_ac['daily_std'] = daily_std.reindex(index=power_ac.index,
-                                              method='ffill')
-    power_ac.loc[power_ac['daily_std'] < 0.1,
-                 'value'] = np.nan
-    power_ac.drop('daily_std',
-                  axis=1,
-                  inplace=True)
-    power_cleaned = power_ac['value'].copy()
-    power_cleaned = power_cleaned.reindex(power_ac_copy.index,
-                                          fill_value=np.nan)
+    daily_std = power_ac.resample("D").std() / power_ac.resample("D").mean()
+    power_ac["daily_std"] = daily_std.reindex(index=power_ac.index, method="ffill")
+    power_ac.loc[power_ac["daily_std"] < 0.1, "value"] = np.nan
+    power_ac.drop("daily_std", axis=1, inplace=True)
+    power_cleaned = power_ac["value"].copy()
+    power_cleaned = power_cleaned.reindex(power_ac_copy.index, fill_value=np.nan)
     # High frequency data (less than 10 minutes) has demonstrated
     # potential to have more noise than low frequency data.
     # Therefore, the data is resampled to a 15-minute median
     # before running the filter.
     if time_series_sampling_frequency >= 10:
-        power_ac = rdtools.normalization.interpolate(power_ac,
-                                                     freq_string)
+        power_ac = rdtools.normalization.interpolate(power_ac, freq_string)
     else:
-        power_ac = power_ac.resample('15T').median()
+        power_ac = power_ac.resample("15T").median()
         time_series_sampling_frequency = 15

     # If a value for roll_periods is not designated, the function uses
     # the current default logic to set the roll_periods value.
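     # Worked example (illustration only): 15-minute tracker data gets
     # roll_periods = 5 below, i.e. a 75-minute window; fixed mounts and
     # >=30-minute data get roll_periods = 3. Within a window, the rolling
     # range is (max - min) / ((max + min) / 2) * 100, so a nearly flat
     # clipped stretch with max_roll = 100.0 and min_roll = 99.9 scores
     # ~0.1%, under the default rolling_range_max_cutoff of 0.2%.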
     if roll_periods is None:
-        if (mounting_type == "single_axis_tracking") & \
-                (time_series_sampling_frequency < 30):
+        if (mounting_type == "single_axis_tracking") & (
+            time_series_sampling_frequency < 30
+        ):
             roll_periods = 5
         else:
             roll_periods = 3
     # Replace data below 10% of each day's maximum with NaNs
-    daily = 0.1 * power_ac.resample('D').max()
-    power_ac['ten_percent_daily'] = daily.reindex(index=power_ac.index,
-                                                  method='ffill')
-    power_ac.loc[power_ac['value'] < power_ac['ten_percent_daily'],
-                 'value'] = np.nan
-    power_ac = power_ac['value']
+    daily = 0.1 * power_ac.resample("D").max()
+    power_ac["ten_percent_daily"] = daily.reindex(index=power_ac.index, method="ffill")
+    power_ac.loc[power_ac["value"] < power_ac["ten_percent_daily"], "value"] = np.nan
+    power_ac = power_ac["value"]
     # Calculate the maximum rolling range for the time series.
     rolling_range_max = _calculate_max_rolling_range(power_ac, roll_periods)
     # Determine clipping values based on the maximum rolling range in
     # the rolling window, and the user-specified rolling range threshold
-    roll_clip_mask = (rolling_range_max < rolling_range_max_cutoff)
+    roll_clip_mask = rolling_range_max < rolling_range_max_cutoff
     # Set values within roll_periods values from a True instance
     # as True as well
-    clipping = (roll_clip_mask.rolling(roll_periods).sum() >= 1)
+    clipping = roll_clip_mask.rolling(roll_periods).sum() >= 1
     # High frequency data was resampled to a 15-minute median.
     # The following lines apply the 15-minute clipping filter back to the
     # original high frequency data, resulting in a clipping filter on the
     # original data.
-    if (original_time_series_sampling_frequency < 10):
-        clipping = clipping.reindex(index=power_ac_copy.index,
-                                    method='ffill')
+    if original_time_series_sampling_frequency < 10:
+        clipping = clipping.reindex(index=power_ac_copy.index, method="ffill")
         # Subset the series where clipping filter == True
         clip_pwr = power_ac_copy[clipping]
-        clip_pwr = clip_pwr.reindex(index=power_ac_copy.index,
-                                    fill_value=np.nan)
+        clip_pwr = clip_pwr.reindex(index=power_ac_copy.index, fill_value=np.nan)
         # Flag as clipping any values between the daily clipping min and max
         # thresholds. This is done specifically to capture the noise in
         # high frequency data sets.
-        daily_mean = clip_pwr.resample('D').mean()
-        df_daily = daily_mean.to_frame(name='mean')
-        df_daily['clipping_max'] = clip_pwr.groupby(pd.Grouper(freq='D')
-                                                    ).quantile(0.99)
-        df_daily['clipping_min'] = clip_pwr.groupby(pd.Grouper(freq='D')
-                                                    ).quantile(0.075)
-        daily_clipping_max = df_daily['clipping_max'].reindex(
-            index=power_ac_copy.index, method='ffill')
-        daily_clipping_min = df_daily['clipping_min'].reindex(
-            index=power_ac_copy.index, method='ffill')
+        daily_mean = clip_pwr.resample("D").mean()
+        df_daily = daily_mean.to_frame(name="mean")
+        df_daily["clipping_max"] = clip_pwr.groupby(pd.Grouper(freq="D")).quantile(0.99)
+        df_daily["clipping_min"] = clip_pwr.groupby(pd.Grouper(freq="D")).quantile(
+            0.075
+        )
+        daily_clipping_max = df_daily["clipping_max"].reindex(
+            index=power_ac_copy.index, method="ffill"
+        )
+        daily_clipping_min = df_daily["clipping_min"].reindex(
+            index=power_ac_copy.index, method="ffill"
+        )
     else:
         # Find the maximum and minimum power_ac level where clipping is
         # detected each day.
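         # Illustration (hypothetical numbers, not library code): if clipping
         # is detected between 980 W and 1000 W on a day, then
         # clipping_difference = (1000 - 980) / 1000 = 0.02 <= 0.025 and every
         # point between 980 W and 1000 W that day is flagged below; for wider
         # bands only points within +/-0.25% of either bound are flagged.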
-        clipping = clipping.reindex(index=power_ac_copy.index,
-                                    method='ffill')
+        clipping = clipping.reindex(index=power_ac_copy.index, method="ffill")
         clip_pwr = power_ac_copy[clipping]
-        clip_pwr = clip_pwr.reindex(index=power_ac_copy.index,
-                                    fill_value=np.nan)
-        daily_clipping_max = clip_pwr.resample('D').max()
-        daily_clipping_min = clip_pwr.resample('D').min()
+        clip_pwr = clip_pwr.reindex(index=power_ac_copy.index, fill_value=np.nan)
+        daily_clipping_max = clip_pwr.resample("D").max()
+        daily_clipping_min = clip_pwr.resample("D").min()
         daily_clipping_min = daily_clipping_min.reindex(
-            index=power_ac_copy.index, method='ffill')
+            index=power_ac_copy.index, method="ffill"
+        )
         daily_clipping_max = daily_clipping_max.reindex(
-            index=power_ac_copy.index, method='ffill')
+            index=power_ac_copy.index, method="ffill"
+        )
     # Flag as clipping all values that fall between the daily maximum and
     # minimum power_ac levels where clipping was detected.
-    clipping_difference = (daily_clipping_max -
-                           daily_clipping_min)/daily_clipping_max
-    final_clip = ((daily_clipping_min <= power_ac_copy) &
-                  (power_ac_copy <= daily_clipping_max) &
-                  (clipping_difference <= 0.025)) \
-        | ((power_ac_copy <= daily_clipping_max*1.0025) &
-           (power_ac_copy >= daily_clipping_max*0.9975) &
-           (clipping_difference > 0.025))\
-        | ((power_ac_copy <= daily_clipping_min*1.0025) &
-           (power_ac_copy >= daily_clipping_min*0.9975) &
-           (clipping_difference > 0.025))
-    final_clip = final_clip.reindex(index=power_ac_copy.index,
-                                    fill_value=False)
+    clipping_difference = (daily_clipping_max - daily_clipping_min) / daily_clipping_max
+    final_clip = (
+        (
+            (daily_clipping_min <= power_ac_copy)
+            & (power_ac_copy <= daily_clipping_max)
+            & (clipping_difference <= 0.025)
+        )
+        | (
+            (power_ac_copy <= daily_clipping_max * 1.0025)
+            & (power_ac_copy >= daily_clipping_max * 0.9975)
+            & (clipping_difference > 0.025)
+        )
+        | (
+            (power_ac_copy <= daily_clipping_min * 1.0025)
+            & (power_ac_copy >= daily_clipping_min * 0.9975)
+            & (clipping_difference > 0.025)
+        )
+    )
+    final_clip = final_clip.reindex(index=power_ac_copy.index, fill_value=False)
     # Check for an overall clipping threshold that should apply to all data
     clip_power_ac = power_ac_copy[final_clip]
-    final_clip = _apply_overall_clipping_threshold(power_cleaned,
-                                                   final_clip,
-                                                   clip_power_ac)
+    final_clip = _apply_overall_clipping_threshold(
+        power_cleaned, final_clip, clip_power_ac
+    )
     return ~final_clip


@@ -664,54 +697,65 @@ def _calculate_xgboost_model_features(df, sampling_frequency):
         model.
""" # Min-max normalize - max_min_diff = (df['value'].max() - df['value'].min()) - df['scaled_value'] = (df['value'] - df['value'].min()) / max_min_diff + max_min_diff = df["value"].max() - df["value"].min() + df["scaled_value"] = (df["value"] - df["value"].min()) / max_min_diff if sampling_frequency < 10: rolling_window = 5 elif (sampling_frequency >= 10) and (sampling_frequency < 60): rolling_window = 3 else: rolling_window = 2 - df['rolling_average'] = df['scaled_value']\ - .rolling(window=rolling_window, center=True).mean() + df["rolling_average"] = ( + df["scaled_value"].rolling(window=rolling_window, center=True).mean() + ) # First-order derivative - df['first_order_derivative_backward'] = df.scaled_value.diff() - df['first_order_derivative_forward'] = df.scaled_value.shift(-1).diff() + df["first_order_derivative_backward"] = df.scaled_value.diff() + df["first_order_derivative_forward"] = df.scaled_value.shift(-1).diff() # First order derivative for the rolling average - df['first_order_derivative_backward_rolling_avg'] = \ - df.rolling_average.diff() - df['first_order_derivative_forward_rolling_avg'] = \ - df.rolling_average.shift(-1).diff() + df["first_order_derivative_backward_rolling_avg"] = df.rolling_average.diff() + df["first_order_derivative_forward_rolling_avg"] = df.rolling_average.shift( + -1 + ).diff() # Calculate the maximum rolling range for the power or energy time series. - df['deriv_max'] = _calculate_max_rolling_range( - power_ac=df['scaled_value'], roll_periods=rolling_window) + df["deriv_max"] = _calculate_max_rolling_range( + power_ac=df["scaled_value"], roll_periods=rolling_window + ) # Get the max value for the day and see how each value compares - df['date'] = list(pd.to_datetime(pd.Series(df.index)).dt.date) - df['daily_max'] = df.groupby(['date'])['scaled_value'].transform(max) + df["date"] = list(pd.to_datetime(pd.Series(df.index)).dt.date) + df["daily_max"] = df.groupby(["date"])["scaled_value"].transform(max) # Get percentage of daily max - df['percent_daily_max'] = df['scaled_value'] / (df['daily_max'] + .00001) + df["percent_daily_max"] = df["scaled_value"] / (df["daily_max"] + 0.00001) # Get the standard deviation, median and mean of the first order # derivative over the rolling_window period - df['deriv_backward_rolling_stdev'] = \ - df['first_order_derivative_backward']\ - .rolling(window=rolling_window, center=True).std() - df['deriv_backward_rolling_mean'] = \ - df['first_order_derivative_backward']\ - .rolling(window=rolling_window, center=True).mean() - df['deriv_backward_rolling_median'] = \ - df['first_order_derivative_backward']\ - .rolling(window=rolling_window, center=True).median() - df['deriv_backward_rolling_max'] = \ - df['first_order_derivative_backward']\ - .rolling(window=rolling_window, center=True).max() - df['deriv_backward_rolling_min'] = \ - df['first_order_derivative_backward']\ - .rolling(window=rolling_window, center=True).min() + df["deriv_backward_rolling_stdev"] = ( + df["first_order_derivative_backward"] + .rolling(window=rolling_window, center=True) + .std() + ) + df["deriv_backward_rolling_mean"] = ( + df["first_order_derivative_backward"] + .rolling(window=rolling_window, center=True) + .mean() + ) + df["deriv_backward_rolling_median"] = ( + df["first_order_derivative_backward"] + .rolling(window=rolling_window, center=True) + .median() + ) + df["deriv_backward_rolling_max"] = ( + df["first_order_derivative_backward"] + .rolling(window=rolling_window, center=True) + .max() + ) + df["deriv_backward_rolling_min"] = ( + 
df["first_order_derivative_backward"] + .rolling(window=rolling_window, center=True) + .min() + ) return df -def xgboost_clip_filter(power_ac, - mounting_type='fixed'): +def xgboost_clip_filter(power_ac, mounting_type="fixed"): """ This function generates the features to run through the XGBoost clipping model, runs the data through the model, and generates @@ -742,113 +786,131 @@ def xgboost_clip_filter(power_ac, Specialists Conference (PVSC). DOI: 10.1109/PVSC43889.2021.9518733. """ # Throw a warning that this is still an experimental filter - warnings.warn("The XGBoost filter is an experimental clipping filter " - "that is still under development. The API, results, and " - "default behaviors may change in future releases (including " - "MINOR and PATCH). Use at your own risk!") + warnings.warn( + "The XGBoost filter is an experimental clipping filter " + "that is still under development. The API, results, and " + "default behaviors may change in future releases (including " + "MINOR and PATCH). Use at your own risk!" + ) # Load in the XGBoost model xgboost_clipping_model = _load_xgboost_clipping_model() # Format the power or energy time series - power_ac, index_name = _format_clipping_time_series(power_ac, - mounting_type) + power_ac, index_name = _format_clipping_time_series(power_ac, mounting_type) # Test if the data sampling frequency is variable, and flag it if the time # series sampling frequency is less than 95% consistent. _check_data_sampling_frequency(power_ac) # Get the most common sampling frequency - sampling_frequency = int((power_ac.index.to_series().diff() / pd.Timedelta('60s')).mode()[0]) + sampling_frequency = int( + (power_ac.index.to_series().diff() / pd.Timedelta("60s")).mode()[0] + ) freq_string = str(sampling_frequency) + "T" # Min-max normalize # Resample the series based on the most common sampling frequency - power_ac_interpolated = rdtools.normalization.interpolate(power_ac, - freq_string) + power_ac_interpolated = rdtools.normalization.interpolate(power_ac, freq_string) # Convert the Pandas series to a dataframe. power_ac_df = power_ac_interpolated.to_frame() # Get the sampling frequency (as a continuous feature variable) - power_ac_df['sampling_frequency'] = sampling_frequency + power_ac_df["sampling_frequency"] = sampling_frequency # If the data sampling frequency of the series is more frequent than # once every five minute, resample at 5-minute intervals before # plugging into the model if sampling_frequency < 5: - power_ac_df = power_ac_df.resample('5T').mean() - power_ac_df['sampling_frequency'] = 5 + power_ac_df = power_ac_df.resample("5T").mean() + power_ac_df["sampling_frequency"] = 5 # Add mounting type as a column - power_ac_df['mounting_config'] = mounting_type + power_ac_df["mounting_config"] = mounting_type # Generate the features for the model. 
-    power_ac_df = _calculate_xgboost_model_features(power_ac_df,
-                                                    sampling_frequency)
+    power_ac_df = _calculate_xgboost_model_features(power_ac_df, sampling_frequency)
     # Convert single-axis tracking/fixed tilt to a boolean variable
-    power_ac_df.loc[power_ac_df['mounting_config'] == "single_axis_tracking",
-                    'mounting_config_bool'] = 1
-    power_ac_df.loc[power_ac_df['mounting_config'] == 'fixed',
-                    'mounting_config_bool'] = 0
+    power_ac_df.loc[
+        power_ac_df["mounting_config"] == "single_axis_tracking", "mounting_config_bool"
+    ] = 1
+    power_ac_df.loc[
+        power_ac_df["mounting_config"] == "fixed", "mounting_config_bool"
+    ] = 0
     # Subset the dataframe to only include model inputs
-    power_ac_df = power_ac_df[['first_order_derivative_backward',
-                               'first_order_derivative_forward',
-                               'first_order_derivative_backward_rolling_avg',
-                               'first_order_derivative_forward_rolling_avg',
-                               'sampling_frequency',
-                               'mounting_config_bool', 'scaled_value',
-                               'rolling_average', 'daily_max',
-                               'percent_daily_max', 'deriv_max',
-                               'deriv_backward_rolling_stdev',
-                               'deriv_backward_rolling_mean',
-                               'deriv_backward_rolling_median',
-                               'deriv_backward_rolling_min',
-                               'deriv_backward_rolling_max']].dropna()
+    power_ac_df = power_ac_df[
+        [
+            "first_order_derivative_backward",
+            "first_order_derivative_forward",
+            "first_order_derivative_backward_rolling_avg",
+            "first_order_derivative_forward_rolling_avg",
+            "sampling_frequency",
+            "mounting_config_bool",
+            "scaled_value",
+            "rolling_average",
+            "daily_max",
+            "percent_daily_max",
+            "deriv_max",
+            "deriv_backward_rolling_stdev",
+            "deriv_backward_rolling_mean",
+            "deriv_backward_rolling_median",
+            "deriv_backward_rolling_min",
+            "deriv_backward_rolling_max",
+        ]
+    ].dropna()
     # Run the power_ac_df dataframe through the XGBoost ML model,
     # and return boolean outputs
-    xgb_predictions = pd.Series(xgboost_clipping_model.predict(
-        power_ac_df).astype(bool))
+    xgb_predictions = pd.Series(
+        xgboost_clipping_model.predict(power_ac_df).astype(bool)
+    )
     # Add datetime as an index
     xgb_predictions.index = power_ac_df.index
     # Reindex with the original data index. Re-adjusts to original
     # data frequency.
-    xgb_predictions = xgb_predictions.reindex(index=power_ac.index,
-                                              method='ffill')
+    xgb_predictions = xgb_predictions.reindex(index=power_ac.index, method="ffill")
     xgb_predictions = xgb_predictions.fillna(False)
     # Regenerate the features with the original sampling frequency
     # (pre-resampling or interpolation).
     power_ac_df = power_ac.to_frame()
-    power_ac_df = _calculate_xgboost_model_features(power_ac_df,
-                                                    sampling_frequency)
+    power_ac_df = _calculate_xgboost_model_features(power_ac_df, sampling_frequency)
     # Add back in XGB predictions for the original dataframe
-    power_ac_df['xgb_predictions'] = xgb_predictions.astype(bool)
-    power_ac_df_clipping = power_ac_df[power_ac_df['xgb_predictions']
-                                       .fillna(False)]
+    power_ac_df["xgb_predictions"] = xgb_predictions.astype(bool)
+    power_ac_df_clipping = power_ac_df[power_ac_df["xgb_predictions"].fillna(False)]
     # Treat everything between the daily max and min values found for
     # clipping each day as clipping.
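     # For intuition (hypothetical numbers, not library code): with
     # daily_clipping_max = 0.97 in scaled units, points up to
     # 0.97 * 1.0025 ~= 0.9724 still count as clipping; for sub-5-minute data
     # the lower cutoff is the larger of daily_clipping_min and
     # 0.96 * daily_clipping_max.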
-    power_ac_df_clipping_max = power_ac_df_clipping['scaled_value']\
-        .resample('D').max()
-    power_ac_df_clipping_min = power_ac_df_clipping['scaled_value']\
-        .resample('D').min()
-    power_ac_df['daily_clipping_min'] = power_ac_df_clipping_min.reindex(
-        index=power_ac_df.index, method='ffill')
-    power_ac_df['daily_clipping_max'] = power_ac_df_clipping_max.reindex(
-        index=power_ac_df.index, method='ffill')
+    power_ac_df_clipping_max = power_ac_df_clipping["scaled_value"].resample("D").max()
+    power_ac_df_clipping_min = power_ac_df_clipping["scaled_value"].resample("D").min()
+    power_ac_df["daily_clipping_min"] = power_ac_df_clipping_min.reindex(
+        index=power_ac_df.index, method="ffill"
+    )
+    power_ac_df["daily_clipping_max"] = power_ac_df_clipping_max.reindex(
+        index=power_ac_df.index, method="ffill"
+    )
     if sampling_frequency < 5:
-        power_ac_df['daily_clipping_max_threshold'] = \
-            (power_ac_df['daily_clipping_max'] * .96)
-        power_ac_df['clipping cutoff'] = \
-            power_ac_df[['daily_clipping_min',
-                         'daily_clipping_max_threshold']].max(axis=1)
-        final_clip = ((power_ac_df['clipping cutoff'] <=
-                       power_ac_df['scaled_value'])
-                      & (power_ac_df['percent_daily_max'] >= .9)
-                      & (power_ac_df['scaled_value'] <=
-                         power_ac_df['daily_clipping_max'] * 1.0025)
-                      & (power_ac_df['scaled_value'] >= .1))
+        power_ac_df["daily_clipping_max_threshold"] = (
+            power_ac_df["daily_clipping_max"] * 0.96
+        )
+        power_ac_df["clipping cutoff"] = power_ac_df[
+            ["daily_clipping_min", "daily_clipping_max_threshold"]
+        ].max(axis=1)
+        final_clip = (
+            (power_ac_df["clipping cutoff"] <= power_ac_df["scaled_value"])
+            & (power_ac_df["percent_daily_max"] >= 0.9)
+            & (
+                power_ac_df["scaled_value"]
+                <= power_ac_df["daily_clipping_max"] * 1.0025
+            )
+            & (power_ac_df["scaled_value"] >= 0.1)
+        )
     else:
-        final_clip = ((power_ac_df['daily_clipping_min'] <=
-                       power_ac_df['scaled_value'])
-                      & (power_ac_df['percent_daily_max'] >= .95)
-                      & (power_ac_df['scaled_value'] <=
-                         power_ac_df['daily_clipping_max'] * 1.0025)
-                      & (power_ac_df['scaled_value'] >= .1))
+        final_clip = (
+            (power_ac_df["daily_clipping_min"] <= power_ac_df["scaled_value"])
+            & (power_ac_df["percent_daily_max"] >= 0.95)
+            & (
+                power_ac_df["scaled_value"]
+                <= power_ac_df["daily_clipping_max"] * 1.0025
+            )
+            & (power_ac_df["scaled_value"] >= 0.1)
+        )
     final_clip = final_clip.reindex(index=power_ac.index, fill_value=False)
     return ~(final_clip.astype(bool))


-def two_way_window_filter(series, roll_period=pd.to_timedelta('7 Days'), outlier_threshold=0.03):
-    '''
+
+def two_way_window_filter(
+    series, roll_period=pd.to_timedelta("7 Days"), outlier_threshold=0.03
+):
+    """
     Removes outliers based on forward- and backward-looking windows of the
     rolling median. Points farther than outlier_threshold from both the
     forward- and backward-looking medians are excluded by the filter.
@@ -860,40 +922,43 @@
         The window to use for backward and forward rolling medians for
         detecting outliers.
     outlier_threshold : float, default 0.03
         Fractional deviation from the rolling medians beyond which a point
         is excluded (0.03 corresponds to 3%).
-    '''
+    """
-    series = series/series.quantile(0.99)
-    backward_median = series.rolling(roll_period, min_periods=5, closed='both').median()
-    forward_median = series.loc[::-1].rolling(roll_period, min_periods=5, closed='both').median()
+    series = series / series.quantile(0.99)
+    backward_median = series.rolling(roll_period, min_periods=5, closed="both").median()
+    forward_median = (
+        series.loc[::-1].rolling(roll_period, min_periods=5, closed="both").median()
+    )

-    backward_dif = abs(series-backward_median)
-    forward_dif = abs(series-forward_median)
+    backward_dif = abs(series - backward_median)
+    forward_dif = abs(series - forward_median)
     # This is a change from Matt's original logic, which can exclude
     # points with a NaN median
     backward_dif.fillna(0, inplace=True)
     forward_dif.fillna(0, inplace=True)

-    dif_min=backward_dif.combine(forward_dif,min,0)
+    dif_min = backward_dif.combine(forward_dif, min, 0)
+
+    mask = dif_min < outlier_threshold

-    mask=dif_min<outlier_threshold
     return mask


-def hampel_filter(vals, k='14d', t0=3):
-    '''
+
+def hampel_filter(vals, k="14d", t0=3):
+    """
     Hampel outlier filter primarily applied on daily normalized data but
     broadly applicable.

     Parameters
@@ -910,54 +975,55 @@
     pandas.Series
         Boolean Series of whether the given measurement is within 3 sigma of
         the median. False points indicate outliers to be removed.
-    '''
+    """
     # Hampel Filter
     L = 1.4826
     rolling_median = vals.rolling(k, center=True, min_periods=1).median()
-    difference = np.abs(rolling_median-vals)
+    difference = np.abs(rolling_median - vals)
     median_abs_deviation = difference.rolling(k, center=True, min_periods=1).median()
     threshold = t0 * L * median_abs_deviation
     return difference <= threshold


 def _tukey_fence(series, k=1.5):
-    'Calculates the upper and lower tukey fences from a pandas series'
+    "Calculates the upper and lower Tukey fences from a pandas series"
     p25 = series.quantile(0.25)
     p75 = series.quantile(0.75)
     iqr = p75 - p25
-    upper_fence = k*iqr + p75
-    lower_fence = p25 - 1.5*iqr
+    upper_fence = k * iqr + p75
+    lower_fence = p25 - k * iqr
     return lower_fence, upper_fence


-def directional_tukey_filter(series, roll_period=pd.to_timedelta('7 Days'), k=1.5):
-    '''
+def directional_tukey_filter(series, roll_period=pd.to_timedelta("7 Days"), k=1.5):
+    """
     Performs a forward- and backward-looking rolling Tukey filter.
     Points need only pass either the forward- or backward-looking filter
     to be kept.
-    '''
-    backward_median = series.rolling(roll_period, min_periods=5, closed='both').median()
-    forward_median = series.loc[::-1].rolling(roll_period, min_periods=5, closed='both').median()
+    """
+    backward_median = series.rolling(roll_period, min_periods=5, closed="both").median()
+    forward_median = (
+        series.loc[::-1].rolling(roll_period, min_periods=5, closed="both").median()
+    )
     backward_dif = series - backward_median
     forward_dif = series - forward_median
     backward_dif_lower, backward_dif_upper = _tukey_fence(backward_dif, k)
     forward_dif_lower, forward_dif_upper = _tukey_fence(forward_dif, k)
-    mask = (
-        ((forward_dif > forward_dif_lower) & (forward_dif < forward_dif_upper)) |
-        ((backward_dif > backward_dif_lower) & (backward_dif < backward_dif_upper))
-    )
+    mask = ((forward_dif > forward_dif_lower) & (forward_dif < forward_dif_upper)) | (
+        (backward_dif > backward_dif_lower) & (backward_dif < backward_dif_upper)
+    )
     return mask


 def hour_angle_filter(series, lat, lon, min_hour_angle=-30, max_hour_angle=30):
-    '''
-    Creates a filter based on the hour angle of the sun (15 degrees per hour)
-    '''
+    """
+    Creates a filter based on the hour angle of the sun (15 degrees per hour).
+    """
     times = series.index
     spa = pvlib.solarposition.get_solarposition(times, lat, lon)
-    eot = spa['equation_of_time']
+    eot = spa["equation_of_time"]
     hour_angle = pvlib.solarposition.hour_angle(times, lon, eot)
     hour_angle = pd.Series(hour_angle, index=times)
     mask = (hour_angle >= min_hour_angle) & (hour_angle <= max_hour_angle)
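     # Worked example (illustration only): the hour angle is 0 degrees at
     # solar noon and advances 15 degrees per hour, so the default -30 to +30
     # degree window keeps roughly 10:00 through 14:00 solar time:
     #   10:00 solar time -> hour angle = -30 degrees (kept)
     #   15:00 solar time -> hour angle = +45 degrees (filtered out)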