-
Notifications
You must be signed in to change notification settings - Fork 0
/
daytime.py
362 lines (328 loc) · 14.9 KB
/
daytime.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
"""Functions for identifying daytime"""
import numpy as np
import pandas as pd
from pvanalytics import util
from pandas.tseries.frequencies import to_offset
def _rolling_by_minute(data, days, f):
    """Apply `f` to a centered rolling window at each minute of the day.

    Values are grouped by their minute-of-day (``hour * 60 + minute`` of
    the index), and within each group a centered rolling window of
    `days` consecutive values is built (one value per day when the data
    has a single value per clock time per day). A window needs only one
    observation to produce a result.

    Parameters
    ----------
    data : Series
        Series whose index exposes ``hour`` and ``minute`` attributes.
    days : int
        Size of the rolling window within each minute-of-day group.
    f : callable
        Called with the grouped rolling object; must return a Series
        (for example ``lambda r: r.median()``).

    Returns
    -------
    Series
        The result of `f` with the minute-of-day index level dropped,
        sorted by index.
    """
    minute_of_day = data.index.hour * 60 + data.index.minute
    windows = data.groupby(minute_of_day).rolling(
        window=days,
        center=True,
        min_periods=1
    )
    # `f` yields a result keyed by (minute-of-day, original index); drop
    # the grouping level and put the values back in index order.
    per_minute = f(windows)
    return per_minute.reset_index(0, drop=True).sort_index()
def _run_lengths(series):
    """Return, for every value, the length of the run it belongs to.

    A run is a maximal stretch of adjacent equal values. Each position
    in the result holds the number of values in the run containing that
    position.

    For example ``[True, True, True]`` gives ``[3, 3, 3]`` and
    ``[True, False, False, True, True, False]`` gives
    ``[1, 2, 2, 2, 2, 1]``.

    Parameters
    ----------
    series : Series
        Values to scan for runs.

    Returns
    -------
    Series
        Run length for each position, on the same index as `series`.
    """
    # A run begins wherever a value differs from the one before it; the
    # running total of those starts gives every run its own label.
    starts_new_run = series.ne(series.shift(1))
    run_id = starts_new_run.cumsum()
    return run_id.groupby(run_id).transform('count')
def _correct_if_invalid(series, invalid, correction_window):
    """Replace values flagged `invalid` with the majority of nearby days.

    For every time marked `invalid`, the truth value in `series` is
    replaced by the truth value held at the same minute of the day by
    the majority of the surrounding days. Times not marked `invalid`
    keep their original value.

    Parameters
    ----------
    series : Series
        Boolean time series to correct.
    invalid : Series
        Boolean series, True where `series` should be replaced.
    correction_window : int
        Number of surrounding days examined for the majority vote.

    Returns
    -------
    Series
        Boolean series combining the kept and the corrected values.
    """
    def _mostly_true(windows):
        # Share of True values in each window, compared against one half.
        return windows.sum() / windows.count() > 0.5

    majority = _rolling_by_minute(
        series,
        days=correction_window,
        f=_mostly_true
    )
    kept = ~invalid & series
    replaced = invalid & majority
    return kept | replaced
def _correct_midday_errors(night, minutes_per_value, hours_min,
                           correction_window):
    """Correct day/night periods that are too short to be plausible.

    Runs of identical values in `night` lasting `hours_min` hours or
    less are treated as misclassified and are replaced by the majority
    value at the same time on the surrounding days.

    Parameters
    ----------
    night : Series
        Boolean time series, True for night.
    minutes_per_value : float
        Number of minutes represented by each value in `night`.
    hours_min : float
        Runs lasting this many hours or fewer are flagged.
    correction_window : int
        Number of surrounding days used for the correction.

    Returns
    -------
    Series
        Corrected boolean night mask.
    """
    run_minutes = _run_lengths(night) * minutes_per_value
    too_short = run_minutes <= hours_min * 60
    return _correct_if_invalid(night, too_short, correction_window)
def _correct_edge_of_day_errors(night, minutes_per_value,
                                day_length_difference_max,
                                day_length_window, correction_window):
    """Correct days whose daytime period is too short.

    Because daylight savings shifts happen at night, night length cannot
    be examined directly. Instead, too-short days are identified and the
    whole calendar day is flagged for correction with the majority value
    at the same time on the surrounding days. This may slightly reduce
    the accuracy of the sunrise/sunset boundary on those days: even a
    boundary that was marked correctly at one end of the day is
    replaced.

    Parameters
    ----------
    night : Series
        Boolean time series, True for night.
    minutes_per_value : float
        Number of minutes represented by each value in `night`.
    day_length_difference_max : float
        A day is flagged when its length is more than this many minutes
        below the rolling median day length.
    day_length_window : int
        Length, in days, of the rolling window used for the median day
        length.
    correction_window : int
        Number of surrounding days used for the correction.

    Returns
    -------
    Series
        Corrected boolean night mask.
    """
    is_day = (~night).astype(int)
    # The running count of night values stays constant across a stretch
    # of daytime values, so it labels each daytime stretch (together
    # with the night value that precedes it).
    period_id = night.cumsum()
    day_value_count = is_day.groupby(period_id).transform('sum')
    day_length = (1 + day_value_count) * minutes_per_value
    # Blank out night-time entries so they don't interfere with the
    # median day length.
    day_length = day_length.mask(night)
    median_length = day_length.rolling(
        window=f"{day_length_window}D"
    ).median()
    # Days shorter than the median by more than the allowed difference.
    is_short = day_length < (median_length - day_length_difference_max)
    # Flag the entire calendar date when any of its values is short.
    invalid = is_short.groupby(is_short.index.date).transform(
        lambda flags: any(flags)
    )
    return _correct_if_invalid(night, invalid, correction_window)
def _ffill_short_periods(night, minutes_per_value, hours_min):
    """Forward fill day/night periods that are implausibly short.

    Runs of identical values lasting `hours_min` hours or less are
    blanked out and then filled with the last preceding value. This is
    a final step for picking up cases that were not caught by
    ``_correct_midday_errors()`` or ``_correct_edge_of_day_errors()``.

    The input series is left untouched; a new series is returned.

    Parameters
    ----------
    night : Series
        Boolean time series, True for night. The index must expose a
        ``date`` attribute (e.g. a DatetimeIndex).
    minutes_per_value : float
        Number of minutes represented by each value in `night`.
    hours_min : float
        Runs lasting this many hours or fewer are replaced.

    Returns
    -------
    Series
        Copy of `night` with the short runs forward filled.
    """
    if night.empty:
        # Nothing to correct, and the date min/max below is undefined
        # for an empty index.
        return night.copy()
    # identify periods of time that appear to switch from night to day
    # (or day to night) on too short a time scale to be reasonable.
    invalid = _run_lengths(night) * minutes_per_value <= hours_min * 60
    # Throw out anything on the first or last 2 day period, as only part of
    # this period may be represented. 2 days is used here as the first or
    # last day may only have one timestamp present (which is not uncommon)
    dates = invalid.index.date
    one_day = pd.Timedelta(days=1)
    near_edges = ((dates <= dates.min() + one_day)
                  | (dates >= dates.max() - one_day))
    invalid.loc[near_edges] = False
    # Blank the invalid periods on a copy (`mask` does not modify the
    # caller's series), then forward fill them.
    return night.mask(invalid).ffill()
def _filter_and_normalize(series, outliers):
    """Remove outliers, clamp negatives to zero and min-max normalize.

    The input series is not modified.

    Parameters
    ----------
    series : Series
        Numeric series to normalize.
    outliers : Series or None
        Boolean mask, True where `series` holds an outlier. Flagged
        values become NaN. If None, no values are removed.

    Returns
    -------
    Series
        ``(x - min) / (max - min)`` of the cleaned data. When the
        cleaned data has zero range (max equals min) the division is
        0/0 and the result is NaN.
    """
    # `mask` and `clip` both return new objects, so the caller's series
    # is never written to.
    cleaned = series if outliers is None else series.mask(outliers)
    cleaned = cleaned.clip(lower=0)
    lowest = cleaned.min()
    return (cleaned - lowest) / (cleaned.max() - lowest)
def _freqstr_to_minutes(freqstr):
    """Return the number of minutes spanned by a frequency string.

    Parameters
    ----------
    freqstr : str
        Frequency string, passed unchanged to
        ``util.freq_to_timedelta``.

    Returns
    -------
    float
        Duration of one period in minutes.
    """
    # NOTE(review): assumes `util.freq_to_timedelta` returns a
    # timedelta-like object (the previous code read its `.seconds`).
    # `.seconds` holds only the sub-day component of a timedelta, so a
    # spacing of one day or more was reported incorrectly (e.g. 0 for
    # exactly one day); `total_seconds()` covers the whole duration.
    return util.freq_to_timedelta(freqstr).total_seconds() / 60
def power_or_irradiance(series, outliers=None,
                        low_value_threshold=0.003,
                        low_median_threshold=0.0015,
                        low_diff_threshold=0.0005, median_days=7,
                        clipping=None, freq=None,
                        correction_window=31, hours_min=5,
                        day_length_difference_max=30,
                        day_length_window=14,
                        nullify_repeat_count=None):
    """Return True for values that are during the day.

    After removing outliers and normalizing the data, a time is
    classified as night when two of the following three criteria are
    satisfied:

    - near-zero value
    - near-zero first-order derivative
    - near-zero rolling median at the same time over the surrounding
      week (see `median_days`)

    Mid-day times where power goes near zero or
    stops changing may be incorrectly classified as night. To correct
    these errors, night or day periods with duration that is too long or
    too short are identified, and times in these periods are re-classified
    to have the majority value at the same time on preceding and
    following days (as set by `correction_window`).

    Finally any values that are True in `clipping` are marked as day.

    Parameters
    ----------
    series : Series
        Time series of power or irradiance.
    outliers : Series, optional
        Boolean time series with True for values in `series` that are
        outliers.
    low_value_threshold : float, default 0.003
        Maximum normalized power or irradiance value for a time to be
        considered night.
    low_median_threshold : float, default 0.0015
        Maximum rolling median of power or irradiance for a time to be
        considered night.
    low_diff_threshold : float, default 0.0005
        Maximum derivative of normalized power or irradiance for a time
        to be considered night.
    median_days : int, default 7
        Number of days to use to calculate the rolling median at each
        minute. [days]
    clipping : Series, optional
        True when clipping indicated. Any values where clipping is
        indicated are automatically considered 'daytime'.
    freq : str, optional
        A pandas freqstr specifying the expected timestamp spacing for
        the series. If None, the frequency will be inferred from the index.
    correction_window : int, default 31
        Number of adjacent days to examine when correcting
        day/night classification errors. [days]
    hours_min : float, default 5
        Minimum number of hours in a contiguous period of day or
        night. A day/night period shorter than `hours_min` is
        flagged for error correction. [hours]
    day_length_difference_max : float, default 30
        Days with length that is `day_length_difference_max` minutes less
        than the median length of surrounding days are flagged for
        corrections.
    day_length_window : int, default 14
        The length of the rolling window used for calculating the
        median length of the day when correcting errors in the morning
        or afternoon. [days]
    nullify_repeat_count : optional
        Not used by this function; accepted so that existing callers
        passing it keep working.

    Returns
    -------
    Series
        Boolean time series with True for times that are during the
        day.

    Notes
    -----
    ``NA`` values are treated like zeros.

    Derived from the PVFleets QA Analysis project.
    """
    # `fillna` returns a new series, so the caller's data is not altered
    # by the filtering below.
    series = series.fillna(value=0)
    series_norm = _filter_and_normalize(series, outliers).fillna(value=0)
    minutes_per_value = _freqstr_to_minutes(
        freq or pd.infer_freq(series.index)
    )
    first_order_diff = series_norm.diff() / minutes_per_value
    rolling_median = _rolling_by_minute(
        series_norm,
        days=median_days,
        f=lambda rolling: rolling.median()
    )

    # Night-time if two of the following are satisfied:
    # - Near-zero value
    # - Near-zero first-order derivative
    # - Near-zero rolling median
    low_value = series_norm <= low_value_threshold
    low_median = rolling_median <= low_median_threshold
    low_diff = abs(first_order_diff) <= low_diff_threshold
    night = ((low_value & low_diff)
             | (low_value & low_median)
             | (low_diff & low_median))

    # Fix erroneous classifications (e.g. midday outages where power
    # goes to 0 and stays there for several hours, clipping classified
    # as night, and night-time periods that are too long)
    night_corrected_midday = _correct_midday_errors(
        night, minutes_per_value, hours_min, correction_window
    )
    # Anything flagged as clipping is forced to day. `clipping` is
    # tested against None explicitly: evaluating a Series in a boolean
    # context (`clipping or False`) raises ValueError.
    if clipping is None:
        night_corrected_clipping = night_corrected_midday
    else:
        night_corrected_clipping = ~(clipping
                                     | (~night_corrected_midday))
    night_corrected_edges = _correct_edge_of_day_errors(
        night_corrected_clipping,
        minutes_per_value,
        day_length_difference_max,
        day_length_window,
        correction_window
    )
    return ~night_corrected_edges
def get_sunrise(daytime_mask, freq=None, data_alignment='L'):
    """
    Using the outputs of power_or_irradiance(), derive sunrise values for
    each day in the associated time series.

    Parameters
    ----------
    daytime_mask : Series
        Boolean series delineating night periods from day periods, where
        day is True and night is False.
    freq : str, optional
        A pandas freqstr specifying the expected timestamp spacing for
        the series. If None, the frequency will be inferred from the index.
        Only needed for center- and right-aligned data.
    data_alignment : String, default 'L'
        The data alignment of the series. Data alignment affects the
        value selected as sunrise. Options are 'L' (left-aligned),
        'R' (right-aligned), or 'C' (center-aligned).

    Returns
    -------
    Series
        Series of daily sunrise times, based on the daytime_mask series.
        This series has the same index as the passed daytime_mask series.
        Timestamps on dates without any daytime value are NaT.

    Raises
    ------
    ValueError
        If `data_alignment` is not 'L', 'R' or 'C', or if `freq` is
        required but was not given and cannot be inferred from the index.
    """
    # Reject an unknown alignment before doing any work.
    if data_alignment not in ('L', 'C', 'R'):
        raise ValueError("No valid data alignment given. Please pass 'L'"
                         " for left-aligned data, 'R' for right-aligned data,"
                         " or 'C' for center-aligned data.")
    # Get the first day period for each day
    day_times = daytime_mask.index[daytime_mask].to_series()
    sunrise_series = day_times.groupby(
        day_times.index.date).transform('first').reindex(
            daytime_mask.index)
    sunrise_series = sunrise_series.groupby(
        sunrise_series.index.date).ffill().bfill()
    # Backfilling and front filling fills all NaN's, so we set cases not in
    # the right day to NaT
    wrong_day = sunrise_series.index.date != sunrise_series.dt.date
    sunrise_series = sunrise_series.mask(wrong_day)
    # For left-aligned data, we want the first 'day' mask for
    # each day in the series; this will act as a proxy for sunrise.
    # No frequency is needed, so none is inferred.
    if data_alignment == 'L':
        return sunrise_series
    # If there's no frequency value, infer it from the daytime_mask series
    if not freq:
        freq = pd.infer_freq(daytime_mask.index)
        if freq is None:
            raise ValueError("Unable to infer the timestamp frequency from"
                             " the index. Please pass `freq` explicitly.")
    step = to_offset(freq)
    # For center-aligned data, we want the mid-point between the last night
    # mask and the first day mask, so subtract freq / 2.
    if data_alignment == 'C':
        return sunrise_series - (step / 2)
    # For right-aligned data, get the last nighttime mask datetime
    # before the first 'day' mask in the series, so subtract freq.
    return sunrise_series - step
def get_sunset(daytime_mask, freq=None, data_alignment='L'):
    """
    Using the outputs of power_or_irradiance(), derive sunset values for
    each day in the associated time series.

    Parameters
    ----------
    daytime_mask : Series
        Boolean series delineating night periods from day periods, where
        day is True and night is False.
    freq : str, optional
        A pandas freqstr specifying the expected timestamp spacing for
        the series. If None, the frequency will be inferred from the index.
        Only needed for left- and center-aligned data.
    data_alignment : String, default 'L'
        The data alignment of the series. Data alignment affects the
        value selected as sunset. Options are 'L' (left-aligned),
        'R' (right-aligned), or 'C' (center-aligned).

    Returns
    -------
    Series
        Series of daily sunset times, based on the daytime_mask series.
        This series has the same index as the passed daytime_mask series.
        Timestamps on dates without any daytime value are NaT.

    Raises
    ------
    ValueError
        If `data_alignment` is not 'L', 'R' or 'C', or if `freq` is
        required but was not given and cannot be inferred from the index.
    """
    # Reject an unknown alignment before doing any work.
    if data_alignment not in ('L', 'C', 'R'):
        raise ValueError("No valid data alignment given. Please pass 'L'"
                         " for left-aligned data, 'R' for right-aligned data,"
                         " or 'C' for center-aligned data.")
    # Get the last day period for each day
    day_times = daytime_mask.index[daytime_mask].to_series()
    sunset_series = day_times.groupby(
        day_times.index.date).transform('last').reindex(
            daytime_mask.index)
    sunset_series = sunset_series.groupby(
        sunset_series.index.date).ffill().bfill()
    # Backfilling and front filling fills all NaN's, so we set cases not in
    # the right day to NaT
    wrong_day = sunset_series.index.date != sunset_series.dt.date
    sunset_series = sunset_series.mask(wrong_day)
    # For right-aligned data, the last 'day' mask time stamp is sunset.
    # No frequency is needed, so none is inferred.
    if data_alignment == 'R':
        return sunset_series
    # If there's no frequency value, infer it from the daytime_mask series
    if not freq:
        freq = pd.infer_freq(daytime_mask.index)
        if freq is None:
            raise ValueError("Unable to infer the timestamp frequency from"
                             " the index. Please pass `freq` explicitly.")
    step = to_offset(freq)
    # For center-aligned data, sunset is the midpoint between the last day
    # mask and the first nighttime mask, so add freq / 2.
    if data_alignment == 'C':
        return sunset_series + (step / 2)
    # For left-aligned data, sunset is the first nighttime period
    # after the day mask, so add freq.
    return sunset_series + step