# -*- coding: utf-8 -*-
"""Road-Friction-Forecasting.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1W15eOQbeHp9wbJWRaE0f7ZfYv_jAj14O
# Authors:
**Md. Abrar Jahin**
* LinkedIn: https://www.linkedin.com/in/md-abrar-jahin-9a026018b
* Facebook: https://www.facebook.com/
* Github: https://github.com/Abrar2652
* email: [email protected]
**Andrii Krutsylo**
* Website: https://krutsylo.neocities.org
* email: [email protected]
# Import Libraries and Packages
After importing libraries and packages, we start by defining a function `transform_to_supervised` that creates the desired **lag** *(24 hours in this case)* and **forecast** *(1 hour)* features of our independent variables, concatenates them with the dataframe, and returns the final dataframe.
"""
import os
import optuna
import pickle
import pandas as pd
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.impute import KNNImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier, XGBRegressor
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error, accuracy_score, balanced_accuracy_score
import numpy as np
def transform_to_supervised(df,
                            previous_steps=1,
                            forecast_steps=1,
                            dropnan=False):
    """
    https://gist.github.com/monocongo/6e0df19c9dd845f3f465a9a6ccfcef37
    Transforms a DataFrame containing time series data into a DataFrame
    containing data suitable for use as a supervised learning problem.
    Derived from code originally found at
    https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/
    :param df: pandas DataFrame object containing columns of time series values
    :param previous_steps: the number of previous steps that will be included in the
                           output DataFrame corresponding to each input column
    :param forecast_steps: the number of forecast steps that will be included in the
                           output DataFrame corresponding to each input column
    :return: pandas DataFrame containing the original columns, renamed <orig_name>(t),
             as well as columns for previous steps, <orig_name>(t-1) ... <orig_name>(t-n),
             and columns for forecast steps, <orig_name>(t+1) ... <orig_name>(t+n)
    """
    # original column names
    col_names = df.columns
    # list of columns and corresponding names we'll build from
    # the originals found in the input DataFrame
    cols, names = list(), list()
    # input sequence (t-n, ... t-1): lag features
    for i in range(previous_steps, 0, -1):
        cols.append(df.shift(i))
        names += [('%s(t-%d)' % (col_name, i)) for col_name in col_names]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, forecast_steps):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('%s(t)' % col_name) for col_name in col_names]
        else:
            names += [('%s(t+%d)' % (col_name, i)) for col_name in col_names]
    # put all the columns together into a single aggregated DataFrame
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg
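"""As a quick sanity check, here is `transform_to_supervised` applied to a toy frame (a minimal sketch; the column name `x` is made up for illustration):
"""
toy = pd.DataFrame({'x': range(5)})
# expected columns: x(t-2), x(t-1), x(t); the first two rows are dropped as NaN
print(transform_to_supervised(toy, previous_steps=2, forecast_steps=1, dropnan=True))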
"""# Data Collection and Import data
The dataset has been collected from the **Smart Road - Winter Road Maintenance Challenge 2021** organized by *UiT The Arctic University of Norway* on Devpost.
Dataset download link: https://uitno.app.box.com/s/bch09z27weq0wpcv8dbbc18sxz6cycjt
After downloading the `smart_road_measurements.csv` file from the competition page, wehad added extra columns collecting data from the external resources authorized the organizers. The links of the external datasets are:
[1] Weather data https://pypi.org/project/wwo-hist/
[2] UV Index data https://pyowm.readthedocs.io/en/latest/v3/uv-api-usage-examples.html
After merging these 3 files together based on the same dates, we finalized our main dataset `smart_road_measurements_new_d_weather.csv` on top of which we will build our model after preprocessing.
"""
df = pd.read_csv("/content/smart_road_measurements_new_d_weather.csv", header=0)
df2 = df.copy()
df.head(15)
"""# Exploratory Data Analysis
Our dataset contains 349613 rows and 29 columns
"""
df.shape
df.info()
np.random.seed(0)
import seaborn as sns
sns.set_theme()
_ = sns.heatmap(df2.iloc[:,2:11].corr())
_ = sns.heatmap(df2.corr())
"""We want to predict Friction of the road by weather conditions. So,
this is a classification task. Every day the car drives on a new route.
This means that all 11 days we receive data on new road sections. So, the
only link between the road sections is the average weather conditions.
This can be achieved by filtering the rows on **Microsoft Excel** for each date and get the total distance covered (the last row on each date because the column is cumulative in nature)
**Max distance traveled per date**

| Max distance traveled | Date |
| ---: | --- |
| 42441 | 16/02/2021 |
| 92311 | 17/02/2021 |
| 150216 | 18/02/2021 |
| 39007 | 19/02/2021 |
| 71358 | 22/02/2021 |
| 81999 | 23/02/2021 |
| 55958 | 24/02/2021 |
| 77315 | 25/02/2021 |
| 55647 | 26/02/2021 |
| 61534 | 1/03/2021 |
| 12409 | 2/03/2021 |
**The table above confirms that the car drove a different route on each of the 11 days.**
* We drop `Distance` because the condition of the road does not depend on how far the car has traveled before; we only use this column to derive the speed and slope of the road.
* This means we work with normalized data plus lag features (time-series classification with engineered features instead of time-series classification with deep learning, because we have shallow data). We won't focus on any complicated models, just an XGBClassifier.
* Now we need to define at what friction the road is dangerous (label 0), requires caution (label 1), or is safe (label 2); in the end we settle on two classes, as explained below.
Ta, Tsurf, and Friction are **highly correlated**, as shown in our pandas profiling of the smart road dataset:
https://krutsylo.neocities.org/SmartRoads/pandas3.html
Still, we drop the State, Height, Distance, Ta, Tsurf, Water, moon_illumination, and uvIndex columns.
"""
df = df.drop("Height", axis=1) # contain N/A
df = df.drop("Distance", axis=1)
df = df.drop("State", axis=1)
df = df.drop("Ta", axis=1)
df = df.drop("Tsurf", axis=1)
df = df.drop("Water", axis=1)
df = df.drop("moon_illumination", axis=1)
df = df.drop("uvIndex", axis=1)
df.head()
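"""The per-date maximum distances quoted above can also be reproduced directly in pandas from the untouched copy `df2`, instead of filtering in Excel (a small sketch):
"""
# last (maximum) cumulative Distance per date
print(df2.groupby("Date")["Distance"].max())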
""" We have grouped the data by calculating the mean of the rows in each hour based on the individual dates. For instance, if there are 8 rows for each hour, we calculated the mean of 8 rows and thus converted into a single row belonging to the distinct dates.
We also avoided duplicates to reduce the noise in the data.
"""
df['Time(+01:00)'] = pd.to_datetime(df['Time(+01:00)'], format='%H:%M:%S').dt.hour
df = df.groupby(['Date','Time(+01:00)']).mean()
df = df.drop_duplicates()
df.head()
"""Now we will work on the target feature that is `Friction` column to accomplish our objective since we want to perform a supervised machine learning model. Here we applied our knowledge of physics and research capabilities.
Icy: These roads typically have the lowest coefficient of friction. For drivers, this is the most dangerous surface to be on. The small coefficient of friction gives the driver the least amount of traction when accelerating, braking, or turning (which has angular acceleration). Icy roads have a frictional coefficient of around 0.1.
Wet: Roads wet with water have a coefficient of friction of around .4. This is around 4 times higher than an icy road. Although these roads are much safer to drive on, there is still the possibility of hydroplaning. Hydroplaning occurs when there is standing or flowing water on the road (typically from rainfall) that causes a tire to lose contact with the road's surface. The treads are designed to allow water to fill the crevices so that contact may be maintained between the road and the tire. However, if there is too much water, this may not be achieved, and hydroplaning will occur. This is precisely the reason that racing slicks have such a high coefficient of friction on dry roads (about .9) and a much lower coefficient on wet roads (as low as .1).
Dry: Roads without precipitation are considered optimal for driving conditions. They have the highest coefficient of friction, around 0.9, which creates the most traction. This allows corners, acceleration, and braking to reach higher values without loss of control. Oftentimes, if roads are not dry, races will be canceled due to the extreme dangers that a less than optimal frictional surface can pose.
So, we'll take (0 <= friction < 0.5) as *dangerous*, and (0.5 < friction <= 1) as *safe*
"""
# pd.cut uses right-closed bins by default: (0, 0.5] -> 0 (dangerous), (0.5, 1] -> 1 (safe)
bins = [0, 0.5, 1]
labels = [0, 1]
df["Friction"] = pd.cut(df["Friction"], bins, labels=labels)
#df = df.drop("Date", axis=1)
#df = df.drop("Time(+01:00)", axis=1)
df.head()
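"""Since we evaluate with balanced accuracy later, it is worth a quick check of how the two classes are distributed (a small sketch):
"""
print(df["Friction"].value_counts(dropna=False))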
"""Now we'll perform lagging and forecasting feature columns by shifting simply using our pre-defined `transform_to_supervise` function."""
df = transform_to_supervised(df, previous_steps=24, forecast_steps=1, dropnan=True)
Y = df.loc[:, "Friction(t)"].to_numpy()
cols = [c for c in df.columns if '(t)' not in c]  # keep only the lag feature columns
data = df[cols].copy()  # copy to avoid SettingWithCopyWarning
data['Friction'] = Y  # re-attach the target as the last column
data.to_csv('/content/test.csv')
data = data.values.tolist()
df[cols].head()
"""**Lag of 1 to 3 days**"""
lag = pd.read_csv('/content/lag(1-3)days.csv')
lag=lag.head(10)
lag
ax = lag.plot(x="Date", y="humidity(t-3)", kind="bar")
lag.plot(x="Date", y="humidity(t-2)", kind="bar", ax=ax, color="C2")
lag.plot(x="Date", y="humidity(t-1)", kind="bar", ax=ax, color="C3")
ax = lag.plot(x="Date", y="windspeedKmph(t-3)", kind="bar")
lag.plot(x="Date", y="windspeedKmph(t-2)", kind="bar", ax=ax, color="C2")
lag.plot(x="Date", y="windspeedKmph(t-1)", kind="bar", ax=ax, color="C3")
"""# Statistical Analysis
**Mean values of each column**
"""
mean = pd.read_csv('/content/Mean.csv')
mean.head()
ax = mean.plot(x="Date", y="windspeedKmph", kind="bar")
mean.plot(x="Date", y="DewPointC", kind="bar", ax=ax, color="C2")
mean.plot(x="Date", y="tempC", kind="bar", ax=ax, color="C3")
#mean.plot(x="Date", y="HeatIndexC", kind="bar", ax=ax, color="C4")
#mean.plot(x="Date", y="humidity", kind="bar", ax=ax, color="C4")
#mean.plot(x="Date", y="pressure", kind="bar", ax=ax, color="C5")
plt.show()
mean.columns
mean.plot(x="Date", y=['tempC'], kind="bar")
mean.plot(x="Date", y=['windspeedKmph'], kind="bar")
mean.plot(x="Date", y=['humidity'], kind="bar")
"""**Standard Deviations of each column**"""
stdev=pd.read_csv('/content/Stdev.csv')
stdev
stdev.plot(x="Date", y=['humidity'], kind="bar")
"""**Minimum values of each column**"""
pd.read_csv('/content/Min.csv')
"""**Maximum values of each column**"""
pd.read_csv('/content/Max.csv')
"""**Median values of each column**"""
med = pd.read_csv('/content/Median.csv')
med
ax = med.plot(x="Date", y="windspeedKmph", kind="bar")
med.plot(x="Date", y="DewPointC", kind="bar", ax=ax, color="C2")
med.plot(x="Date", y="tempC", kind="bar", ax=ax, color="C3")
#mean.plot(x="Date", y="HeatIndexC", kind="bar", ax=ax, color="C4")
#med.plot(x="Date", y="humidity", kind="bar", ax=ax, color="C4")
#med.plot(x="Date", y="pressure", kind="bar", ax=ax, color="C5")
"""**Quartile-1 values of each column**
25% of the data are less than these values
"""
pd.read_csv('/content/Q1.csv')
"""**Quartile-3 values of each column**
75% of the data are less than these values
"""
pd.read_csv('/content/Q3.csv')
"""**Kurtosis values of each column**
"""
krt = pd.read_csv('/content/Kurtosis.csv')
krt
ax = krt.plot(x="Date", y="windspeedKmph", kind="bar")
krt.plot(x="Date", y="DewPointC", kind="bar", ax=ax, color="C2")
krt.plot(x="Date", y="tempC", kind="bar", ax=ax, color="C3")
krt.plot(x="Date", y="humidity", kind="bar", ax=ax, color="C4")
krt.plot(x="Date", y="pressure", kind="bar", ax=ax, color="C5")
plt.show()
"""**Skewness values of each column**
"""
skw=pd.read_csv('/content/Skewness.csv')
skw
ax = skw.plot(x="Date", y="windspeedKmph", kind="bar")
skw.plot(x="Date", y="DewPointC", kind="bar", ax=ax, color="C2")
skw.plot(x="Date", y="tempC", kind="bar", ax=ax, color="C3")
skw.plot(x="Date", y="humidity", kind="bar", ax=ax, color="C4")
skw.plot(x="Date", y="pressure", kind="bar", ax=ax, color="C5")
plt.show()
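"""The summary statistics above were loaded from precomputed CSV files. An equivalent set can be derived directly from the raw data with pandas; below is a sketch using the untouched copy `df2` (the exact columns in the CSVs may differ):
"""
grouped = df2.select_dtypes("number").groupby(df2["Date"])
stats_by_date = grouped.agg(["mean", "std", "min", "max", "median", "skew"])
# kurtosis has no string aggregation name, so compute it per group
kurtosis_by_date = grouped.apply(lambda g: g.kurtosis())
print(stats_by_date.head())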
"""# Model Building, Training and Prediction"""
# split a dataset into train/test sets (the last n_test rows become the test set)
def train_test_split(data, n_test):
    data = np.array(data)
    return data[:-n_test, :], data[-n_test:, :]

# walk-forward validation
def walk_forward_validation(params, data, n_test):
    predictions = list()
    # split dataset
    train, test = train_test_split(data, n_test)
    # seed history with the training dataset
    history = [x for x in train]
    # step over each time step in the test set
    for i in range(len(test)):
        # split test row into input and output columns
        testX, testy = test[i, :-1], test[i, -1]
        # fit model on history and make a prediction
        yhat = xgboost_forecast(params, history, testX)
        # store forecast in the list of predictions
        predictions.append(yhat)
        # add the actual observation to history for the next loop
        history.append(test[i])
        # summarize progress
        print('>expected=%.1f, predicted=%.1f' % (testy, yhat))
    # compute balanced accuracy on the test set (higher is better)
    score = balanced_accuracy_score(test[:, -1], predictions)
    return score, test[:, -1], predictions

# fit an xgboost model and make a one-step prediction
def xgboost_forecast(params, train, testX):
    # transform list into array
    train = np.array(train)
    # split into input and output columns
    trainX, trainy = train[:, :-1], train[:, -1]
    # fit model
    model = XGBClassifier(**params)
    model.fit(trainX, trainy)
    # make a one-step prediction
    yhat = model.predict(np.array([testX]))
    return yhat[0]
def objective(trial: Trial, data) -> float:
    params = {
        "booster": "gbtree",
        # "tree_method": "gpu_hist",
        "n_estimators": trial.suggest_int("n_estimators", 1, 1000),  # at least 1 tree
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 0, 5),
        "gamma": trial.suggest_int("gamma", 0, 5),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.5),
        "colsample_bytree": trial.suggest_discrete_uniform(
            "colsample_bytree", 0.1, 1, 0.01
        ),
        "nthread": -1,
        "use_label_encoder": False,
        "eval_metric": "logloss"
    }
    # walk-forward validation returns balanced accuracy, which the study maximizes
    score, y, yhat = walk_forward_validation(params, data, 20)
    return score
if not os.path.exists('output'):
    os.makedirs('output')

study = optuna.create_study(
    direction="maximize",
    sampler=TPESampler(seed=1337),
    study_name="res",
    storage="sqlite:///output/res.db",
    load_if_exists=True
)
study.optimize(
    lambda trial: objective(trial, data),
    n_trials=50,
    show_progress_bar=True
)
df = study.trials_dataframe(attrs=("number", "value", "params", "state"))
df.to_csv("/content/res.csv", sep="\t")
print(study.best_trial)
"""# Model Evaluation
Our model's Weighted accuracy is 0.7666666666666666 or **~76.67%**.
Now our model can forecast the friction and will label it as **'0' if dangerous** and **'1' if safe** for driving.
"""