-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3.3. Autoregressive Models
112 lines (83 loc) · 3.35 KB
/
3.3. Autoregressive Models
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import warnings
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from IPython.display import VimeoVideo
from pymongo import MongoClient
from sklearn.metrics import mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.ar_model import AutoReg
warnings.simplefilter(action="ignore", category=FutureWarning)
client = MongoClient(host="localhost", port=27017)
db = client["air-quality"]
nairobi = db["nairobi"]
def wrangle(collection):
results = collection.find(
{"metadata.site": 29, "metadata.measurement": "P2"},
projection={"P2": 1, "timestamp": 1, "_id": 0},
)
# Read data into DataFrame
df = pd.DataFrame(list(results)).set_index("timestamp")
# Localize timezone
df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")
# Remove outliers
df = df[df["P2"] < 500]
# Resample to 1hr window, do not convert to data frame, keep as a series
y = df["P2"].resample("1H").mean().fillna(method='ffill')
return y
y = wrangle(nairobi)
y.head()
fig, ax = plt.subplots(figsize=(15, 6)) #ACF (Auto Correlation Factor plot)
plot_acf(y, ax=ax)
plt.xlabel("Lag [hours]")
plt.ylabel("Correlation Coefficient");
fig, ax = plt.subplots(figsize=(15, 6))#PACF (Partial Auto Correlation Factor plot)
plot_pacf(y, ax=ax)
plt.xlabel("Lag [hours]")
plt.ylabel("Correlation Coefficient");
cutoff_test = int(len(y) * 0.95) #Splitting the data
y_train = y.iloc[:cutoff_test]
y_test = y.iloc[cutoff_test:]
y_train_mean = y_train.mean()
y_pred_baseline = [y_train_mean] * len(y_train)
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)
print("Mean P2 Reading:", round(y_train_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))
model = AutoReg(y_train, lags=26).fit() #Model initialization and training in one row
y_pred = model.predict().dropna()
training_mae = mean_absolute_error(y_train.iloc[26:], y_pred)
print("Training MAE:", training_mae)
y_train_resid = model.resid #getting the residuals
y_train_resid.tail()
fig, ax = plt.subplots(figsize=(15, 6))
y_train_resid.plot(ylabel="Residual Value", ax=ax); #time series plot of residuals
y_train_resid.hist() #histogram of residuals
plt.xlabel("Residual Values")
plt.ylabel("Frequency")
plt.title("AR(26), Distribution of Residuals");
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y_train_resid, ax=ax)
y_pred_test = model.predict(y_test.index.min(), y_test.index.max()) #Model evaluation for 1st and last observation
test_mae = mean_absolute_error(y_test, y_pred_test)
print("Test MAE:", test_mae)
df_pred_test = pd.DataFrame(
{"y_test": y_test, "y_pred": y_pred_test}, index=y_test.index
)
fig = px.line(df_pred_test, labels={"value": "P2"})
fig.show()
%%capture
y_pred_wfv = pd.Series()
history = y_train.copy()
for i in range(len(y_test)):
model =AutoReg(history, lags=26).fit()
next_pred = model.forecast()
y_pred_wfv = y_pred_wfv.append(next_pred)
history = history.append(y_test[next_pred.index])
test_mae = mean_absolute_error(y_test, y_pred_wfv)
print("Test MAE (walk forward validation):", round(test_mae, 2))
print(model.params) # AR model coefficients
df_pred_test = pd.DataFrame( # preparing the df for plot
{"y_test": y_test, "y_pred_wfv": y_pred_wfv}, index=y_test.index
)
fig = px.line(df_pred_test, labels={"value": "PM2.5"}) #plotting the results
fig.show()