-
Notifications
You must be signed in to change notification settings - Fork 1
/
flight_delay.py
65 lines (45 loc) · 2.27 KB
/
flight_delay.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# -*- coding: utf-8 -*-
"""Flight Delay.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1aKpcvbX-OYX9F1Hu-LEz-mDr2PwYEg0H
"""
import pandas as pd
import numpy as np
from seaborn import jointplot, heatmap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
# Colab-only: mount Google Drive so that paths under drive/MyDrive resolve.
#from google.colab import drive
#drive.mount('/content/drive')

# low_memory=False makes pandas infer each column's dtype from the whole
# file instead of chunk by chunk, avoiding mixed-type columns.
flight = pd.read_csv('/content/flights.data', low_memory=False)
# Outside a notebook a bare expression is discarded, so print explicitly.
print(flight.head())

# --- Exploratory Data Analysis ---
# index=False keeps the row index out of the file. This write targets the
# same path that was just read; with the default index every run would add
# one more "Unnamed: 0" column to the source data.
flight.to_csv('/content/flights.data', sep=',', mode='w', index=False)
print(flight.describe())
flight.info()
# Scatter plot with marginal distributions: scheduled vs. actual arrival time.
jointplot(data=flight, x='SCHEDULED_ARRIVAL', y='ARRIVAL_TIME')

# --- Data Cleaning and Preprocessing ---
# Discard the columns that play no part in the delay model built below.
flight.drop(
    [
        'DAY', 'MONTH', 'YEAR', 'DAY_OF_WEEK', 'FLIGHT_NUMBER', 'TAIL_NUMBER',
        'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'AIR_TIME',
        'SCHEDULED_TIME', 'DISTANCE', 'DIVERTED', 'CANCELLED',
        'CANCELLATION_REASON',
    ],
    axis=1,
    inplace=True,
)
# Per-column count of missing values (printed: a bare expression is
# discarded when this runs as a script instead of a notebook cell).
print(flight.isna().sum())

# Keep only rows where every delay-cause column is populated ...
flight.dropna(
    subset=[
        'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
        'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY',
    ],
    inplace=True,
)
# ... then fill the remaining gaps with the column mean. numeric_only=True
# keeps this from raising on pandas >= 2.0 should a non-numeric column remain.
flight.fillna(flight.mean(numeric_only=True), inplace=True)

# Correlation heatmap over the numeric columns only (DataFrame.corr raises
# on non-numeric data in pandas >= 2.0).
plt.figure(figsize=(16, 8))
heatmap(flight.select_dtypes(include='number').corr(), annot=True)
plt.show()

# Choosing highly correlated values as model inputs; ARRIVAL_DELAY is the target.
feature_columns = [
    'DEPARTURE_DELAY', 'AIR_SYSTEM_DELAY', 'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY',
]
flight = flight[feature_columns + ['ARRIVAL_DELAY']]
x = flight[feature_columns]
y = flight['ARRIVAL_DELAY']
# 70/30 train/test split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# NOTE: the directory drive/MyDrive must already exist (e.g. a mounted
# Google Drive in Colab), otherwise this write raises OSError.
flight.to_csv('drive/MyDrive/flights-truncated.data', sep=',', mode='w')
# Model: degree-5 polynomial feature expansion feeding a linear regression.
lr = LinearRegression()
pf = PolynomialFeatures(degree=5)
pipe = Pipeline([('pf', pf), ('lr', lr)])
pipe.fit(X_train, y_train)

# scikit-learn metrics take (y_true, y_pred). MSE is symmetric, so the value
# is unchanged, but the documented order keeps this correct if the metric is
# ever swapped for an asymmetric one.
print(mean_squared_error(y_test, pipe.predict(X_test)))

# Spot-check a single hand-written sample. Column order:
# 'DEPARTURE_DELAY', 'AIR_SYSTEM_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'
# Passing a DataFrame with the training column names (rather than a bare
# ndarray) avoids scikit-learn's "X does not have valid feature names" warning.
sample = pd.DataFrame([[92, 0, 0, 85, 0]], columns=X_train.columns)
print(*pipe.predict(sample))

import pickle

# Persist the fitted pipeline; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open('flights.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)