-
Notifications
You must be signed in to change notification settings - Fork 0
/
CIV516_Analysis.sql
293 lines (285 loc) · 9.81 KB
/
CIV516_Analysis.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
/* CIV516 PROJECT WORK */
/* FIND the minimum and maximum stop sequences by trip */
DECLARE target_pattern STRING DEFAULT '52F'; /* INPUT THE BRANCH TO ANALYZE */
/* CREATE A TABLE OF THE START AND ENDS OF AVL TRIPS */
/* PROCESS DESCRIPTION
1. Get the max and min stop sequences for each trip and day in the AVL data
2. Filter only these max and min records in the AVL data
3. Join the shape_dist_travelled fields to both types of records
4. LEAD on the min_ss to get the max_ss sdt and other parameters
5. Filter only min_ss
*/
DROP TABLE IF EXISTS ts2.route52_startends;
CREATE TABLE ts2.route52_startends AS
SELECT
*
FROM
(
SELECT
*,
LEAD( ActualTime_ArrivalStop, 1 ) OVER ( PARTITION BY Date_Key, TripId ORDER BY StopSequence ) AS final_arrival_time, --Grab the end stop
LEAD(
arrival_stop_sdt, 1 )
OVER(
PARTITION BY Date_Key, TripId ORDER BY StopSequence
) AS final_sdt,
LEAD(
StopSequence, 1 )
OVER(
PARTITION BY Date_Key, TripId ORDER BY StopSequence
) AS final_ss,
LEAD( ArrivalStop, 1 ) OVER ( PARTITION BY Date_Key, TripId ORDER BY StopSequence ) AS final_stop_name,
LEAD( ActualTime_ArrivalStop, 1 ) OVER ( PARTITION BY Date_Key, TripId ORDER BY StopSequence ) AS final_stop_arrival_time
FROM
(
SELECT
avl.*, st1.shape_dist_traveled AS departure_stop_sdt, st2.shape_dist_traveled AS arrival_stop_sdt
FROM
(
SELECT
avl.*, mm.max_ss, mm.min_ss
FROM
ts2.avl_data avl
INNER JOIN
(
SELECT
Date_Key, TripId, MAX( StopSequence ) AS max_ss, MIN( StopSequence ) AS min_ss
FROM
ts2.avl_data
WHERE
PatternName = target_pattern
GROUP BY
Date_Key, TripId
) mm
ON
avl.Date_Key = mm.Date_Key
AND
avl.TripId = mm.TripId
AND
(avl.StopSequence = mm.max_ss OR avl.StopSequence = mm.min_ss )
) avl
INNER JOIN
may2018_gtfs.stops s1
ON
CAST(avl.DepartureStopNumber AS STRING) = s1.stop_code
INNER JOIN
may2018_gtfs.stops s2
ON
CAST(avl.ArrivalStopNumber AS STRING) = s2.stop_code --this might run into issues
INNER JOIN
may2018_gtfs.stop_times st1
ON
s1.stop_id = CAST(st1.stop_id AS INT64)
AND
avl.TripId = CAST(st1.trip_id AS INT64)
INNER JOIN
may2018_gtfs.stop_times st2
ON
s2.stop_id = CAST(st2.stop_id AS INT64)
AND
avl.TripId = CAST(st2.trip_id AS INT64)
)
)
WHERE
StopSequence = min_ss
;
/* CREATE A TABLE OF OPERATING SPEEDS, TO SLICE AND DICE LATER */
DROP TABLE IF EXISTS ts2.oneway_line_operating_speeds;
CREATE TABLE ts2.oneway_line_operating_speeds AS
SELECT
*,
(final_sdt - departure_stop_sdt) * 1000 AS operating_dist_m,
TIME_DIFF(
EXTRACT( TIME FROM final_stop_arrival_time) , EXTRACT( TIME FROM ActualTime_DepartureStop) , SECOND
) AS operating_time_s,
CASE
WHEN TIME_DIFF( EXTRACT( TIME FROM final_stop_arrival_time), EXTRACT( TIME FROM ActualTime_DepartureStop), SECOND) = 0 THEN NULL
ELSE ( final_sdt - departure_stop_sdt) * 1000 / TIME_DIFF( EXTRACT( TIME FROM final_stop_arrival_time), EXTRACT( TIME FROM ActualTime_DepartureStop), SECOND) * 3.6
END AS operating_speed_kph
FROM
ts2.route52_startends
;
/* SLICE AND DICE ONE WAY LINE OPERATING SPEEDS */
/* 1.0 Overall Average Operating Speed */
SELECT
COUNT(*)
FROM
ts2.oneway_line_operating_speeds
WHERE
operating_speed_kph IS NOT NULL
AND
operating_speed_kph > 0
; --RESULT: 17.988
SELECT
avg( operating_speed_kph ) AS overall_avg_operating_speed_kph
FROM
ts2.oneway_line_operating_speeds
WHERE
operating_speed_kph IS NOT NULL
AND
operating_speed_kph > 0
; --RESULT: 17.988
SELECT * FROM ts2.oneway_line_operating_speeds LIMIT 5;
/* 2.0 Average operating speed by start and end */
SELECT
DepartureStop, final_stop_name, COUNT(*) AS sample_size, AVG( operating_dist_m ) AS avg_operating_dist_m , AVG( operating_speed_kph ) AS avg_operating_speed_kph , AVG( operating_time_s ) AS avg_operating_time_s
FROM
ts2.oneway_line_operating_speeds
GROUP BY
DepartureStop, final_stop_name
ORDER BY
AVG( operating_speed_kph )
;
/* 3.0 Stop to Stop Level Analysis */
DROP TABLE IF EXISTS ts2.stop_to_stop_stats;
CREATE TABLE ts2.stop_to_stop_stats AS
SELECT
*
FROM
(
SELECT
*,
CASE
WHEN raw_travel_time_s = 0 THEN NULL
ELSE sdt_diff_m / raw_travel_time_s * 3.6
END AS stop2stop_speed_kph
FROM
(
SELECT
avl.*, st1.shape_dist_traveled AS departure_sdt, st2.shape_dist_traveled AS arrival_sdt, ( st2.shape_dist_traveled - st1.shape_dist_traveled ) * 1000 AS sdt_diff_m,
/* CALCULATE TRAVEL TIME -- THINGS GET COMPLICATED HERE */
TIME_DIFF( EXTRACT( TIME FROM ActualTime_ArrivalStop ) , EXTRACT( TIME FROM ActualTime_DepartureStop ), SECOND ) AS raw_travel_time_s
FROM
ts2.avl_data avl
INNER JOIN
may2018_gtfs.stops s1
ON
CAST(avl.DepartureStopNumber AS STRING ) = s1.stop_code
INNER JOIN
may2018_gtfs.stops s2
ON
CAST( avl.ArrivalStopNumber AS STRING ) = s2.stop_code
INNER JOIN
may2018_gtfs.stop_times st1
ON
avl.TripId = CAST( st1.trip_id AS INT64 )
AND
s1.stop_id = CAST( st1.stop_id AS INT64 )
INNER JOIN
may2018_gtfs.stop_times st2
ON
avl.TripId = CAST( st2.trip_id AS INT64 )
AND
s2.stop_id = CAST( st2.stop_id AS INT64 )
)
)
WHERE
stop2stop_speed_kph IS NOT NULL
;
/* 4.0 On Time Performance */
-- Each row in the avl data represents a departure and arrival, which may adhere to the schedule. Let's summarise these overall.
--Apply a bin label based on the scheduled adherence.
SELECT
d_grades.departure_adherence_grade AS adherence_grade,
(d_grades.grade_count + a_grades.grade_count ) / (d_grades.count_total + a_grades.count_total ) * 100.0 AS total_grade_percent,
d_grades.grade_percent AS departure_only_perc ,
a_grades.grade_percent AS arrival_only_perc
FROM
(
SELECT
departure_adherence_grade, COUNT(*) AS grade_count, ( SELECT COUNT(*) FROM ts2.avl_data WHERE PatternName = target_pattern ) AS count_total, COUNT(*) / ( SELECT COUNT(*) FROM ts2.avl_data WHERE PatternName = target_pattern ) * 100 as grade_percent
FROM
(
SELECT
*,
CASE
WHEN ScheduleAdherence_DepartureStop_Seconds_ BETWEEN -60 AND 180 THEN 'GRADE A' -- 1 min early to 3 min late
WHEN ScheduleAdherence_DepartureStop_Seconds_ BETWEEN -120 AND 360 THEN 'GRADE B' -- 2 min early to 6 min late
WHEN ScheduleAdherence_DepartureStop_Seconds_ BETWEEN -240 AND 720 THEN 'GRADE C' -- 4 min early to 12 min late
ELSE 'GRADE D'
END AS departure_adherence_grade
FROM
ts2.avl_data
WHERE
PatternName = target_pattern
)
GROUP BY
departure_adherence_grade
) d_grades
INNER JOIN
(
SELECT
arrival_adherence_grade, COUNT(*) AS grade_count, ( SELECT COUNT(*) FROM ts2.avl_data WHERE PatternName = target_pattern ) AS count_total, COUNT(*) / ( SELECT COUNT(*) FROM ts2.avl_data WHERE PatternName = target_pattern ) * 100 as grade_percent
FROM
(
SELECT
*,
CASE
WHEN ScheduleAdherence_ArrivalStop_Seconds_ BETWEEN -60 AND 180 THEN 'GRADE A' -- 1 min early to 3 min late
WHEN ScheduleAdherence_ArrivalStop_Seconds_ BETWEEN -120 AND 360 THEN 'GRADE B' -- 2 min early to 6 min late
WHEN ScheduleAdherence_ArrivalStop_Seconds_ BETWEEN -240 AND 720 THEN 'GRADE C' -- 4 min early to 12 min late
ELSE 'GRADE D'
END AS arrival_adherence_grade
FROM
ts2.avl_data
WHERE
PatternName = target_pattern
)
GROUP BY
arrival_adherence_grade
) a_grades
ON
d_grades.departure_adherence_grade = a_grades.arrival_adherence_grade
;
/* MISC ----- 5.0 ADD DAY TYPE TO THE OPERATING SPEED */
DROP TABLE IF EXISTS ts2.oneway_line_operating_speeds_with_dt;
CREATE TABLE ts2.oneway_line_operating_speeds_with_dt AS
SELECT
*,
/* ADD DAY TYPE */
DATE(
CONCAT(
LEFT( CAST( Date_Key AS STRING) , 4 ),
'-',
SUBSTR( CAST( Date_Key AS STRING), 5, 2 ),
'-',
RIGHT( CAST( Date_Key AS STRING), 2 )
)
) AS Date_format_key,
CASE
WHEN
EXTRACT(DAYOFWEEK FROM
DATE(
CONCAT(
LEFT( CAST( Date_Key AS STRING) , 4 ),
'-',
SUBSTR( CAST( Date_Key AS STRING), 5, 2 ),
'-',
RIGHT( CAST( Date_Key AS STRING), 2 )
)
) ) = 1 THEN 'SUNDAY'
WHEN
EXTRACT(DAYOFWEEK FROM
DATE(
CONCAT(
LEFT( CAST( Date_Key AS STRING) , 4 ),
'-',
SUBSTR( CAST( Date_Key AS STRING), 5, 2 ),
'-',
RIGHT( CAST( Date_Key AS STRING), 2 )
)
) ) IN ( 2, 3, 4, 5, 6 ) THEN 'WEEKDAY'
WHEN
EXTRACT(DAYOFWEEK FROM
DATE(
CONCAT(
LEFT( CAST( Date_Key AS STRING) , 4 ),
'-',
SUBSTR( CAST( Date_Key AS STRING), 5, 2 ),
'-',
RIGHT( CAST( Date_Key AS STRING), 2 )
)
) ) = 7 THEN 'SATURDAY'
END AS day_type
FROM
ts2.oneway_line_operating_speeds