Skip to content

Commit

Permalink
Merge pull request #941 from cal-itp/find-stop-arrival-errors
Browse files Browse the repository at this point in the history
Find stop arrival errors
  • Loading branch information
tiffanychu90 authored Nov 1, 2023
2 parents 30eddcd + 368bb03 commit 64afd3b
Show file tree
Hide file tree
Showing 6 changed files with 206 additions and 145 deletions.
205 changes: 116 additions & 89 deletions rt_segment_speeds/25_interpolation_issues.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"outputs": [],
"source": [
"import dask.dataframe as dd\n",
"import geopandas as gpd\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
Expand All @@ -41,7 +42,7 @@
"id": "c5f369bb-68bf-46a2-86ad-6279872859b1",
"metadata": {},
"source": [
"## Between stops, how to find stops behaving not as expected\n",
"## Between stops, arrival times not behaving as expected\n",
"There are erroneous calculations here.\n",
"\n",
"Prior arrival time can't take place **after** arrival time. \n",
Expand Down Expand Up @@ -69,188 +70,214 @@
{
"cell_type": "code",
"execution_count": null,
"id": "ccf433cf-69e7-476c-a64a-8c999a53858b",
"id": "d26bb970-8d32-4036-b5f1-8852e5ed4eda",
"metadata": {},
"outputs": [],
"source": [
"stop_arrivals = pd.read_parquet(\n",
" f\"{SEGMENT_GCS}{STOP_ARRIVALS}.parquet\",\n",
" columns = [\"trip_instance_key\", \"stop_sequence\", \"arrival_time\"]\n",
")"
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3077ade-87c1-4b9d-8cf7-bbb743a03823",
"id": "8556a0d8-3f05-4726-9fb2-5dd8864fe751",
"metadata": {},
"outputs": [],
"source": [
"df.error_arrival_order.value_counts()"
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d0373a4-80fc-49e1-bac3-2edd8c5ae4d0",
"id": "4e1001f7-32db-427e-859a-9987e499c327",
"metadata": {},
"outputs": [],
"source": [
"df.error_same_endpoints.value_counts()"
"pd.crosstab(df.nearest_vp_idx_monotonic, \n",
" df.stop_meters_monotonic)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11d2a032-83db-43b7-a7fb-9254a10ae524",
"id": "53219886-827b-44e0-a764-45970bf194d0",
"metadata": {},
"outputs": [],
"source": [
"df[(df.error_same_endpoints==1) & \n",
" (df.error_arrival_order==1)].shape"
"pd.crosstab(df.nearest_vp_idx_monotonic, \n",
" df.stop_meters_monotonic, normalize=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2ea55941-8d10-49f2-8765-6b2faba4080d",
"id": "7578edfa-e7a1-4607-8fa5-1d086efdef1c",
"metadata": {},
"outputs": [],
"source": [
"trip_stats = (df.groupby(\"trip_instance_key\", \n",
" observed=True, group_keys=False)\n",
" .agg({\n",
" \"error_same_endpoints\": \"mean\",\n",
" \"error_arrival_order\": \"mean\"\n",
" }).reset_index()\n",
" )"
"# Case 1: this is the largest group of errors, and \n",
"# should be easier to fix"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "83fde1e4-29b1-43ab-b30c-f98ec63a87c8",
"id": "b923bfdf-df03-49cf-94bf-f5ee2270714d",
"metadata": {},
"outputs": [],
"source": [
"# Very few trips are completely error-free\n",
"trip_stats[(trip_stats.error_same_endpoints==0) & \n",
" (trip_stats.error_arrival_order==0)].shape"
"df[(df.nearest_vp_idx_monotonic==False) &\n",
" (df.stop_meters_monotonic==True)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1bd9d1b1-48f0-46e1-9875-181ea20df66a",
"id": "689e546b-7df6-49e3-bbcc-7594a41fd32e",
"metadata": {},
"outputs": [],
"source": [
"#trip_stats.sample(10).trip_instance_key.unique()\n",
"subset_trip_keys = [\n",
" '9fad69264acd8387150f45b27d4b2d09',\n",
" '44a55d2fa2588a479065ef7702475ef1',\n",
" '36070a2428e62b96368d072eb2a8fc1b',\n",
" '7f665900c6b0879f4b9bda43b93fefe3',\n",
" '8e8ba9993d52388539d06a46710c1dbc',\n",
" 'b301c2170c1ca49bbc1a9b600cccf643',\n",
" '9373f5b0de977a718dea50fd90443619',\n",
" '8415b3949147c9dc3d5ceb37863440b1',\n",
" '984f598419c1d0830ef4618d495c1bd7',\n",
" '815e4dd921cdcb61ad2dbb1ca5f08a39'\n",
"]"
"df[df.stop_meters_monotonic==False][[\n",
" \"stop_sequence\", \n",
" \"nearest_vp_idx\",\n",
" \"rolling_nearest_vp_idx\", \"nearest_vp_idx_monotonic\", \n",
" \"stop_meters\", \"rolling_stop_meters\", \n",
" \"stop_meters_monotonic\"\n",
"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "212eaa5d-735c-4332-b25d-e1883ee48f15",
"id": "ccf433cf-69e7-476c-a64a-8c999a53858b",
"metadata": {},
"outputs": [],
"source": [
"def check_if_surrounding_points_are_ok(df: pd.DataFrame):\n",
" grouped_df = df.groupby(\"trip_instance_key\", \n",
" observed=True, group_keys=False\n",
" )\n",
" df = df.assign(\n",
" prior_error = (grouped_df\n",
" .error_arrival_order\n",
" .shift(1)\n",
" ),\n",
" subseq_error = (grouped_df\n",
" .error_arrival_order\n",
" .shift(-1)\n",
" )\n",
" )\n",
" \n",
" df = df.assign(\n",
" can_be_fixed = df.apply(\n",
" lambda x:\n",
" 1 if (x.error_arrival_order==1) and\n",
" (x.prior_error==0) and (x.subseq_error==0)\n",
" else 0, axis=1\n",
" )\n",
" )\n",
"# How to use stop arrivals to constrain the wrong arrival times that occur\n",
"# in the middle of the trip?\n",
"stop_arrivals = pd.read_parquet(\n",
" f\"{SEGMENT_GCS}{STOP_ARRIVALS}.parquet\",\n",
" columns = [\"trip_instance_key\", \"stop_sequence\", \"arrival_time\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7f80f709-1a58-4608-af7e-4295ad647bdb",
"metadata": {},
"outputs": [],
"source": [
"# Flag, per trip, whether arrival_time never decreases as stop_sequence increases.\n",
"# The monotonic check is row-order dependent and this notebook does not otherwise\n",
"# establish the row order of stop_arrivals, so sort by stop_sequence within each\n",
"# trip before grouping (groupby keeps the within-group row order).\n",
"# The resulting boolean column keeps the name `arrival_time`.\n",
"trips_monotonicity = (stop_arrivals\n",
"                      .sort_values([\"trip_instance_key\", \"stop_sequence\"])\n",
"                      .groupby(\"trip_instance_key\")\n",
"                      .arrival_time\n",
"                      .is_monotonic_increasing\n",
"                     ).to_frame().reset_index()\n",
"\n",
" return df\n",
" "
"trips_monotonicity"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5a219a60-4ea2-45b0-9fa9-3a11f326b8a0",
"id": "3d6adbc5-3959-448f-ae35-c1fca40848c7",
"metadata": {},
"outputs": [],
"source": [
"df2 = pd.merge(\n",
" df,\n",
" stop_arrivals,\n",
" on = [\"trip_instance_key\", \"stop_sequence\"],\n",
" how = \"inner\"\n",
")"
"trips_monotonicity.arrival_time.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c5b1f434-cb45-4425-aa8f-7a85c87d3e8d",
"id": "3d8da9cc-957f-4d4f-8e15-b84f1b9f70b1",
"metadata": {},
"outputs": [],
"source": [
"df3 = check_if_surrounding_points_are_ok(df2)"
"# Sample up to 25 trips whose arrival times are not monotonically increasing.\n",
"# A fixed random_state keeps the sampled trips (and any positional lookup into\n",
"# fail_trips) stable across reruns; capping n avoids a ValueError when fewer\n",
"# than 25 trips fail the monotonicity check.\n",
"non_monotonic_trips = trips_monotonicity[\n",
"    trips_monotonicity.arrival_time==False\n",
"]\n",
"fail_trips = non_monotonic_trips.sample(\n",
"    n=min(25, len(non_monotonic_trips)), random_state=0\n",
").trip_instance_key.unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c47138d-9129-43ff-b73a-4c494f5be58a",
"id": "8f96f7eb-1083-4eb6-a76d-70debce26884",
"metadata": {},
"outputs": [],
"source": [
"df3[df3.error_arrival_order==1].shape"
"stop_arrivals[stop_arrivals.trip_instance_key==fail_trips[7]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d913f05d-2df8-4a92-bc6c-3dd3d2e78a37",
"id": "b06b3f04-73eb-4eb3-b8fe-9ae49c8a1c3a",
"metadata": {},
"outputs": [],
"source": [
"df3[(df3.error_arrival_order==1) & \n",
" (df3.prior_error==0) & \n",
" (df3.subseq_error==0)\n",
" ].shape"
"import altair as alt\n",
"\n",
"def plot_stop_arrivals(df, one_trip):\n",
"    \"\"\"\n",
"    Display an interactive line chart of arrival_time (temporal axis)\n",
"    against stop_sequence for the rows of df whose trip_instance_key\n",
"    equals one_trip. The chart is titled with one_trip and shown via\n",
"    display(); nothing is returned.\n",
"    \"\"\"\n",
"    trip_rows = df[df.trip_instance_key==one_trip]\n",
"\n",
"    line = alt.Chart(trip_rows).mark_line()\n",
"    encoded = line.encode(\n",
"        x=\"stop_sequence\",\n",
"        y=\"arrival_time:T\",\n",
"        tooltip=[\"stop_sequence\", \"arrival_time\"]\n",
"    )\n",
"    chart = encoded.properties(title=one_trip).interactive()\n",
"\n",
"    display(chart)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "58ecf1aa-8833-44f4-a286-e5c87e67b440",
"metadata": {},
"outputs": [],
"source": [
"for t in fail_trips:\n",
" plot_stop_arrivals(stop_arrivals, t)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1bd9d1b1-48f0-46e1-9875-181ea20df66a",
"metadata": {},
"outputs": [],
"source": [
"#trip_stats.sample(10).trip_instance_key.unique()\n",
"subset_trip_keys = [\n",
" '9fad69264acd8387150f45b27d4b2d09',\n",
" '44a55d2fa2588a479065ef7702475ef1',\n",
" '36070a2428e62b96368d072eb2a8fc1b',\n",
" '7f665900c6b0879f4b9bda43b93fefe3',\n",
" '8e8ba9993d52388539d06a46710c1dbc',\n",
" 'b301c2170c1ca49bbc1a9b600cccf643',\n",
" '9373f5b0de977a718dea50fd90443619',\n",
" '8415b3949147c9dc3d5ceb37863440b1',\n",
" '984f598419c1d0830ef4618d495c1bd7',\n",
" '815e4dd921cdcb61ad2dbb1ca5f08a39'\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e979265a-780d-496e-b3b0-195cc5058d2b",
"id": "5a219a60-4ea2-45b0-9fa9-3a11f326b8a0",
"metadata": {},
"outputs": [],
"source": [
"df3[df3.can_be_fixed==1].trip_instance_key.unique()[:5]"
"df2 = pd.merge(\n",
" df,\n",
" stop_arrivals,\n",
" on = [\"trip_instance_key\", \"stop_sequence\"],\n",
" how = \"inner\"\n",
")"
]
},
{
Expand All @@ -260,7 +287,7 @@
"metadata": {},
"outputs": [],
"source": [
"df3[df3.trip_instance_key==\"00019686e6c7bf335148c8d290feb285\"]"
"df2[df2.trip_instance_key==\"00019686e6c7bf335148c8d290feb285\"]"
]
},
{
Expand All @@ -270,7 +297,7 @@
"metadata": {},
"outputs": [],
"source": [
"df3[df3.trip_instance_key==\"0001ad7e1ef246cf6d68599de0fdcaad\"\n",
"df2[df2.trip_instance_key==\"0001ad7e1ef246cf6d68599de0fdcaad\"\n",
" ].tail(10)"
]
},
Expand Down Expand Up @@ -309,16 +336,16 @@
" .mark_line()\n",
" .encode(\n",
" x=\"stop_sequence\",\n",
" y=\"error_arrival_order\"\n",
" y=\"nearest_vp_idx_monotonic\"\n",
" ).properties(title=f\"{t}\")\n",
" )\n",
" display(chart)\n",
" \n",
" chart2 = (alt.Chart(subset_df[subset_df.error_arrival_order == 0])\n",
" chart2 = (alt.Chart(subset_df[subset_df.nearest_vp_idx_monotonic == True])\n",
" .mark_line()\n",
" .encode(\n",
" x=\"stop_sequence\",\n",
" y=\"error_same_endpoints\"\n",
" y=\"stop_meters_monotonic\"\n",
" )\n",
" )\n",
" display(chart2)"
Expand Down Expand Up @@ -696,7 +723,7 @@
"outputs": [],
"source": [
"df = pd.read_parquet(\n",
" f\"{SEGMENT_GCS}stop_arrivals_speed_{analysis_date}_2.parquet\")"
" f\"{SEGMENT_GCS}speed_stop_segments_{analysis_date}.parquet\")"
]
},
{
Expand Down
8 changes: 4 additions & 4 deletions rt_segment_speeds/logs/interpolate_stop_arrival.log
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
2023-10-31 12:12:52.626 | INFO | __main__:<module>:99 - Analysis date: 2023-09-13
2023-10-31 12:14:03.894 | INFO | __main__:<module>:134 - set up df with nearest / subseq vp info: 0:01:11.267039
2023-10-31 12:14:57.365 | INFO | __main__:<module>:139 - interpolate stop arrival: 0:00:53.471494
2023-10-31 12:15:05.266 | INFO | __main__:<module>:145 - execution time: 0:02:12.638916
2023-10-31 18:10:00.239 | INFO | __main__:<module>:99 - Analysis date: 2023-09-13
2023-10-31 18:11:18.958 | INFO | __main__:<module>:134 - set up df with nearest / subseq vp info: 0:01:18.690602
2023-10-31 18:12:06.833 | INFO | __main__:<module>:139 - interpolate stop arrival: 0:00:47.874819
2023-10-31 18:12:14.756 | INFO | __main__:<module>:145 - execution time: 0:02:14.488207
15 changes: 7 additions & 8 deletions rt_segment_speeds/logs/nearest_vp.log
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
2023-10-31 at 09:34:59 | INFO | Analysis date: 2023-09-13
2023-10-31 09:39:45.702 | INFO | __main__:<module>:261 - map partitions to transform vp: 0:04:46.103748
2023-10-31 at 09:39:45 | INFO | map partitions to transform vp: 0:04:46.103748
2023-10-31 09:39:46.981 | INFO | __main__:<module>:293 - map partitions to find nearest vp to stop: 0:00:01.279908
2023-10-31 at 09:39:46 | INFO | map partitions to find nearest vp to stop: 0:00:01.2799082023-10-31 09:46:23.878 | INFO | __main__:<module>:316 - Analysis date: 2023-09-13
2023-10-31 09:51:11.125 | INFO | __main__:find_nearest_vp_to_stop:261 - map partitions to transform vp: 0:04:47.246718
2023-10-31 09:51:11.894 | INFO | __main__:find_nearest_vp_to_stop:293 - map partitions to find nearest vp to stop: 0:00:00.768417
2023-10-31 09:57:34.934 | INFO | __main__:<module>:323 - execution time: 0:11:11.055258
2023-10-31 17:45:52.135 | INFO | __main__:<module>:332 - Analysis date: 2023-09-13
2023-10-31 17:51:23.974 | INFO | __main__:find_nearest_vp_to_stop:277 - map partitions to transform vp: 0:05:31.838490
2023-10-31 17:51:25.093 | INFO | __main__:find_nearest_vp_to_stop:309 - map partitions to find nearest vp to stop: 0:00:01.118975
2023-10-31 17:57:10.858 | INFO | __main__:<module>:337 - Analysis date: 2023-09-13
2023-10-31 18:03:30.506 | INFO | __main__:find_nearest_vp_to_stop:282 - map partitions to transform vp: 0:06:19.646465
2023-10-31 18:03:31.676 | INFO | __main__:find_nearest_vp_to_stop:314 - map partitions to find nearest vp to stop: 0:00:01.170538
2023-10-31 18:08:58.296 | INFO | __main__:<module>:344 - execution time: 0:11:47.436826
2 changes: 2 additions & 0 deletions rt_segment_speeds/logs/speeds_by_segment_trip.log
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@
2023-10-17 18:34:34.838 | INFO | __main__:<module>:378 - execution time: 0:10:16.928330
2023-10-31 12:29:06.200 | INFO | __main__:<module>:23 - Analysis date: 2023-09-13
2023-10-31 12:29:29.129 | INFO | __main__:<module>:69 - execution time: 0:00:22.926565
2023-10-31 18:12:34.943 | INFO | __main__:<module>:23 - Analysis date: 2023-09-13
2023-10-31 18:12:57.436 | INFO | __main__:<module>:69 - execution time: 0:00:22.465316
Loading

0 comments on commit 64afd3b

Please sign in to comment.