From ef6087866918336e228389e04fa97f0afbe1827c Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Tue, 31 Oct 2023 17:47:44 +0000 Subject: [PATCH] look where arrival order is wrong --- .../25_interpolation_issues.ipynb | 573 ++++++++++++++++++ .../scripts/handle_common_errors.py | 3 +- 2 files changed, 575 insertions(+), 1 deletion(-) create mode 100644 rt_segment_speeds/25_interpolation_issues.ipynb diff --git a/rt_segment_speeds/25_interpolation_issues.ipynb b/rt_segment_speeds/25_interpolation_issues.ipynb new file mode 100644 index 000000000..6cf960cb3 --- /dev/null +++ b/rt_segment_speeds/25_interpolation_issues.ipynb @@ -0,0 +1,573 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0a0bbc47-639d-47f7-a051-d4a2f19f7dbc", + "metadata": {}, + "source": [ + "# Interpolation method - break down steps to debug\n", + "* When finding nearest vp, there are stops that have later arrival times coming before earlier arrivals.\n", + " * stop 1 arrives after stop 2. this can't happen in real life.\n", + " * filter down to these stops and try again\n", + " * how do we distinguish between whether stop 1 or stop 2 is the \"correct\" one?\n", + "* Want to add direction to deal with loop or inlining shapes --> maybe this can be expanded to all shapes\n", + " * Do not even allow opposite direction vp to be selected as nearest vp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef830f30-e979-4916-857e-08abe9762888", + "metadata": {}, + "outputs": [], + "source": [ + "import dask.dataframe as dd\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from segment_speed_utils import helpers, wrangle_shapes\n", + "from segment_speed_utils.project_vars import SEGMENT_GCS, PROJECT_CRS \n", + " \n", + "from shared_utils import rt_dates\n", + "\n", + "analysis_date = rt_dates.DATES[\"sep2023\"]\n", + "\n", + "STOP_SEG_DICT = helpers.get_parameters(\n", + " \"./scripts/config.yml\", \"stop_segments\")" + ] + }, + { + "cell_type": "markdown", + "id": "c5f369bb-68bf-46a2-86ad-6279872859b1", + "metadata": {}, + "source": [ + "## Between stops, how to find stops behaving not as expected\n", + "There are erroneous calculations here.\n", + "\n", + "Prior arrival time can't take place **after** arrival time. \n", + "\n", + "Ex: `trip_instance_key == \"0000412e74a0e91993ce66bcbc4e3e73\"`, stop 1's nearest position is after stop 2's nearest position.\n", + "\n", + "Handle these along with loop or inlining shapes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19614462-03ca-4a84-a119-3a38ca719b0d", + "metadata": {}, + "outputs": [], + "source": [ + "NEAREST_VP = f\"{STOP_SEG_DICT['stage2']}_error_{analysis_date}\"\n", + "\n", + "df = pd.read_parquet(\n", + " f\"{SEGMENT_GCS}projection/{NEAREST_VP}.parquet\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3077ade-87c1-4b9d-8cf7-bbb743a03823", + "metadata": {}, + "outputs": [], + "source": [ + "df.error_arrival_order.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d0373a4-80fc-49e1-bac3-2edd8c5ae4d0", + "metadata": {}, + "outputs": [], + "source": [ + "df.error_same_endpoints.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11d2a032-83db-43b7-a7fb-9254a10ae524", + "metadata": {}, + "outputs": [], + "source": [ + "df[(df.error_same_endpoints==1) & \n", + " (df.error_arrival_order==1)].shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ea55941-8d10-49f2-8765-6b2faba4080d", + "metadata": {}, + "outputs": [], + "source": [ + "trip_stats = (df.groupby(\"trip_instance_key\", \n", + " observed=True, group_keys=False)\n", + " .agg({\n", + " \"error_same_endpoints\": \"mean\",\n", + " \"error_arrival_order\": \"mean\"\n", + " }).reset_index()\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83fde1e4-29b1-43ab-b30c-f98ec63a87c8", + "metadata": {}, + "outputs": [], + "source": [ + "# Very few trips are completely error-free\n", + "trip_stats[(trip_stats.error_same_endpoints==0) & \n", + " (trip_stats.error_arrival_order==0)].shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bd9d1b1-48f0-46e1-9875-181ea20df66a", + "metadata": {}, + "outputs": [], + "source": [ + "#trip_stats.sample(10).trip_instance_key.unique()\n", + "subset_trip_keys = [\n", + " '9fad69264acd8387150f45b27d4b2d09',\n", + " '44a55d2fa2588a479065ef7702475ef1',\n", + " '36070a2428e62b96368d072eb2a8fc1b',\n", + " '7f665900c6b0879f4b9bda43b93fefe3',\n", + " '8e8ba9993d52388539d06a46710c1dbc',\n", + " 'b301c2170c1ca49bbc1a9b600cccf643',\n", + " '9373f5b0de977a718dea50fd90443619',\n", + " '8415b3949147c9dc3d5ceb37863440b1',\n", + " '984f598419c1d0830ef4618d495c1bd7',\n", + " '815e4dd921cdcb61ad2dbb1ca5f08a39'\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "f546357f-fdeb-4ece-bbd6-f3b9b5f53c30", + "metadata": {}, + "source": [ + "If almost every trip has errors, it matters where the arrival order error comes in. Must take the minimum stop sequence of a clear monotonic period and set that as the max to filter out vp_idx that occur afterwards." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f172f124-8221-4d65-9a06-f5cdb2e15120", + "metadata": {}, + "outputs": [], + "source": [ + "import altair as alt\n", + "\n", + "for t in subset_trip_keys:\n", + " \n", + " chart = (alt.Chart(df[df.trip_instance_key==t])\n", + " .mark_line()\n", + " .encode(\n", + " x=\"stop_sequence\",\n", + " y=\"error_arrival_order\"\n", + " ).properties(title=f\"{t}\")\n", + " )\n", + " display(chart)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcdc9773-fed6-4b81-9ccb-c868dca01535", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24a28b40-0548-412e-b21e-de7c455e7708", + "metadata": {}, + "outputs": [], + "source": [ + "nearest_array = df.groupby(\"trip_instance_key\").agg(\n", + " {\"nearest_vp_idx\": lambda x: list(x)}).reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3543a028-58a2-42e8-9bdb-cda4d6e88f45", + "metadata": {}, + "outputs": [], + "source": [ + "nearest_array = nearest_array.assign(\n", + " array_diff = nearest_array.apply(\n", + " lambda x: \n", + " np.ediff1d(np.asarray(x.nearest_vp_idx)),\n", + " axis=1)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "702ea4a1-92f9-4adf-940b-b43505692613", + "metadata": {}, + "outputs": [], + "source": [ + "nearest_array = nearest_array.assign(\n", + " wrong_times = nearest_array.apply(\n", + " lambda x: 1 if len(np.where(x.array_diff < 0)[0] > 0)\n", + " else 0, axis=1\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a14c62ce-3f27-4c09-bb67-91113ba0afd4", + "metadata": {}, + "outputs": [], + "source": [ + "nearest_array.wrong_times.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "6324f8c7-4b80-4f36-801d-babc482dc1e3", + "metadata": {}, + "source": [ + "## Index into specific portions of array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90e7c452-9054-4f61-8c1f-c78526a5fd23", + "metadata": {}, + "outputs": [], + "source": [ + "one_trip = \"bf87a17838cdaff5ba78fb70edd4f1bb\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4779281-aa0d-4f12-82d8-7f8654ae491d", + "metadata": {}, + "outputs": [], + "source": [ + "def rt_trips_to_shape(analysis_date: str) -> pd.DataFrame:\n", + " \"\"\"\n", + " Filter down trip_instance_keys from schedule to \n", + " trips present in vp.\n", + " Provide shape_array_key associated with trip_instance_key.\n", + " \"\"\"\n", + " # Get RT trips\n", + " rt_trips = pd.read_parquet(\n", + " f\"{SEGMENT_GCS}vp_usable_{analysis_date}\",\n", + " filters = [[(\"trip_instance_key\", \"==\", one_trip)]],\n", + " columns = [\"trip_instance_key\"]\n", + " ).drop_duplicates()\n", + "\n", + " # Find the shape_array_key for RT trips\n", + " trip_to_shape = helpers.import_scheduled_trips(\n", + " analysis_date,\n", + " columns = [\"trip_instance_key\", \"shape_array_key\"],\n", + " get_pandas = True\n", + " ).merge(\n", + " rt_trips,\n", + " on = \"trip_instance_key\",\n", + " how = \"inner\"\n", + " )\n", + "\n", + " # Find whether it's loop or inlining\n", + " shapes_loop_inlining = pd.read_parquet(\n", + " f\"{SEGMENT_GCS}stops_projected_{analysis_date}.parquet\",\n", + " columns = [\n", + " \"shape_array_key\", \"loop_or_inlining\", \n", + " \"stop_primary_direction\", \n", + " ]\n", + " ).drop_duplicates().merge(\n", + " trip_to_shape,\n", + " on = \"shape_array_key\",\n", + " how = \"inner\"\n", + " )\n", + " \n", + " return shapes_loop_inlining" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f3e202c-2854-4874-8f09-8074c3025f47", + "metadata": {}, + "outputs": [], + "source": [ + "trip_shape_crosswalk = rt_trips_to_shape(analysis_date)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c332ddfa-90d9-4a3f-bc81-005d1e277422", + "metadata": {}, + "outputs": [], + "source": [ + "vp = pd.read_parquet(\n", + " f\"{SEGMENT_GCS}vp_usable_{analysis_date}\",\n", + " filters = [[(\"trip_instance_key\", \"==\", one_trip)]],\n", + " columns = [\"trip_instance_key\", \"vp_idx\", \n", + " \"vp_primary_direction\", \n", + " ]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae73eb2d-ac83-405a-9aab-9143ec8874b3", + "metadata": {}, + "outputs": [], + "source": [ + "subset_vp = vp.vp_idx.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3172933b-6d1b-4ef7-af50-5137c638b72b", + "metadata": {}, + "outputs": [], + "source": [ + "projected_shape_meters = pd.read_parquet(\n", + " f\"{SEGMENT_GCS}projection/vp_projected_{analysis_date}.parquet\",\n", + " filters = [[(\"vp_idx\", \"in\", subset_vp)]]\n", + ")\n", + "\n", + "vp_with_projection = pd.merge(\n", + " vp,\n", + " projected_shape_meters,\n", + " on = \"vp_idx\",\n", + " how = \"inner\"\n", + ").merge(\n", + " trip_shape_crosswalk,\n", + " on = \"trip_instance_key\",\n", + " how = \"inner\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8687b8b-5550-484b-a080-aa8f787e95be", + "metadata": {}, + "outputs": [], + "source": [ + "shape_keys = trip_shape_crosswalk.shape_array_key.unique()\n", + "\n", + "stops_projected = pd.read_parquet(\n", + " f\"{SEGMENT_GCS}stops_projected_{analysis_date}.parquet\",\n", + " filters = [[(\"shape_array_key\", \"in\", shape_keys)]],\n", + " columns = [\"shape_array_key\", \"stop_sequence\", \"stop_id\", \n", + " \"shape_meters\", \"stop_primary_direction\"]\n", + ").rename(columns = {\"shape_meters\": \"stop_meters\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edac7256-c5ca-4bef-bf71-573b3a4d30c3", + "metadata": {}, + "outputs": [], + "source": [ + "trip_shape_cols = [\"trip_instance_key\", \"shape_array_key\"]\n", + "\n", + "trip_info = (\n", + " vp_with_projection\n", + " .groupby(trip_shape_cols, \n", + " observed=True, group_keys=False)\n", + " .agg({\n", + " \"vp_idx\": lambda x: list(x),\n", + " \"shape_meters\": lambda x: list(x),\n", + " \"vp_primary_direction\": lambda x: list(x),\n", + " })\n", + " .reset_index()\n", + " .rename(columns = {\n", + " \"vp_idx\": \"vp_idx_arr\",\n", + " \"shape_meters\": \"shape_meters_arr\",\n", + " \"vp_primary_direction\": \"vp_dir_arr\"\n", + " })\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ecfba640-5c25-4866-9b6c-3c32dcb8ec63", + "metadata": {}, + "outputs": [], + "source": [ + "vp_to_stop = pd.merge(\n", + " trip_info,\n", + " stops_projected,\n", + " on = \"shape_array_key\",\n", + " how = \"inner\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aeaad1d-a191-49d9-b0d1-c5fe14f4ade5", + "metadata": {}, + "outputs": [], + "source": [ + "this_stop_direction = vp_to_stop.stop_primary_direction.iloc[0]\n", + "this_stop_meters = vp_to_stop.stop_meters.iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28e1a479-79fe-4ae1-8921-02d872257213", + "metadata": {}, + "outputs": [], + "source": [ + "this_stop_direction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11360fa7-de28-4f62-aa43-da65c61b1670", + "metadata": {}, + "outputs": [], + "source": [ + "opposite_to_stop_direction = wrangle_shapes.OPPOSITE_DIRECTIONS[\n", + " this_stop_direction]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19fde734-15f8-4968-9637-014f6f80f469", + "metadata": {}, + "outputs": [], + "source": [ + "opposite_to_stop_direction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "008612d6-e8db-4ea3-b2ff-64d94976381a", + "metadata": {}, + "outputs": [], + "source": [ + "vp_dir_array = np.asarray(vp_to_stop.vp_dir_arr.iloc[0])\n", + "shape_meters_array = np.asarray(vp_to_stop.shape_meters_arr.iloc[0])\n", + "vp_meters_array = np.asarray(vp_to_stop.vp_idx_arr.iloc[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e668c5c3-2d0b-4039-b1f9-6f1cfc2896fd", + "metadata": {}, + "outputs": [], + "source": [ + "#https://stackoverflow.com/questions/16094563/numpy-get-index-where-value-is-true\n", + "valid_vp_idx_indices = (vp_dir_array != opposite_to_stop_direction).nonzero()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1ebd663-254e-44ae-87ae-2cc90e527e62", + "metadata": {}, + "outputs": [], + "source": [ + "vp_meters_array[valid_vp_idx_indices]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e6c7bad-c77d-4695-b685-f5de12374490", + "metadata": {}, + "outputs": [], + "source": [ + "for row in vp_to_stop.tail(1).itertuples():\n", + " this_stop_meters = np.asarray(getattr(row, \"vp_idx_arr\"))[valid_vp_idx_indices]\n", + " #valid_stop_meters = this_stop_meters\n", + " #print(valid_stop_meters)\n", + " print(this_stop_meters)" + ] + }, + { + "cell_type": "markdown", + "id": "039ea93e-3950-467d-bdcf-398d004fbaac", + "metadata": {}, + "source": [ + "## Plot interpolated arrivals -> speed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "680172b7-7d24-42ab-8ef7-5336e33b9d41", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_parquet(\n", + " f\"{SEGMENT_GCS}stop_arrivals_speed_{analysis_date}_2.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aba87999-00da-46b7-b839-d521fcb0eb7e", + "metadata": {}, + "outputs": [], + "source": [ + "bins = range(0, 75, 5)\n", + "\n", + "df[df.speed_mph < 80].hist(\n", + " \"speed_mph\", \n", + " bins = bins)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0216fe6d-5e2a-4ba4-8aea-68a1c029593b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/rt_segment_speeds/scripts/handle_common_errors.py b/rt_segment_speeds/scripts/handle_common_errors.py index 8285c9984..63939f503 100644 --- a/rt_segment_speeds/scripts/handle_common_errors.py +++ b/rt_segment_speeds/scripts/handle_common_errors.py @@ -65,4 +65,5 @@ def tag_common_errors(df: pd.DataFrame) -> pd.DataFrame: f"{SEGMENT_GCS}projection/{NEAREST_VP}.parquet" ).pipe(tag_common_errors) - + df.to_parquet( + f"{SEGMENT_GCS}projection/nearest_vp_error_{analysis_date}.parquet")