From ef6087866918336e228389e04fa97f0afbe1827c Mon Sep 17 00:00:00 2001
From: tiffanychu90 <tiffany.chu@dot.ca.gov>
Date: Tue, 31 Oct 2023 17:47:44 +0000
Subject: [PATCH] look where arrival order is wrong

---
 .../25_interpolation_issues.ipynb             | 573 ++++++++++++++++++
 .../scripts/handle_common_errors.py           |   3 +-
 2 files changed, 575 insertions(+), 1 deletion(-)
 create mode 100644 rt_segment_speeds/25_interpolation_issues.ipynb

diff --git a/rt_segment_speeds/25_interpolation_issues.ipynb b/rt_segment_speeds/25_interpolation_issues.ipynb
new file mode 100644
index 000000000..6cf960cb3
--- /dev/null
+++ b/rt_segment_speeds/25_interpolation_issues.ipynb
@@ -0,0 +1,573 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0a0bbc47-639d-47f7-a051-d4a2f19f7dbc",
+   "metadata": {},
+   "source": [
+    "# Interpolation method - break down steps to debug\n",
+    "* When finding nearest vp, there are stops that have later arrival times coming before earlier arrivals.\n",
+    "   * stop 1 arrives after stop 2. this can't happen in real life.\n",
+    "   * filter down to these stops and try again\n",
+    "   * how do we distinguish between whether stop 1 or stop 2 is the \"correct\" one?\n",
+    "* Want to add direction to deal with loop or inlining shapes --> maybe this can be expanded to all shapes\n",
+    "   * Do not even allow opposite direction vp to be selected as nearest vp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef830f30-e979-4916-857e-08abe9762888",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import dask.dataframe as dd\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "from segment_speed_utils import helpers, wrangle_shapes\n",
+    "from segment_speed_utils.project_vars import SEGMENT_GCS, PROJECT_CRS \n",
+    "                                              \n",
+    "from shared_utils import rt_dates\n",
+    "\n",
+    "analysis_date = rt_dates.DATES[\"sep2023\"]\n",
+    "\n",
+    "STOP_SEG_DICT = helpers.get_parameters(\n",
+    "    \"./scripts/config.yml\", \"stop_segments\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c5f369bb-68bf-46a2-86ad-6279872859b1",
+   "metadata": {},
+   "source": [
+    "## Between stops, how to find stops behaving not as expected\n",
+    "There are erroneous calculations here.\n",
+    "\n",
+    "Prior arrival time can't take place **after** arrival time. \n",
+    "\n",
+    "Ex: `trip_instance_key == \"0000412e74a0e91993ce66bcbc4e3e73\"`, stop 1's nearest position is after stop 2's nearest position.\n",
+    "\n",
+    "Handle these along with loop or inlining shapes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19614462-03ca-4a84-a119-3a38ca719b0d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "NEAREST_VP = f\"{STOP_SEG_DICT['stage2']}_error_{analysis_date}\"\n",
+    "\n",
+    "df = pd.read_parquet(\n",
+    "    f\"{SEGMENT_GCS}projection/{NEAREST_VP}.parquet\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c3077ade-87c1-4b9d-8cf7-bbb743a03823",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.error_arrival_order.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3d0373a4-80fc-49e1-bac3-2edd8c5ae4d0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.error_same_endpoints.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11d2a032-83db-43b7-a7fb-9254a10ae524",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[(df.error_same_endpoints==1) & \n",
+    "   (df.error_arrival_order==1)].shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2ea55941-8d10-49f2-8765-6b2faba4080d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "trip_stats = (df.groupby(\"trip_instance_key\", \n",
+    "                         observed=True, group_keys=False)\n",
+    "              .agg({\n",
+    "                  \"error_same_endpoints\": \"mean\",\n",
+    "                  \"error_arrival_order\": \"mean\"\n",
+    "              }).reset_index()\n",
+    "             )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "83fde1e4-29b1-43ab-b30c-f98ec63a87c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Very few trips are completely error-free\n",
+    "trip_stats[(trip_stats.error_same_endpoints==0) & \n",
+    "           (trip_stats.error_arrival_order==0)].shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1bd9d1b1-48f0-46e1-9875-181ea20df66a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#trip_stats.sample(10).trip_instance_key.unique()\n",
+    "subset_trip_keys = [\n",
+    "    '9fad69264acd8387150f45b27d4b2d09',\n",
+    "    '44a55d2fa2588a479065ef7702475ef1',\n",
+    "    '36070a2428e62b96368d072eb2a8fc1b',\n",
+    "    '7f665900c6b0879f4b9bda43b93fefe3',\n",
+    "    '8e8ba9993d52388539d06a46710c1dbc',\n",
+    "    'b301c2170c1ca49bbc1a9b600cccf643',\n",
+    "    '9373f5b0de977a718dea50fd90443619',\n",
+    "    '8415b3949147c9dc3d5ceb37863440b1',\n",
+    "    '984f598419c1d0830ef4618d495c1bd7',\n",
+    "    '815e4dd921cdcb61ad2dbb1ca5f08a39'\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f546357f-fdeb-4ece-bbd6-f3b9b5f53c30",
+   "metadata": {},
+   "source": [
+    "If almost every trip has errors, it matters where the arrival order error comes in. Must take the minimum stop sequence of a clear monotonic period and set that as the max to filter out vp_idx that occur afterwards."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f172f124-8221-4d65-9a06-f5cdb2e15120",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import altair as alt\n",
+    "\n",
+    "for t in subset_trip_keys:\n",
+    "    \n",
+    "    chart = (alt.Chart(df[df.trip_instance_key==t])\n",
+    "             .mark_line()\n",
+    "             .encode(\n",
+    "                 x=\"stop_sequence\",\n",
+    "                 y=\"error_arrival_order\"\n",
+    "             ).properties(title=f\"{t}\")\n",
+    "    )\n",
+    "    display(chart)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bcdc9773-fed6-4b81-9ccb-c868dca01535",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24a28b40-0548-412e-b21e-de7c455e7708",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nearest_array = df.groupby(\"trip_instance_key\").agg(\n",
+    "    {\"nearest_vp_idx\": lambda x: list(x)}).reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3543a028-58a2-42e8-9bdb-cda4d6e88f45",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nearest_array = nearest_array.assign(\n",
+    "    array_diff = nearest_array.apply(\n",
+    "        lambda x: \n",
+    "        np.ediff1d(np.asarray(x.nearest_vp_idx)),\n",
+    "        axis=1)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "702ea4a1-92f9-4adf-940b-b43505692613",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nearest_array = nearest_array.assign(\n",
+    "    wrong_times = nearest_array.apply(\n",
+    "        lambda x: 1 if len(np.where(x.array_diff < 0)[0] > 0)\n",
+    "        else 0, axis=1\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a14c62ce-3f27-4c09-bb67-91113ba0afd4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nearest_array.wrong_times.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6324f8c7-4b80-4f36-801d-babc482dc1e3",
+   "metadata": {},
+   "source": [
+    "## Index into specific portions of array"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "90e7c452-9054-4f61-8c1f-c78526a5fd23",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "one_trip = \"bf87a17838cdaff5ba78fb70edd4f1bb\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e4779281-aa0d-4f12-82d8-7f8654ae491d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def rt_trips_to_shape(analysis_date: str) -> pd.DataFrame:\n",
+    "    \"\"\"\n",
+    "    Filter down trip_instance_keys from schedule to \n",
+    "    trips present in vp.\n",
+    "    Provide shape_array_key associated with trip_instance_key.\n",
+    "    \"\"\"\n",
+    "    # Get RT trips\n",
+    "    rt_trips = pd.read_parquet(\n",
+    "        f\"{SEGMENT_GCS}vp_usable_{analysis_date}\",\n",
+    "        filters = [[(\"trip_instance_key\", \"==\", one_trip)]],\n",
+    "        columns = [\"trip_instance_key\"]\n",
+    "    ).drop_duplicates()\n",
+    "\n",
+    "    # Find the shape_array_key for RT trips\n",
+    "    trip_to_shape = helpers.import_scheduled_trips(\n",
+    "        analysis_date,\n",
+    "        columns = [\"trip_instance_key\", \"shape_array_key\"],\n",
+    "        get_pandas = True\n",
+    "    ).merge(\n",
+    "        rt_trips,\n",
+    "        on = \"trip_instance_key\",\n",
+    "        how = \"inner\"\n",
+    "    )\n",
+    "\n",
+    "    # Find whether it's loop or inlining\n",
+    "    shapes_loop_inlining = pd.read_parquet(\n",
+    "        f\"{SEGMENT_GCS}stops_projected_{analysis_date}.parquet\",\n",
+    "        columns = [\n",
+    "            \"shape_array_key\", \"loop_or_inlining\", \n",
+    "            \"stop_primary_direction\", \n",
+    "        ]\n",
+    "    ).drop_duplicates().merge(\n",
+    "        trip_to_shape,\n",
+    "        on = \"shape_array_key\",\n",
+    "        how = \"inner\"\n",
+    "    )\n",
+    "    \n",
+    "    return shapes_loop_inlining"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f3e202c-2854-4874-8f09-8074c3025f47",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "trip_shape_crosswalk = rt_trips_to_shape(analysis_date)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c332ddfa-90d9-4a3f-bc81-005d1e277422",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vp = pd.read_parquet(\n",
+    "    f\"{SEGMENT_GCS}vp_usable_{analysis_date}\",\n",
+    "    filters = [[(\"trip_instance_key\", \"==\", one_trip)]],\n",
+    "    columns = [\"trip_instance_key\", \"vp_idx\", \n",
+    "                \"vp_primary_direction\", \n",
+    "                  ]\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ae73eb2d-ac83-405a-9aab-9143ec8874b3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "subset_vp = vp.vp_idx.unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3172933b-6d1b-4ef7-af50-5137c638b72b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projected_shape_meters = pd.read_parquet(\n",
+    "    f\"{SEGMENT_GCS}projection/vp_projected_{analysis_date}.parquet\",\n",
+    "    filters = [[(\"vp_idx\", \"in\", subset_vp)]]\n",
+    ")\n",
+    "\n",
+    "vp_with_projection = pd.merge(\n",
+    "    vp,\n",
+    "    projected_shape_meters,\n",
+    "    on = \"vp_idx\",\n",
+    "    how = \"inner\"\n",
+    ").merge(\n",
+    "    trip_shape_crosswalk,\n",
+    "        on = \"trip_instance_key\",\n",
+    "        how = \"inner\"\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c8687b8b-5550-484b-a080-aa8f787e95be",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "shape_keys = trip_shape_crosswalk.shape_array_key.unique()\n",
+    "\n",
+    "stops_projected = pd.read_parquet(\n",
+    "    f\"{SEGMENT_GCS}stops_projected_{analysis_date}.parquet\",\n",
+    "    filters = [[(\"shape_array_key\", \"in\", shape_keys)]],\n",
+    "    columns = [\"shape_array_key\", \"stop_sequence\", \"stop_id\", \n",
+    "               \"shape_meters\", \"stop_primary_direction\"]\n",
+    ").rename(columns = {\"shape_meters\": \"stop_meters\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "edac7256-c5ca-4bef-bf71-573b3a4d30c3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "trip_shape_cols = [\"trip_instance_key\", \"shape_array_key\"]\n",
+    "\n",
+    "trip_info = (\n",
+    "    vp_with_projection\n",
+    "    .groupby(trip_shape_cols, \n",
+    "              observed=True, group_keys=False)\n",
+    "    .agg({\n",
+    "        \"vp_idx\": lambda x: list(x),\n",
+    "        \"shape_meters\": lambda x: list(x),\n",
+    "        \"vp_primary_direction\": lambda x: list(x),\n",
+    "    })\n",
+    "    .reset_index()\n",
+    "    .rename(columns = {\n",
+    "        \"vp_idx\": \"vp_idx_arr\",\n",
+    "        \"shape_meters\": \"shape_meters_arr\",\n",
+    "        \"vp_primary_direction\": \"vp_dir_arr\"\n",
+    "    })\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ecfba640-5c25-4866-9b6c-3c32dcb8ec63",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vp_to_stop = pd.merge(\n",
+    "    trip_info,\n",
+    "    stops_projected,\n",
+    "    on = \"shape_array_key\",\n",
+    "    how = \"inner\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5aeaad1d-a191-49d9-b0d1-c5fe14f4ade5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "this_stop_direction = vp_to_stop.stop_primary_direction.iloc[0]\n",
+    "this_stop_meters = vp_to_stop.stop_meters.iloc[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "28e1a479-79fe-4ae1-8921-02d872257213",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "this_stop_direction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11360fa7-de28-4f62-aa43-da65c61b1670",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "opposite_to_stop_direction = wrangle_shapes.OPPOSITE_DIRECTIONS[\n",
+    "    this_stop_direction]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19fde734-15f8-4968-9637-014f6f80f469",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "opposite_to_stop_direction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "008612d6-e8db-4ea3-b2ff-64d94976381a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vp_dir_array = np.asarray(vp_to_stop.vp_dir_arr.iloc[0])\n",
+    "shape_meters_array = np.asarray(vp_to_stop.shape_meters_arr.iloc[0])\n",
+    "vp_meters_array = np.asarray(vp_to_stop.vp_idx_arr.iloc[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e668c5c3-2d0b-4039-b1f9-6f1cfc2896fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#https://stackoverflow.com/questions/16094563/numpy-get-index-where-value-is-true\n",
+    "valid_vp_idx_indices = (vp_dir_array != opposite_to_stop_direction).nonzero()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1ebd663-254e-44ae-87ae-2cc90e527e62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vp_meters_array[valid_vp_idx_indices]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1e6c7bad-c77d-4695-b685-f5de12374490",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for row in vp_to_stop.tail(1).itertuples():\n",
+    "    this_stop_meters = np.asarray(getattr(row, \"vp_idx_arr\"))[valid_vp_idx_indices]\n",
+    "    #valid_stop_meters = this_stop_meters\n",
+    "    #print(valid_stop_meters)\n",
+    "    print(this_stop_meters)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "039ea93e-3950-467d-bdcf-398d004fbaac",
+   "metadata": {},
+   "source": [
+    "## Plot interpolated arrivals -> speed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "680172b7-7d24-42ab-8ef7-5336e33b9d41",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_parquet(\n",
+    "    f\"{SEGMENT_GCS}stop_arrivals_speed_{analysis_date}_2.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aba87999-00da-46b7-b839-d521fcb0eb7e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bins = range(0, 75, 5)\n",
+    "\n",
+    "df[df.speed_mph < 80].hist(\n",
+    "    \"speed_mph\", \n",
+    "     bins = bins)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0216fe6d-5e2a-4ba4-8aea-68a1c029593b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/rt_segment_speeds/scripts/handle_common_errors.py b/rt_segment_speeds/scripts/handle_common_errors.py
index 8285c9984..63939f503 100644
--- a/rt_segment_speeds/scripts/handle_common_errors.py
+++ b/rt_segment_speeds/scripts/handle_common_errors.py
@@ -65,4 +65,5 @@ def tag_common_errors(df: pd.DataFrame) -> pd.DataFrame:
         f"{SEGMENT_GCS}projection/{NEAREST_VP}.parquet"
     ).pipe(tag_common_errors)
     
-    
+    df.to_parquet(
+        f"{SEGMENT_GCS}projection/nearest_vp_error_{analysis_date}.parquet")