From a746ecbdba9f6f24e64269eec0c5e0cbffd18c3d Mon Sep 17 00:00:00 2001 From: Rahul Kulhalli Date: Mon, 29 Apr 2024 14:22:51 -0400 Subject: [PATCH 1/6] synced with main, cleaned outputs --- .../01_extract_db_data.ipynb | 1645 +++++++++++++++++ .../02_run_trip_level_models.py | 491 +++++ .../03_user_level_models.ipynb | 1120 +++++++++++ .../04_FeatureClustering.ipynb | 1108 +++++++++++ replacement_mode_modeling/README.md | 31 + replacement_mode_modeling/data/README.md | 1 + replacement_mode_modeling/outputs/README.md | 1 + 7 files changed, 4397 insertions(+) create mode 100644 replacement_mode_modeling/01_extract_db_data.ipynb create mode 100644 replacement_mode_modeling/02_run_trip_level_models.py create mode 100644 replacement_mode_modeling/03_user_level_models.ipynb create mode 100644 replacement_mode_modeling/04_FeatureClustering.ipynb create mode 100644 replacement_mode_modeling/README.md create mode 100644 replacement_mode_modeling/data/README.md create mode 100644 replacement_mode_modeling/outputs/README.md diff --git a/replacement_mode_modeling/01_extract_db_data.ipynb b/replacement_mode_modeling/01_extract_db_data.ipynb new file mode 100644 index 0000000..216b88b --- /dev/null +++ b/replacement_mode_modeling/01_extract_db_data.ipynb @@ -0,0 +1,1645 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "38b147ff", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import ast\n", + "import sys\n", + "import pickle\n", + "import importlib\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from pandas.api.types import is_string_dtype\n", + "from pathlib import Path\n", + "from uuid import UUID\n", + "from collections import defaultdict\n", + "\n", + "pd.set_option(\"display.max_columns\", 100)\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e550aa2b", + "metadata": {}, + "outputs": [], + "source": [ + 
"INCLUDE_TEST_USERS = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39306a1d", + "metadata": {}, + "outputs": [], + "source": [ + "# Add path to your emission server here.\n", + "emission_path = Path(os.getcwd()).parent.parent / 'my_emission_server' / 'e-mission-server'\n", + "sys.path.append(str(emission_path))\n", + "\n", + "# Also add the home (viz_scripts) to the path\n", + "sys.path.append('../viz_scripts')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94f673d6", + "metadata": {}, + "outputs": [], + "source": [ + "import scaffolding\n", + "import emission.core.get_database as edb\n", + "import emission.storage.timeseries.abstract_timeseries as esta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e171e277", + "metadata": {}, + "outputs": [], + "source": [ + "DB_SOURCE = [\n", + " \"Stage_database\", # Does NOT have composite trips BUT has section modes and distances\n", + " \"openpath_prod_durham\", # Has composite trips\n", + " \"openpath_prod_mm_masscec\", # Has composite trips\n", + " \"openpath_prod_ride2own\", # Has composite trips\n", + "# \"openpath_prod_uprm_civic\", # No replaced mode (Excluded)\n", + " \"openpath_prod_uprm_nicr\" # Has composite trips\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70fa3112", + "metadata": {}, + "outputs": [], + "source": [ + "CURRENT_DB = DB_SOURCE[0]\n", + "\n", + "assert CURRENT_DB in DB_SOURCE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbde79d1", + "metadata": {}, + "outputs": [], + "source": [ + "REPLACED_MODE_DICT = {\n", + " \"Stage_database\": {\n", + " 'no_trip': 'no_trip',\n", + " 'no_travel': 'no_trip',\n", + " 'Unknown': 'unknown',\n", + " 'unknown': 'unknown',\n", + " 'bus': 'transit',\n", + " 'drove_alone': 'car',\n", + " 'bike': 'p_micro',\n", + " 'shared_ride': 's_car',\n", + " 'walk': 'walk',\n", + " 'train': 'transit',\n", + " 'bikeshare': 's_micro',\n", 
+ " 'not_a trip': 'no_trip',\n", + " 'pilot_ebike': 'p_micro',\n", + " 'electric_car': 'car',\n", + " 'taxi': 'ridehail',\n", + " 'not_a_trip': 'no_trip',\n", + " 'run': 'walk',\n", + " 'scootershare': 's_micro',\n", + " 'tramway': 'transit',\n", + " 'free_shuttle': 'transit',\n", + " 'e-bike': 'p_micro',\n", + " 'rental_car': 'car',\n", + " 'train_+ bus': 'transit',\n", + " 'skateboard': 'p_micro',\n", + " 'snowboarding': 'p_micro',\n", + " 'e_bike': 'p_micro',\n", + " 'golf_cart': 'unknown',\n", + " 'emergency_vehicle with others': 's_car',\n", + " 'call_friend': 's_car',\n", + " 'no_replacement': 'no_travel',\n", + " 'doing_nothing': 'no_trip',\n", + " 'na': 'no_trip',\n", + " 'ebike': 'p_micro',\n", + " 'hiking': 'walk',\n", + " 'n/a': 'no_trip',\n", + " 'testing': 'unknown',\n", + " 'home': 'no_trip',\n", + " 'must_walk 3-5 mi a day for back': 'walk',\n", + " 'family': 's_car',\n", + " 'car': 'car',\n", + " 'pilot_e-bike': 'p_micro',\n", + " 'pilot_bike': 'p_micro',\n", + " 'time_spent on the clock at amazon': 'no_trip',\n", + " 'working': 'no_trip',\n", + " 'walk_at work': 'walk',\n", + " 'sitting_on my butt doing nothing': 'no_trip',\n", + " 'nothing._delivered food for work': 'no_trip',\n", + " 'train,_bus and walk': 'transit',\n", + " 'work_vehicle': 'car',\n", + " 'friend_picked me up': 's_car',\n", + " 'ski': 'p_micro',\n", + " 'not_accurate': 'unknown',\n", + " 'stolen_ebike': 'p_micro'\n", + " },\n", + " \"openpath_prod_durham\": {\n", + " 'Unknown': 'unknown',\n", + " 'bike': 'p_micro',\n", + " 'shared_ride': 's_car',\n", + " 'drove_alone': 'car',\n", + " 'bus': 'transit',\n", + " 'no_travel': 'no_trip',\n", + " 'scootershare': 's_micro',\n", + " 'walk': 'walk',\n", + " 'taxi': 'ridehail',\n", + " 'e_car_drove_alone': 'car',\n", + " 'bikeshare': 's_micro',\n", + " 'ebike': 'p_micro',\n", + " 'train': 'transit',\n", + " 'e_car_shared_ride': 's_car'\n", + " },\n", + " \"openpath_prod_mm_masscec\": {\n", + " 'Unknown': 'unknown',\n", + " 'drove_alone': 
'car',\n", + " 'walk': 'walk',\n", + " 'shared_ride': 's_car',\n", + " 'bike': 'p_micro',\n", + " 'bikeshare': 's_micro',\n", + " 'no_travel': 'no_trip',\n", + " 'taxi': 'ridehail',\n", + " 'bus': 'transit',\n", + " 'scootershare': 's_micro',\n", + " 'train': 'transit',\n", + " 'walking': 'walk',\n", + " 'e_car_drove_alone': 'car'\n", + " },\n", + " \"openpath_prod_ride2own\": {\n", + " 'Unknown': 'unknown',\n", + " 'drove_alone': 'car',\n", + " 'walk': 'walk',\n", + " 'shared_ride': 's_car',\n", + " 'bike': 'p_micro',\n", + " 'no_travel': 'no_trip',\n", + " 'taxi': 'ridehail',\n", + " 'bus': 'transit',\n", + " 'train': 'transit',\n", + " 'e_car_drove_alone': 'car',\n", + " 'e_car_shared_ride': 's_car'\n", + " },\n", + " \"openpath_prod_uprm_nicr\": {\n", + " 'Unknown': 'unknown',\n", + " 'walk': 'walk',\n", + " 'drove_alone': 'car'\n", + " }\n", + "}\n", + "\n", + "SENSED_SECTION_DICT = {\n", + " \"openpath_prod_mm_masscec\": {'AIR_OR_HSR', 'BICYCLING', 'BUS', 'CAR', 'LIGHT_RAIL', 'SUBWAY', 'TRAIN', 'UNKNOWN', 'WALKING'}\n", + "}\n", + "\n", + "SURVEY_DATA_DICT = {\n", + " \"Stage_database\": {\n", + " \"Unique User ID (auto-filled, do not edit)\": \"user_id\",\n", + " \"In which year were you born?\": \"birth_year\",\n", + " \"What is your gender?\": \"gender\",\n", + " \"Do you have a valid driver's license?\": \"has_drivers_license\",\n", + " \"Are you a student?\": \"is_student\",\n", + " \"What is the highest grade or degree that you have completed?\": \"highest_education\",\n", + " \"Do you work for either pay or profit?\": \"is_paid\",\n", + " \"Do you have more than one job?\": \"has_multiple_jobs\",\n", + " \"Do you work full-time or part-time at your primary job?\": \"primary_job_type\",\n", + " \"Which best describes your primary job?\": \"primary_job_description\",\n", + " \"How did you usually get to your primary job last week? 
\": \"primary_job_commute_mode\",\n", + " \"Thinking about your daily commute to work last week, how many minutes did it usually take to get from home to the primary job/work place?\": \"primary_job_commute_time\",\n", + " \"At your primary job, do you have the ability to set or change your own start time?\": \"is_primary_job_flexible\",\n", + " \"Do you have the option of working from home or an alternate location instead of going into your primary work place?\": \"primary_job_can_wfh\",\n", + " \"How many days per week do you usually work from home or an alternate location?\": \"wfh_days\",\n", + " \"Do you own or rent your place of residence?\": \"residence_ownership_type\",\n", + " \"What is your home type?\": \"residence_type\",\n", + " \"Please identify which category represents your total household income, before taxes, for last year.\": \"income_category\",\n", + " \"Including yourself, how many people live in your home?\": \"n_residence_members\",\n", + " \"How many children under age 18 live in your home?\": \"n_residents_u18\",\n", + " \"Including yourself, how many people have a driver's license in your household?\": \"n_residents_with_license\",\n", + " \"How many motor vehicles are owned, leased, or available for regular use by the people who currently live in your household?\": \"n_motor_vehicles\",\n", + " \"If you were unable to use your household vehicle(s), which of the following options would be available to you to get you from place to place?\": \"available_modes\",\n", + " \"Do you have a medical condition that makes it difficult to travel outside of the home?\": \"has_medical_condition\",\n", + " \"How long have you had this condition?\": \"medical_condition_duration\"\n", + " },\n", + " # Retrieved from: e-mission-phone/survey-resources/data-xls/demo-survey-v1.xlsx\n", + " \"openpath_prod_durham\": {\n", + " \"At_your_primary_job_do_you_ha\": \"is_primary_job_flexible\",\n", + " \"Which_best_describes_your_prim\": 
\"primary_job_description\",\n", + " \"Do_you_work_full_time_or_part_\": \"primary_job_type\",\n", + " \"Do_you_have_the_option_of_work\": \"primary_job_can_wfh\",\n", + " \"Please_describe_your_primary_job\": \"primary_job_description_2\",\n", + " \"Do_you_have_more_than_one_job\": \"has_multiple_jobs\",\n", + " # Two columns: how many days/week do you work & what days of the week do you work. \n", + " # the latter has only 4 NA values, the former has 45 NA values.\n", + " \"What_days_of_the_week_do_you_t\": \"wfh_days\",\n", + " \"How_many_days_do_you_usually_w_001\": \"n_wfh_days\",\n", + " # All these are NAs.\n", + " \"Which_one_below_describe_you_b\": \"description\",\n", + " \"What_is_your_race_ethnicity\": \"race_or_ethnicity\",\n", + " \"Are_you_a_student\": \"is_student\",\n", + " \"What_is_the_highest_grade_or_d\": \"highest_education\",\n", + " \"do_you_consider_yourself_to_be\": \"is_transgender\",\n", + " \"What_is_your_gender\": \"gender\",\n", + " \"How_old_are_you\": \"age\",\n", + " \"Are_you_a_paid_worker\": \"is_paid\",\n", + " \"Do_you_have_a_driver_license\": \"has_drivers_license\",\n", + " \"How_long_you_had_this_conditio\": \"medical_condition_duration\",\n", + " \"Including_yourself_how_many_w_001\": \"n_residents_u18\",\n", + " \"Including_yourself_how_many_p\": \"n_residence_members\",\n", + " \"Do_you_own_or_rent_your_home\": \"residence_ownership_type\",\n", + " \"Please_identify_which_category\": \"income_category\",\n", + " \"If_you_were_unable_to_use_your\": \"available_modes\",\n", + " \"Including_yourself_how_many_p_001\": \"n_residents_with_license\",\n", + " \"Including_yourself_how_many_w\": \"n_working_residents\",\n", + " \"What_is_your_home_type\": \"residence_type\",\n", + " \"How_many_motor_vehicles_are_ow\": \"n_motor_vehicles\",\n", + " \"Do_you_have_a_condition_or_han\": \"has_medical_condition\"\n", + " },\n", + " \"openpath_prod_mm_masscec\": {\n", + " # Same questions as Durham.\n", + " 
\"At_your_primary_job_do_you_ha\": \"is_primary_job_flexible\",\n", + " \"Which_best_describes_your_prim\": \"primary_job_description\",\n", + " \"Do_you_work_full_time_or_part_\": \"primary_job_type\",\n", + " \"Do_you_have_the_option_of_work\": \"primary_job_can_wfh\",\n", + " \"Please_describe_your_primary_job\": \"primary_job_description_2\",\n", + " \"Do_you_have_more_than_one_job\": \"has_multiple_jobs\",\n", + " # Two columns: how many days/week do you work & what days of the week do you work. \n", + " # the latter has only 4 NA values, the former has 45 NA values.\n", + " \"What_days_of_the_week_do_you_t\": \"wfh_days\",\n", + " \"How_many_days_do_you_usually_w_001\": \"n_wfh_days\",\n", + " # All these are NAs.\n", + " \"Which_one_below_describe_you_b\": \"description\",\n", + " \"What_is_your_race_ethnicity\": \"race_or_ethnicity\",\n", + " \"Are_you_a_student\": \"is_student\",\n", + " \"What_is_the_highest_grade_or_d\": \"highest_education\",\n", + " \"do_you_consider_yourself_to_be\": \"is_transgender\",\n", + " \"What_is_your_gender\": \"gender\",\n", + " \"How_old_are_you\": \"age\",\n", + " \"Are_you_a_paid_worker\": \"is_paid\",\n", + " \"Do_you_have_a_driver_license\": \"has_drivers_license\",\n", + " \"How_long_you_had_this_conditio\": \"medical_condition_duration\",\n", + " \"Including_yourself_how_many_w_001\": \"n_residents_u18\",\n", + " \"Including_yourself_how_many_p\": \"n_residence_members\",\n", + " \"Do_you_own_or_rent_your_home\": \"residence_ownership_type\",\n", + " \"Please_identify_which_category\": \"income_category\",\n", + " \"If_you_were_unable_to_use_your\": \"available_modes\",\n", + " \"Including_yourself_how_many_p_001\": \"n_residents_with_license\",\n", + " \"Including_yourself_how_many_w\": \"n_working_residents\",\n", + " \"What_is_your_home_type\": \"residence_type\",\n", + " \"How_many_motor_vehicles_are_ow\": \"n_motor_vehicles\",\n", + " \"Do_you_have_a_condition_or_han\": \"has_medical_condition\"\n", + " },\n", + 
" \"openpath_prod_ride2own\": {\n", + " # Same questions as Durham.\n", + " \"How_old_are_you\": \"age\",\n", + " \"What_is_your_gender\": \"gender\",\n", + " \"do_you_consider_yourself_to_be\": \"is_transgender\",\n", + " \"What_is_your_race_ethnicity\": \"race_or_ethnicity\",\n", + " \"Do_you_have_a_driver_license\": \"has_drivers_license\",\n", + " \"Are_you_a_student\": \"is_student\",\n", + " \"What_is_the_highest_grade_or_d\": \"highest_education\",\n", + " \"Are_you_a_paid_worker\": \"is_paid\",\n", + " \"Which_one_below_describe_you_b\": \"description\",\n", + " \"Do_you_own_or_rent_your_home\": \"residence_ownership_type\",\n", + " \"What_is_your_home_type\": \"residence_type\",\n", + " \"Please_identify_which_category\": \"income_category\",\n", + " \"Including_yourself_how_many_p\": \"n_residence_members\",\n", + " \"Including_yourself_how_many_w\": \"n_working_residents\",\n", + " \"Including_yourself_how_many_p_001\": \"n_residents_with_license\",\n", + " \"Including_yourself_how_many_w_001\": \"n_residents_u18\",\n", + " \"How_many_motor_vehicles_are_ow\": \"n_motor_vehicles\",\n", + " \"If_you_were_unable_to_use_your\": \"available_modes\",\n", + " \"Do_you_have_a_condition_or_han\": \"has_medical_condition\",\n", + " \"How_long_you_had_this_conditio\": \"medical_condition_duration\",\n", + " \"Do_you_have_more_than_one_job\": \"has_multiple_jobs\",\n", + " \"Do_you_work_full_time_or_part_\": \"primary_job_type\",\n", + " \"Which_best_describes_your_prim\": \"primary_job_description\",\n", + " \"Please_describe_your_primary_job\": \"primary_job_description_2\",\n", + " \"At_your_primary_job_do_you_ha\": \"is_primary_job_flexible\",\n", + " \"Do_you_have_the_option_of_work\": \"primary_job_can_wfh\",\n", + " \"How_many_days_do_you_usually_w_001\": \"n_wfh_days\",\n", + " \"What_days_of_the_week_do_you_t\": \"wfh_days\"\n", + " },\n", + " \"openpath_prod_uprm_nicr\": {\n", + " # Same as Durham!\n", + " \"At_your_primary_job_do_you_ha\": 
\"is_primary_job_flexible\",\n", + " \"Which_best_describes_your_prim\": \"primary_job_description\",\n", + " \"Do_you_work_full_time_or_part_\": \"primary_job_type\",\n", + " \"Do_you_have_the_option_of_work\": \"primary_job_can_wfh\",\n", + " \"Please_describe_your_primary_job\": \"primary_job_description_2\",\n", + " \"Do_you_have_more_than_one_job\": \"has_multiple_jobs\",\n", + " # Two columns: how many days/week do you work & what days of the week do you work. \n", + " # the latter has only 4 NA values, the former has 45 NA values.\n", + " \"What_days_of_the_week_do_you_t\": \"wfh_days\",\n", + " \"How_many_days_do_you_usually_w_001\": \"n_wfh_days\",\n", + " # All these are NAs.\n", + " \"Which_one_below_describe_you_b\": \"description\",\n", + " \"What_is_your_race_ethnicity\": \"race_or_ethnicity\",\n", + " \"Are_you_a_student\": \"is_student\",\n", + " \"What_is_the_highest_grade_or_d\": \"highest_education\",\n", + " \"do_you_consider_yourself_to_be\": \"is_transgender\",\n", + " \"What_is_your_gender\": \"gender\",\n", + " \"How_old_are_you\": \"age\",\n", + " \"Are_you_a_paid_worker\": \"is_paid\",\n", + " \"Do_you_have_a_driver_license\": \"has_drivers_license\",\n", + " \"How_long_you_had_this_conditio\": \"medical_condition_duration\",\n", + " \"Including_yourself_how_many_w_001\": \"n_residents_u18\",\n", + " \"Including_yourself_how_many_p\": \"n_residence_members\",\n", + " \"Do_you_own_or_rent_your_home\": \"residence_ownership_type\",\n", + " \"Please_identify_which_category\": \"income_category\",\n", + " \"If_you_were_unable_to_use_your\": \"available_modes\",\n", + " \"Including_yourself_how_many_p_001\": \"n_residents_with_license\",\n", + " \"Including_yourself_how_many_w\": \"n_working_residents\",\n", + " \"What_is_your_home_type\": \"residence_type\",\n", + " \"How_many_motor_vehicles_are_ow\": \"n_motor_vehicles\",\n", + " \"Do_you_have_a_condition_or_han\": \"has_medical_condition\"\n", + " }\n", + "}" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "69008893", + "metadata": {}, + "outputs": [], + "source": [ + "## Source: db_utils.py in op-admin-dashboard.\n", + "\n", + "BINARY_DEMOGRAPHICS_COLS = [\n", + " 'user_id',\n", + " '_id',\n", + "]\n", + "\n", + "EXCLUDED_DEMOGRAPHICS_COLS = [\n", + " 'data.xmlResponse', \n", + " 'data.name',\n", + " 'data.version',\n", + " 'data.label',\n", + " 'xmlns:jr',\n", + " 'xmlns:orx',\n", + " 'id',\n", + " 'start',\n", + " 'end',\n", + " 'attrxmlns:jr',\n", + " 'attrxmlns:orx',\n", + " 'attrid',\n", + " '__version__',\n", + " 'attrversion',\n", + " 'instanceID',\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12cc0c54", + "metadata": {}, + "outputs": [], + "source": [ + "## Source: scaffolding.py\n", + "\n", + "def expand_userinputs(labeled_ct):\n", + " '''\n", + " param: labeled_ct: a dataframe of confirmed trips, some of which have labels\n", + " params: labels_per_trip: the number of labels for each trip.\n", + " Currently, this is 2 for studies and 3 for programs, and should be \n", + " passed in by the notebook based on the input config.\n", + " If used with a trip-level survey, it could be even larger.\n", + " '''\n", + " # CASE 1 of https://github.com/e-mission/em-public-dashboard/issues/69#issuecomment-1256835867\n", + " if len(labeled_ct) == 0:\n", + " return labeled_ct\n", + " label_only = pd.DataFrame(labeled_ct.user_input.to_list(), index=labeled_ct.index)\n", + " # disp.display(label_only.head())\n", + " labels_per_trip = len(label_only.columns)\n", + " print(\"Found %s columns of length %d\" % (label_only.columns, labels_per_trip))\n", + " expanded_ct = pd.concat([labeled_ct, label_only], axis=1)\n", + " assert len(expanded_ct) == len(labeled_ct), \\\n", + " (\"Mismatch after expanding labels, expanded_ct.rows = %s != labeled_ct.rows %s\" %\n", + " (len(expanded_ct), len(labeled_ct)))\n", + " print(\"After expanding, columns went from %s -> %s\" %\n", + " (len(labeled_ct.columns), 
len(expanded_ct.columns)))\n", + " assert len(expanded_ct.columns) == len(labeled_ct.columns) + labels_per_trip, \\\n", + " (\"Mismatch after expanding labels, expanded_ct.columns = %s != labeled_ct.columns %s\" %\n", + " (len(expanded_ct.columns), len(labeled_ct.columns)))\n", + " # disp.display(expanded_ct.head())\n", + " return expanded_ct" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a98e2fb", + "metadata": {}, + "outputs": [], + "source": [ + "## Source: scaffolding.py\n", + "\n", + "def data_quality_check(expanded_ct):\n", + " '''1. Delete rows where the mode_confirm was pilot_ebike and replaced_mode was pilot_ebike.\n", + " 2. Delete rows where the mode_confirm was pilot_ebike and replaced_mode was same_mode.\n", + " 3. Replace same_mode with the mode_confirm for Energy Impact Calculation.'''\n", + "\n", + " # TODO: This is only really required for the initial data collection around the minipilot\n", + " # in subsequent deployments, we removed \"same mode\" and \"pilot_ebike\" from the options, so the\n", + " # dataset did not contain any of these data quality issues\n", + "\n", + " if 'replaced_mode' in expanded_ct.columns:\n", + " expanded_ct.drop(expanded_ct[(expanded_ct['mode_confirm'] == 'pilot_ebike') & (expanded_ct['replaced_mode'] == 'pilot_ebike')].index, inplace=True)\n", + " expanded_ct.drop(expanded_ct[(expanded_ct['mode_confirm'] == 'pilot_ebike') & (expanded_ct['replaced_mode'] == 'same_mode')].index, inplace=True)\n", + " expanded_ct['replaced_mode'] = np.where(expanded_ct['replaced_mode'] == 'same_mode',expanded_ct['mode_confirm'], expanded_ct['replaced_mode'])\n", + " \n", + " return expanded_ct" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe37bf27", + "metadata": {}, + "outputs": [], + "source": [ + "if CURRENT_DB != \"Stage_database\":\n", + "\n", + " ## Source: scaffolding.py\n", + "\n", + " uuid_df = pd.json_normalize(list(edb.get_uuid_db().find()))\n", + "\n", + " if not 
INCLUDE_TEST_USERS:\n", + " uuid_df = uuid_df.loc[~uuid_df.user_email.str.contains('_test_'), :]\n", + "\n", + " filtered = uuid_df.uuid.unique()\n", + "\n", + " agg = esta.TimeSeries.get_aggregate_time_series()\n", + " all_ct = agg.get_data_df(\"analysis/confirmed_trip\", None)\n", + "\n", + " print(f\"Before filtering, length={len(all_ct)}\")\n", + " participant_ct_df = all_ct.loc[all_ct.user_id.isin(filtered), :]\n", + " print(f\"After filtering, length={len(participant_ct_df)}\")\n", + "\n", + " expanded_ct = expand_userinputs(participant_ct_df)\n", + " expanded_ct = data_quality_check(expanded_ct)\n", + " print(expanded_ct.columns.tolist())\n", + " expanded_ct['replaced_mode'] = expanded_ct['replaced_mode'].fillna('Unknown')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13536d14", + "metadata": {}, + "outputs": [], + "source": [ + "# # Additional preprocessing for replaced mode (if any)\n", + "\n", + "if CURRENT_DB != \"Stage_database\":\n", + "\n", + " mode_counts = expanded_ct['replaced_mode'].value_counts()\n", + " drop_modes = mode_counts[mode_counts == 1].index.tolist()\n", + "\n", + " expanded_ct.drop(\n", + " index=expanded_ct.loc[expanded_ct.replaced_mode.isin(drop_modes)].index,\n", + " inplace=True\n", + " )\n", + "\n", + " # Additional modes to drop.\n", + " expanded_ct.drop(\n", + " index=expanded_ct.loc[expanded_ct.replaced_mode.isin(\n", + " # Remove all rows with air, boat, or weird answers.\n", + " ['houseboat', 'gondola', 'airline_flight', 'aircraft', 'zoo', 'air',\n", + " 'airplane', 'boat', 'flight', 'plane', 'meal', 'lunch']\n", + " )].index,\n", + " inplace=True\n", + " )\n", + " \n", + " expanded_ct.replaced_mode = expanded_ct.replaced_mode.apply(lambda x: REPLACED_MODE_DICT[CURRENT_DB][x])" + ] + }, + { + "cell_type": "markdown", + "id": "258844f4", + "metadata": {}, + "source": [ + "# Demographic pre-processing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7461a4d2", + "metadata": 
{}, + "outputs": [], + "source": [ + "# Demographics\n", + "\n", + "if CURRENT_DB != \"Stage_database\":\n", + "\n", + " decoded_uuids = [str(x) for x in filtered]\n", + "\n", + " ## Source: query_demographics() in op-admin-dashboard.\n", + " ts = esta.TimeSeries.get_aggregate_time_series()\n", + " entries = list(ts.find_entries([\"manual/demographic_survey\"]))\n", + "\n", + " available_key = {}\n", + " for entry in entries:\n", + " survey_key = list(entry['data']['jsonDocResponse'].keys())[0]\n", + " if survey_key not in available_key:\n", + " available_key[survey_key] = []\n", + "\n", + " # Minor modification: Added user_id check to filter users.\n", + " if str(entry['user_id']) in decoded_uuids:\n", + " available_key[survey_key].append(entry)\n", + "\n", + " dataframes = {}\n", + " for key, json_object in available_key.items():\n", + " df = pd.json_normalize(json_object)\n", + " dataframes[key] = df\n", + "\n", + " for key, df in dataframes.items():\n", + " if not df.empty:\n", + " for col in BINARY_DEMOGRAPHICS_COLS:\n", + " if col in df.columns:\n", + " df[col] = df[col].apply(str) \n", + " columns_to_drop = [col for col in df.columns if col.startswith(\"metadata\")]\n", + " df.drop(columns= columns_to_drop, inplace=True) \n", + " df.columns=[col.rsplit('.',1)[-1] if col.startswith('data.jsonDocResponse.') else col for col in df.columns]\n", + " for col in EXCLUDED_DEMOGRAPHICS_COLS:\n", + " if col in df.columns:\n", + " df.drop(columns= [col], inplace=True)\n", + "\n", + " survey_data = pd.DataFrame() \n", + " for v in dataframes.values():\n", + " survey_data = pd.concat([survey_data, v], axis=0, ignore_index=True)\n", + "else:\n", + " # Read the demographics.\n", + " survey_data = pd.read_csv('./viz_scripts/Can Do Colorado eBike Program - en.csv')\n", + " survey_data.rename(columns={'Unique User ID (auto-filled, do not edit)': 'user_id'}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe5a9dff", + "metadata": {}, + 
"outputs": [], + "source": [ + "if CURRENT_DB == \"Stage_database\":\n", + " \n", + " if os.path.exists('./data/cached_allceo_data.csv'):\n", + " \n", + " # Replace current instance of dataframe with the cached dataframe.\n", + " expanded_ct = pd.read_csv('./data/cached_allceo_data.csv')\n", + " expanded_ct.loc[expanded_ct.replaced_mode == 'no_travel', 'replaced_mode'] = 'no_trip'\n", + " else:\n", + " ## NOTE: Run this cell only if the cached CSV is not already available. It will take a LOT of time.\n", + " ## Benchmark timing: ~12 hours on a MacBook Pro (2017 model) with pandarallel, 4 workers.\n", + " \n", + " importlib.reload(scaffolding)\n", + " expanded_ct = scaffolding.get_section_durations(expanded_ct)\n", + " expanded_ct.to_csv('./data/cached_allceo_data.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6be751e", + "metadata": {}, + "outputs": [], + "source": [ + "print(len(survey_data.user_id.unique()), len(expanded_ct.user_id.unique()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ebc87d8", + "metadata": {}, + "outputs": [], + "source": [ + "survey_data.rename(SURVEY_DATA_DICT[CURRENT_DB], axis='columns', inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "522b1362", + "metadata": {}, + "source": [ + "### Demographic data preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "336508c2", + "metadata": {}, + "outputs": [], + "source": [ + "print(survey_data.columns.tolist())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29bc7996", + "metadata": {}, + "outputs": [], + "source": [ + "# gtg\n", + "survey_data['ft_job'] = survey_data.primary_job_type.apply(\n", + " lambda x: 1 if str(x).lower() == 'full_time' else 0\n", + ")\n", + "\n", + "# gtg\n", + "survey_data['multiple_jobs'] = survey_data.has_multiple_jobs.apply(\n", + " lambda x: 1 if str(x).lower() == 'yes' else 0\n", + ")\n", + "\n", + "# gtg\n", + 
"survey_data.loc[\n", + " survey_data.n_motor_vehicles.isin(\n", + " ['prefer_not_to_say', 'Prefer not to say / Prefiero no decir.']\n", + " ), 'n_motor_vehicles'\n", + "] = 0\n", + "survey_data.loc[survey_data.n_motor_vehicles.isin(['more_than_3', '4+', 'more_than_4']), 'n_motor_vehicles'] = 4\n", + "survey_data.n_motor_vehicles = survey_data.n_motor_vehicles.astype(int)\n", + "\n", + "# gtg\n", + "survey_data.has_drivers_license = survey_data.has_drivers_license.apply(\n", + " lambda x: 1 if str(x).lower() == 'yes' else 0\n", + ")\n", + "\n", + "survey_data.loc[survey_data.n_residents_u18 == 'prefer_not_to_say'] = 0\n", + "survey_data.n_residents_u18 = survey_data.n_residents_u18.astype(int)\n", + "\n", + "survey_data.loc[survey_data.n_residence_members == 'prefer_not_to_say'] = 0\n", + "survey_data.n_residence_members = survey_data.n_residence_members.astype(int)\n", + "\n", + "survey_data.loc[survey_data.n_residents_with_license == 'prefer_not_to_say'] = 0\n", + "survey_data.loc[survey_data.n_residents_with_license == 'more_than_4'] = 4\n", + "survey_data.n_residents_with_license = survey_data.n_residents_with_license.astype(int)\n", + "\n", + "# In allCEO, we see 50 & 9999. 
What??\n", + "survey_data = survey_data[\n", + " (survey_data.n_residence_members < 10) & (survey_data.n_residents_u18 < 10) & \n", + " (survey_data.n_residents_with_license < 10) & \n", + " (survey_data.n_residence_members - survey_data.n_residents_with_license > 0) &\n", + " (survey_data.n_residence_members - survey_data.n_residents_u18 > 0)\n", + "].reset_index(drop=True)\n", + "\n", + "# gtg\n", + "if CURRENT_DB != \"Stage_database\":\n", + " survey_data.n_working_residents = survey_data.n_working_residents.apply(\n", + " lambda x: 0 if x == 'prefer_not_to_say' else int(x)\n", + " )\n", + "else:\n", + " survey_data['n_working_residents'] = survey_data['n_residence_members'] - survey_data['n_residents_u18']\n", + " \n", + "survey_data = survey_data[survey_data.n_working_residents >= 0].reset_index(drop=True)\n", + "\n", + "# gtg\n", + "survey_data.is_paid = survey_data.is_paid.apply(lambda x: 1 if x == 'Yes' else 0)\n", + "\n", + "# gtg\n", + "survey_data.has_medical_condition = survey_data.has_medical_condition.apply(\n", + " lambda x: 1 if str(x).lower() == 'yes' else 0\n", + ")\n", + "\n", + "## gtg\n", + "survey_data.is_student.replace({\n", + " 'Not a student': 0, \n", + " 'Yes - Full Time College/University': 1,\n", + " 'Yes - Vocation/Technical/Trade School': 1,\n", + " 'Yes - K-12th Grade including GED': 1, \n", + " 'Work': 0, \n", + " 'No': 0,\n", + " 'Prefer not to say': 0,\n", + " 'Yes - Part-Time College/University': 1,\n", + " 'Taking prerequisites missing for grad program ': 1, \n", + " 'Graduate': 1,\n", + " 'Custodian': 0, \n", + " 'Work at csu': 0,\n", + " 'not_a_student': 0, \n", + " 'yes___vocation_technical_trade_school': 1,\n", + " 'yes___part_time_college_university': 1,\n", + " 'prefer_not_to_say': 0, \n", + " 'yes___k_12th_grade_including_ged': 1,\n", + " 'yes___full_time_college_university': 1\n", + "}, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "aeb85637", + "metadata": {}, + "source": [ + "### Additional Demographic 
Data Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c069bd2", + "metadata": {}, + "outputs": [], + "source": [ + "if CURRENT_DB == \"Stage_database\":\n", + " age = survey_data.birth_year.apply(\n", + " lambda x: 2024 - int(x) if int(x) > 100 else int(x)\n", + " )\n", + " \n", + " upper = age - (age % 5)\n", + " lower = upper + 5\n", + " new_col = (upper + 1).astype(str) + '___' + lower.astype(str) + '_years_old'\n", + " survey_data['age'] = new_col\n", + " \n", + " survey_data.loc[survey_data.age.isin([\n", + " '66___70_years_old', '76___80_years_old', '81___85_years_old'\n", + " ]), 'age'] = '__65_years_old'\n", + " \n", + " survey_data.drop(columns=['birth_year'], inplace=True)\n", + "\n", + "else:\n", + " survey_data = survey_data[survey_data.age != 0].reset_index(drop=True)\n", + "\n", + "if survey_data.columns.isin(['primary_job_commute_mode', 'primary_job_commute_time']).all():\n", + " survey_data.drop(columns=['primary_job_commute_mode', 'primary_job_commute_time'], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f094cadd", + "metadata": {}, + "outputs": [], + "source": [ + "def normalize_job_descriptions(db_name, df):\n", + " if db_name != 'Stage_database':\n", + " PRIMARY_JOB_DESCRIPTION_DICT = {\n", + " \"sales_or_service\": \"Sales or service\",\n", + " \"other\": \"Other\",\n", + " \"\": \"Other\",\n", + " \"professional__managerial__or_technical\": \"Professional, Manegerial, or Technical\",\n", + " \"manufacturing__construction__maintenance\": \"Manufacturing, construction, maintenance, or farming\",\n", + " \"clerical_or_administrative_support\": \"Clerical or administrative support\",\n", + " \"prefer_not_to_say\": \"Prefer not to say\",\n", + " }\n", + " \n", + " df.primary_job_description = df.primary_job_description.apply(\n", + " lambda x: PRIMARY_JOB_DESCRIPTION_DICT[x]\n", + " )\n", + " else:\n", + " df.primary_job_description = 
df.primary_job_description.str.strip()\n", + "\n", + " # Normalize the job description. Inspired from the 'e-bike trips by occupation' \n", + " # plot in the CanBikeCo full pilot paper.\n", + " df.loc[\n", + " df.primary_job_description.isin([\n", + " 'Paraprofessional', 'Education', 'education/early childhood', 'Teacher',\n", + " 'Education non-profit manager', 'Scientific research', 'Research',\n", + " 'Preschool Tracher'\n", + " ]), 'primary_job_description'\n", + " ] = 'Education'\n", + "\n", + " df.loc[\n", + " df.primary_job_description.isin([\n", + " 'Custodian', 'Custodial', 'Csu custodian', 'Janitorial',\n", + " 'Custodial Maintanace'\n", + " ]), 'primary_job_description'\n", + " ] = 'Custodial'\n", + "\n", + " df.loc[\n", + " df.primary_job_description.isin([\n", + " 'Inbound cs', 'Accounting Technician', \n", + " 'Clerical'\n", + " ]), 'primary_job_description'\n", + " ] = 'Clerical or administrative support'\n", + "\n", + " df.loc[\n", + " df.primary_job_description.isin([\n", + " 'Restaurant manager', 'Transportaion Services',\n", + " ]), 'primary_job_description'\n", + " ] = 'Sales or service'\n", + "\n", + " df.loc[\n", + " df.primary_job_description.isin([\n", + " 'Pastry chef and line cook', 'Cook', 'Chef', 'Dining Services',\n", + " 'Food Service', 'Cooking', 'Residential Dining Services', 'Line Cook'\n", + " ]), 'primary_job_description'\n", + " ] = 'Food service'\n", + "\n", + " df.loc[\n", + " df.primary_job_description.isin([\n", + " 'CNA', 'Caregiver/ Qmap', 'Health care', 'Nurse',\n", + " 'Healthcare', 'Medical', 'Medical field',\n", + " 'Family support'\n", + " ]), 'primary_job_description'\n", + " ] = 'Medical/healthcare'\n", + "\n", + " df.loc[\n", + " df.primary_job_description.isin([\n", + " 'Amazon', 'Hockey rink', 'Caregiver', 'Security', 'Nonprofit social work',\n", + " 'Therapeutic', 'Driver'\n", + " ]), 'primary_job_description'\n", + " ] = 'Other'\n", + "\n", + " df.loc[\n", + " df.primary_job_description.isin([\n", + " 'Hospital 
# Ordinal encoding of the household-income survey answers, keyed first by
# data source ('Stage_database' vs. every other program) and then by the raw
# answer string. 0 is reserved for "prefer not to say".
#
# Each answer appears exactly once: a repeated key in a dict literal silently
# keeps only the last value. The $150k-$199k bracket is encoded as 5 for both
# sources so the ordinal scale means the same thing regardless of the source.
INCOME_DICT = {
    'Stage_database': {
        'Prefer not to say': 0,
        'Less than $24,999': 1,
        '$25,000-$49,999': 2,
        '$50,000-$99,999': 3,
        '$100,000 -$149,999': 4,
        '$150,000-$199,999': 5,
        # Free-text variant of the same bracket.
        '$150,000': 5,
        '$200,000 or more': 6,
    },
    'Others': {
        'prefer_not_to_say': 0,
        'less_than__24_999': 1,
        '_25_000_to__49_999': 2,
        '_50_000_to__99_999': 3,
        '_100_000_to__149_999': 4,
        '_150_000_to__199_999': 5,
    },
}
ohe.transform(df[[feature_name]]).todense(), \n", + " columns=ohe.get_feature_names_out(),\n", + " index=df.index\n", + " ), ohe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc8d1846", + "metadata": {}, + "outputs": [], + "source": [ + "survey_data.reset_index(drop=True, inplace=True)\n", + "\n", + "ohe_features = ['highest_education', 'primary_job_description', 'gender', 'age']\n", + "\n", + "for ohe in ohe_features:\n", + " df, _ = generate_ohe_features(survey_data, ohe)\n", + " survey_data = survey_data.merge(right=df, left_index=True, right_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2d6f8c1", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "to_drop = [\n", + " 'Timestamp', 'gender', 'highest_education', 'primary_job_type', 'primary_job_description', \n", + " 'primary_job_commute_mode', 'primary_job_commute_time', 'is_primary_job_flexible', \n", + " 'primary_job_can_wfh', 'wfh_days', 'Which one below describe you best?', 'residence_ownership_type', \n", + " 'residence_type', 'medical_condition_duration', 'has_multiple_jobs', 'age', '_id', 'data.ts',\n", + " 'primary_job_description_2', 'wfh_days', 'n_wfh_days', 'description', 'race_or_ethnicity', \n", + " 'highest_education', 'is_transgender', 'medical_condition_duration'\n", + "]\n", + "\n", + "for column in to_drop:\n", + " if column in survey_data.columns:\n", + " survey_data.drop(columns=[column], inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "65039f73", + "metadata": {}, + "source": [ + "## Merge sensed data and demographics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7eb2e09", + "metadata": {}, + "outputs": [], + "source": [ + "# Additional preprocessing to filter unwanted users from sensed trips data.\n", + "expanded_ct['user_id_join'] = expanded_ct['user_id'].apply(lambda x: str(x).replace('-', ''))\n", + "survey_data['user_id_join'] = 
# Compute the section_*_argmax.

def compute_argmax(db: str, row):
    """Attach the mode, distance and duration of a trip's longest section.

    The "longest" section is the one with the greatest distance. Three keys
    are always written to ``row``: ``section_mode_argmax``,
    ``section_distance_argmax`` and ``section_duration_argmax``. When the
    section data is missing, empty or malformed, all three are set to NaN so
    that callers can filter on them.

    Args:
        db: Name of the data source. ``'Stage_database'`` rows carry
            string-encoded lists in ``section_distances``,
            ``section_durations`` and ``section_modes``; every other source
            carries a dict in ``inferred_section_summary`` with per-mode
            ``'distance'`` and ``'duration'`` sub-dicts.
        row: A mutable, key-addressable row (the notebook passes the Series
            produced by ``DataFrame.apply(..., axis=1)``).

    Returns:
        The same ``row`` object, updated in place.

    Raises:
        KeyError: for non-Stage sources, if ``row`` has no
            ``inferred_section_summary`` entry at all.
    """
    missing = (np.nan, np.nan, np.nan)

    if db != 'Stage_database':
        sections = row['inferred_section_summary']

        try:
            by_distance = sections['distance']
            # First mode with the greatest distance wins ties.
            mode = max(by_distance.items(), key=lambda item: item[-1])[0]
            picked = (mode, by_distance[mode], sections['duration'][mode])
        except Exception:
            # Per-row best effort: a NaN/None/empty/malformed summary marks
            # the row as unusable instead of aborting the whole apply().
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            picked = missing
    else:
        try:
            distances = ast.literal_eval(row['section_distances'])
            durations = ast.literal_eval(row['section_durations'])
            modes = ast.literal_eval(row['section_modes'])

            longest = np.argmax(distances)
            picked = (modes[longest], distances[longest], durations[longest])
        except Exception:
            # Same best-effort policy as above (unparseable strings, empty
            # or mismatched lists, missing columns).
            picked = missing

    (
        row['section_mode_argmax'],
        row['section_distance_argmax'],
        row['section_duration_argmax'],
    ) = picked

    return row
E.g.: user does not select car as available mode\n", + "# but the sensed mode is car.\n", + "section_mode_mapping = {\n", + " 'bicycling': ['p_micro', 's_micro'],\n", + " 'car': ['s_car', 'car', 'ridehail'],\n", + " 'no_sensed': ['unknown'],\n", + " 'walking': ['walk'],\n", + " 'unknown': ['unknown'],\n", + " 'transit': ['transit']\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62960039", + "metadata": {}, + "outputs": [], + "source": [ + "filtered_trips = filtered_trips.apply(lambda x: compute_argmax(CURRENT_DB, x), axis=1)\n", + "\n", + "# Drop all rows where argmax mode == air\n", + "filtered_trips.drop(\n", + " index=filtered_trips.loc[filtered_trips.section_mode_argmax.isin(['AIR_OR_HSR', 'air_or_hsr']),:].index, \n", + " inplace=True\n", + ")\n", + "\n", + "filtered_trips.section_mode_argmax.replace({\n", + " 'subway': 'transit',\n", + " 'no_sensed': 'unknown',\n", + " 'train': 'transit',\n", + " 'TRAM': 'transit',\n", + " 'LIGHT_RAIL': 'transit',\n", + " 'CAR': 'car',\n", + " 'WALKING': 'walking',\n", + " 'BICYCLING': 'bicycling',\n", + " 'UNKNOWN': 'unknown',\n", + " 'TRAIN': 'transit',\n", + " 'SUBWAY': 'transit',\n", + " 'BUS': 'transit',\n", + " 'bus': 'transit'\n", + "}, inplace=True)\n", + "\n", + "filtered_trips.dropna(subset='section_mode_argmax', inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8583a709", + "metadata": {}, + "outputs": [], + "source": [ + "## Meters -> miles\n", + "filtered_trips['section_distance_argmax'] *= 0.000621371\n", + "\n", + "## Seconds -> minutes\n", + "filtered_trips['section_duration_argmax'] /= 60.\n", + "\n", + "## Total distance and duration are scaled too.\n", + "filtered_trips['distance'] *= 0.000621371\n", + "filtered_trips['duration'] /= 60." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e4d05eb", + "metadata": {}, + "outputs": [], + "source": [ + "filtered_trips = filtered_trips.merge(right=filtered_survey, left_on='user_id_join', right_on='user_id_join')" + ] + }, + { + "cell_type": "markdown", + "id": "383fe251", + "metadata": {}, + "source": [ + "## Update available indicators" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee097233", + "metadata": {}, + "outputs": [], + "source": [ + "import itertools\n", + "\n", + "new_cols = list(set(available.values()))\n", + "filtered_trips[new_cols] = 0\n", + "\n", + "for user_id, user_trips in filtered_trips.groupby('user_id'):\n", + " \n", + " if CURRENT_DB == \"Stage_database\":\n", + " \n", + " # Get the set of available modes (demographics.)\n", + " all_av_modes = user_trips['available_modes'].str.split(';').explode()\n", + " else:\n", + " # Get the set of available modes (demographics.)\n", + " all_av_modes = user_trips['available_modes'].str.split().explode()\n", + " \n", + " # Get all sensed modes.\n", + " all_sections = user_trips['section_mode_argmax'].unique()\n", + " \n", + " # Map to Common Normal Form.\n", + " mapped_sections = set(list(itertools.chain.from_iterable([section_mode_mapping[x] for x in all_sections])))\n", + " mapped_demo_av = set([available[x] for x in all_av_modes.unique()])\n", + " \n", + " # Perform a set union.\n", + " combined = list(mapped_sections.union(mapped_demo_av))\n", + " \n", + " # Update dummy indicators.\n", + " filtered_trips.loc[filtered_trips.user_id == user_id, combined] = 1\n", + "\n", + "filtered_trips.rename(columns=dict([(c, 'av_'+c) for c in new_cols]), inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "38bfcc0c", + "metadata": {}, + "source": [ + "### Cost estimation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "054a6ad1", + "metadata": {}, + "outputs": [], + "source": [ + "# All values are taken from VTPI.\n", + "# 
https://www.vtpi.org/tca/tca0501.pdf\n", + "mode_cost_per_mile = {\n", + " # bicycle/skateboard\n", + " 'p_micro': 0.,\n", + " 'no_trip': 0.,\n", + " # Shared car is half the cost of regular car, which is $0.6/mile.\n", + " 's_car': 0.3,\n", + " # Rental car.\n", + " 'car': 0.6,\n", + " # Average of bus and train taken.\n", + " 'transit': 0.5,\n", + " # Shared bicyle or scooter - values taken from https://nacto.org/shared-micromobility-2020-2021/ and \n", + " # https://www.mckinsey.com/industries/automotive-and-assembly/our-insights/how-sharing-the-road-is-likely-to-transform-american-mobility\n", + " 's_micro': 0.3,\n", + " # uber/taxi/lyft\n", + " 'ridehail': 2.,\n", + " 'walk': 0.,\n", + " 'unknown': 0.\n", + "}\n", + "\n", + "# Assumptions.\n", + "mode_init_cost = {\n", + " 'p_micro': 0.,\n", + " 'no_trip': 0.,\n", + " # Shared car is half the cost of regular car, which is $0.6/mile.\n", + " 's_car': 0.,\n", + " # Rental car.\n", + " 'car': 0.,\n", + " # Average of bus and train taken.\n", + " 'transit': 0.,\n", + " # $1 unlocking cost.\n", + " 's_micro': 1.,\n", + " # uber/taxi/lyft\n", + " 'ridehail': 1.5,\n", + " 'walk': 0.,\n", + " 'unknown': 0.\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bccd3efb", + "metadata": {}, + "outputs": [], + "source": [ + "def compute_cost_estimates(df: pd.DataFrame):\n", + " \n", + " # Create some extra colums.\n", + " columns = [c.replace('av_', '') for c in df.columns if 'av_' in c]\n", + "\n", + " # Initialize the columns to 0.\n", + " df[columns] = 0.\n", + "\n", + " rows = list()\n", + "\n", + " # Iterate over every row.\n", + " for _, row in df.iterrows():\n", + " # Check which flags are active.\n", + " row_dict = row.to_dict()\n", + "\n", + " # Access the section_distance_argmax attribute for the distance. 
# Drop instances where duration/distance is unusable.
# The filtered frame is assigned back to `filtered_trips`: calling
# drop(..., inplace=False) and then reset_index(inplace=True) on its result
# only modifies a temporary copy and leaves `filtered_trips` untouched.
# The mask is negated (instead of testing "> 0") so that rows with NaN
# distance/duration are kept, exactly as a drop of the "<= 0" rows would.
unusable = (
    (filtered_trips.section_distance_argmax <= 0)
    | (filtered_trips.section_duration_argmax <= 0)
)
filtered_trips = filtered_trips.loc[~unusable, :].reset_index(drop=True)
def filter_mph(df: pd.DataFrame, low=0.1, high=0.9) -> pd.DataFrame:
    """Remove trips whose speed is implausible for their sensed mode.

    Modes listed in ``MPH_THRESHOLDS`` keep only rows at or below the fixed
    speed cap. Every other mode keeps rows whose ``mph`` lies between that
    mode's ``low`` and ``high`` quantiles (inclusive).

    Groups are iterated explicitly rather than through ``groupby().apply``.
    That way the grouping column ``section_mode_argmax`` is always part of
    the result, independent of how the installed pandas version treats
    grouping columns inside ``apply``.

    Args:
        df: Trips with at least ``section_mode_argmax`` and ``mph`` columns.
        low: Lower quantile (0-1) for modes without a fixed cap.
        high: Upper quantile (0-1) for modes without a fixed cap.

    Returns:
        A new frame with the surviving rows, ordered by mode (sorted) and
        then by original row order, with a fresh 0..n-1 index. ``df`` itself
        is not modified.
    """
    MPH_THRESHOLDS = {
        # https://www.sciencedirect.com/science/article/pii/S2210670718304682
        'bicycling': 15.,
        # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7806575/
        'walking': 2.93
    }

    survivors = []
    for mode, group in df.groupby('section_mode_argmax'):
        speed = group['mph']
        if mode in MPH_THRESHOLDS:
            # Drop data specified in the dict manually.
            keep = speed <= MPH_THRESHOLDS[mode]
        else:
            keep = (speed >= speed.quantile(low)) & (speed <= speed.quantile(high))
        survivors.append(group[keep])

    if not survivors:
        # No groups at all (empty input): return an empty frame with the
        # same columns instead of letting pd.concat fail on an empty list.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(survivors).reset_index(drop=True)
'survey_user_id', 'section_distances',\n", + " 'data.local_dt.year', 'data.local_dt.month', 'data.local_dt.day', 'data.local_dt.hour', \n", + " 'data.local_dt.minute', 'data.local_dt.second', 'data.local_dt.weekday', 'data.local_dt.timezone',\n", + " 'data.fmt_time'\n", + "]\n", + "\n", + "for col in to_drop:\n", + " if col in filtered_trips.columns:\n", + " filtered_trips.drop(columns=[col], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2937d4ef", + "metadata": {}, + "outputs": [], + "source": [ + "filtered_trips.rename({'start_local_dt_hour': 'start:hour', 'end_local_dt_hour': 'end:hour'}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87c7fc92", + "metadata": {}, + "outputs": [], + "source": [ + "print(filtered_trips.columns.tolist())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ea36cad", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "display(filtered_trips.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7018bf4", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Done processing for {CURRENT_DB=}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0eacc539", + "metadata": {}, + "outputs": [], + "source": [ + "targets = ['p_micro', 'no_trip', 's_car', 'transit', 'car', 's_micro', 'ridehail', 'walk', 'unknown']\n", + "\n", + "# Rename and map targets.\n", + "filtered_trips.rename(columns={'replaced_mode': 'target'}, inplace=True)\n", + "filtered_trips.replace({'target': {t: ix+1 for ix, t in enumerate(targets)}}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50d3eaec", + "metadata": {}, + "outputs": [], + "source": [ + "display(filtered_trips.target.unique())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31f35a04", + "metadata": {}, + "outputs": [], + "source": [ + "savepath = 
Path('./data/filtered_data')\n", + "\n", + "if not savepath.exists():\n", + " savepath.mkdir()\n", + "\n", + "filtered_trips.to_csv(savepath / f'preprocessed_data_{CURRENT_DB}.csv', index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "emission", + "language": "python", + "name": "emission" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/replacement_mode_modeling/02_run_trip_level_models.py b/replacement_mode_modeling/02_run_trip_level_models.py new file mode 100644 index 0000000..3976ee1 --- /dev/null +++ b/replacement_mode_modeling/02_run_trip_level_models.py @@ -0,0 +1,491 @@ +from enum import Enum +import random +import warnings +import argparse +from pathlib import Path +from collections import Counter + +# Math and graphing. +import pandas as pd +import numpy as np +import seaborn as sns +import matplotlib.pyplot as plt + +# sklearn imports. +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LinearRegression +from sklearn.metrics import f1_score, r2_score, ConfusionMatrixDisplay +from scipy.special import kl_div +from sklearn.metrics import classification_report +from sklearn.model_selection import GridSearchCV, StratifiedGroupKFold +from pprint import pprint +from sklearn.inspection import permutation_importance +from time import perf_counter +from sklearn.ensemble import RandomForestClassifier + +warnings.simplefilter(action='ignore', category=Warning) + +# Global experiment flags and variables. 
def get_train_test_splits(data: pd.DataFrame, how=SPLIT_TYPE, test_ratio=0.2, shuffle=True):
    """Split ``data`` into train and test frames according to ``how``.

    Args:
        data: Trip-level frame. Depending on the strategy it must contain
            ``user_id``, ``target`` and/or ``section_mode_argmax``.
        how: A ``SPLIT_TYPE`` member selecting the strategy:
            INTER_USER keeps each user entirely on one side, stratified by
            target; INTRA_USER stratifies by user; TARGET stratifies by
            target; MODE stratifies by ``section_mode_argmax``; HIDE_USER
            holds out three users and splits the rest randomly.
        test_ratio: Desired share of rows in the test set.
        shuffle: Whether rows are shuffled before splitting.

    Returns:
        ``(train, test, held_out)``. ``held_out`` is ``None`` except for
        HIDE_USER, where it holds all rows of the three withheld users.

    Raises:
        ValueError: for HIDE_USER when one of the three user bins is empty.
        NotImplementedError: if ``how`` is not a known ``SPLIT_TYPE`` member.
    """

    if how == SPLIT_TYPE.INTER_USER:

        X = data.drop(columns=['target'])
        y = data['target'].values
        groups = data.user_id.values

        # One fold out of n is used as the test set, so n = 1 / test_ratio
        # (0.2 -> 5 folds, i.e. roughly 20% of the data per fold).
        n_splits = max(2, int(round(1.0 / test_ratio)))

        # scikit-learn rejects a random_state when shuffle is False, so only
        # pass the seed when it is actually used.
        splitter = StratifiedGroupKFold(
            n_splits=n_splits, shuffle=shuffle, random_state=SEED if shuffle else None
        )

        # Only the first fold is needed.
        train_index, test_index = next(splitter.split(X, y, groups))

        return data.iloc[train_index, :], data.iloc[test_index, :], None

    elif how == SPLIT_TYPE.INTRA_USER:

        # Users with a single observation cannot be stratified. As per the
        # mobilitynet modeling pipeline, they are randomly assigned to
        # either the training or the test set.
        value_counts = data.user_id.value_counts()
        single_count_ids = value_counts[value_counts == 1].index
        is_single = data.user_id.isin(single_count_ids)

        data_filtered = data.loc[~is_single, :].reset_index(drop=True)
        data_single_counts = data.loc[is_single, :].reset_index(drop=True)

        X_tr, X_te = train_test_split(
            data_filtered, test_size=test_ratio, shuffle=shuffle, stratify=data_filtered.user_id,
            random_state=SEED
        )

        data_single_counts['assigned'] = np.random.choice(['train', 'test'], len(data_single_counts))

        def _with_singles(base, side):
            # Append the single-observation users assigned to `side`.
            extra = data_single_counts.loc[data_single_counts.assigned == side, :].drop(
                columns=['assigned'], inplace=False
            )
            return pd.concat([base, extra], ignore_index=True, axis=0)

        return _with_singles(X_tr, 'train'), _with_singles(X_te, 'test'), None

    elif how == SPLIT_TYPE.TARGET:

        X_tr, X_te = train_test_split(
            data, test_size=test_ratio, shuffle=shuffle, stratify=data.target,
            random_state=SEED
        )

        return X_tr, X_te, None

    elif how == SPLIT_TYPE.MODE:
        X_tr, X_te = train_test_split(
            data, test_size=test_ratio, shuffle=shuffle, stratify=data.section_mode_argmax,
            random_state=SEED
        )

        return X_tr, X_te, None

    elif how == SPLIT_TYPE.HIDE_USER:
        # Share of all trips contributed by each user.
        users = data.user_id.value_counts(normalize=True)
        percentiles = users.quantile([0.25, 0.5, 0.75])
        p25, p50, p75 = percentiles[0.25], percentiles[0.5], percentiles[0.75]

        # Half-open bins: a user sitting exactly on a percentile boundary
        # belongs to one bin only, so the three draws below can never return
        # the same user twice.
        low_trip_users = users[users <= p25].index
        mid_trip_users = users[(p25 < users) & (users <= p50)].index
        high_trip_users = users[(p50 < users) & (users <= p75)].index

        for label, candidates in (
            ('low', low_trip_users), ('mid', mid_trip_users), ('high', high_trip_users)
        ):
            if len(candidates) == 0:
                raise ValueError(f"No users in the {label}-trip bin; cannot hold out a user.")

        # select one from each randomly.
        user1 = np.random.choice(low_trip_users)
        user2 = np.random.choice(mid_trip_users)
        user3 = np.random.choice(high_trip_users)

        print(f"Users picked: {user1}, {user2}, {user3}")

        # Remove these users from the entire dataset.
        is_hidden = data.user_id.isin([user1, user2, user3])
        held_out = data.loc[is_hidden, :].reset_index(drop=True)
        remaining = data.loc[~is_hidden, :].reset_index(drop=True)

        # Split randomly.
        X_tr, X_te = train_test_split(
            remaining, test_size=test_ratio, shuffle=shuffle, random_state=SEED
        )

        return X_tr, X_te, held_out

    raise NotImplementedError("Unknown split type")
def drop_columns(df: pd.DataFrame):
    """Return a copy of ``df`` without the columns that are not model features.

    Removes ``section_mode_argmax``, ``available_modes`` and ``user_id``.
    The input frame is left untouched; a ``KeyError`` is raised by pandas if
    any of the three columns is absent.
    """
    non_feature_columns = ('section_mode_argmax', 'available_modes', 'user_id')
    return df.drop(columns=list(non_feature_columns))
def train(X_tr, Y_tr):
    """Fit the random-forest classifier used for replaced-mode prediction.

    When the module-level ``CV`` flag is truthy, a 3-fold stratified grid
    search scored by weighted F1 selects the hyper-parameters and its best
    estimator is returned. Otherwise a single forest with fixed
    hyper-parameters is fitted.

    Args:
        X_tr: Training features.
        Y_tr: Training labels.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    if not CV:
        return RandomForestClassifier(
            n_estimators=150,
            max_depth=None,
            min_samples_leaf=2,
            bootstrap=True,
            class_weight='balanced_subsample',
            random_state=SEED,
            n_jobs=-1
        ).fit(X_tr, Y_tr)

    model = RandomForestClassifier(random_state=SEED)

    # Bootstrapped trees; max_features is searched over "all" and "sqrt".
    param_set2 = {
        'n_estimators': [150, 200, 250],
        'min_samples_split': [2, 3, 4],
        'min_samples_leaf': [1, 2, 3],
        'class_weight': ['balanced_subsample'],
        'max_features': [None, 'sqrt'],
        'bootstrap': [True]
    }

    cv_set2 = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
    clf_set2 = GridSearchCV(model, param_set2, cv=cv_set2, n_jobs=-1, scoring='f1_weighted', verbose=1)

    start = perf_counter()
    clf_set2.fit(X_tr, Y_tr)
    time_req = (perf_counter() - start) / 60.

    # The elapsed time used to be computed and then thrown away.
    print(f"\t-> Grid search completed in {time_req:.2f} minutes")

    return clf_set2.best_estimator_
def run_sampled_sweep(df: pd.DataFrame, dir_name: Path, **kwargs):
    """Run one benchmark sweep and write its artefacts into ``dir_name``.

    The data is split, per-mode duration estimates are added, a classifier is
    trained and evaluated, and the following files are written:
    ``f1_scores.txt``, ``feature_importance.csv``,
    ``classification_report.txt`` and ``test_confusion_matrix.png``.

    Args:
        df: Pre-processed trip-level data including ``target``.
        dir_name: Existing directory that receives the output files.
        **kwargs: Only ``split`` (a ``SPLIT_TYPE`` member) is read.

    Returns:
        None. If the split cannot be built the error is printed and the sweep
        is skipped.
    """
    split = kwargs.pop('split', None)

    try:
        train_data, test_data, hidden_data = get_train_test_splits(data=df, how=split, shuffle=True)
    except Exception as e:
        # Deliberate best-effort: one failing split must not abort the other sweeps.
        print(e)
        return

    params, train_data = get_duration_estimate(train_data, SPLIT.TRAIN, None)
    _, test_data = get_duration_estimate(test_data, SPLIT.TEST, params)

    train_data = drop_columns(train_data)
    test_data = drop_columns(test_data)

    X_tr, Y_tr = train_data.drop(columns=['target'], inplace=False), train_data.target.values.ravel()
    X_te, Y_te = test_data.drop(columns=['target'], inplace=False), test_data.target.values.ravel()

    model = train(X_tr, Y_tr)
    _, tr_f1, te_preds, te_f1 = predict(model, X_tr, Y_tr, X_te, Y_te)

    print(f"\t-> Train F1: {tr_f1}, Test F1: {te_f1}")

    with open(dir_name / 'f1_scores.txt', 'w', encoding='utf-8') as f:
        f.write(f"Train F1: {tr_f1}\nTest F1: {te_f1}")

    # Most important feature first.
    importance = sorted(
        zip(model.feature_names_in_, model.feature_importances_),
        key=lambda x: x[-1],
        reverse=True
    )
    importance_df = pd.DataFrame(importance, columns=['feature_name', 'importance'])
    importance_df.to_csv(dir_name / 'feature_importance.csv', index=False)

    with open(dir_name / 'classification_report.txt', 'w', encoding='utf-8') as f:
        f.write(classification_report(y_true=Y_te, y_pred=te_preds))

    if split == SPLIT_TYPE.HIDE_USER and hidden_data is not None:
        _, hidden_data = get_duration_estimate(hidden_data, SPLIT.TEST, params)
        hidden_data = drop_columns(hidden_data)

        X_hid, Y_hid = hidden_data.drop(columns=['target'], inplace=False), hidden_data.target.values.ravel()

        # Only the held-out score is kept, so the test-split results above are
        # not overwritten.
        _, _, _, hidden_f1 = predict(model, X_tr, Y_tr, X_hid, Y_hid)
        print(f"\t\t ---> Hidden user F1: {hidden_f1} <---")

    fig, ax = plt.subplots(figsize=(7, 7))
    try:
        ConfusionMatrixDisplay.from_estimator(
            model,
            X=X_te,
            y=Y_te,
            ax=ax
        )
        fig.tight_layout()
        fig.savefig(dir_name / 'test_confusion_matrix.png')
    finally:
        # Close this sweep's figure even if plotting or saving fails.
        plt.close(fig)
+ data.sort_values(by=['user_id'], ascending=True, inplace=True) + data = data[sorted(data.columns.tolist())] + + print("Beginning sweeps.") + + # args = parse_args() + sweep_number = 1 + + root = Path('./outputs/benchmark_results') + if not root.exists(): + root.mkdir() + + for split in [SPLIT_TYPE.INTER_USER, SPLIT_TYPE.INTRA_USER, SPLIT_TYPE.TARGET, SPLIT_TYPE.MODE, SPLIT_TYPE.HIDE_USER]: + kwargs = { + 'dataset': name, + 'split': split + } + + dir_name = root / f'benchmark_{name}_{sweep_number}' + + if not dir_name.exists(): + dir_name.mkdir() + + print(f"\t-> Running sweep #{sweep_number} with metadata={str(kwargs)}") + save_metadata(dir_name, **kwargs) + run_sampled_sweep(data.copy(), dir_name, **kwargs) + print(f"Completed benchmarking for {sweep_number} experiment.") + print(50*'-') + sweep_number += 1 + + elapsed = perf_counter() - start + + print(f"Completed sweeps in {elapsed/60.} minutes") \ No newline at end of file diff --git a/replacement_mode_modeling/03_user_level_models.ipynb b/replacement_mode_modeling/03_user_level_models.ipynb new file mode 100644 index 0000000..da06468 --- /dev/null +++ b/replacement_mode_modeling/03_user_level_models.ipynb @@ -0,0 +1,1120 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "04ccf092", + "metadata": {}, + "source": [ + "## Some important points to remember:\n", + "\n", + "### We want to experiment with two types of models:\n", + "\n", + "\n", + "1. have one row per user, so that when predicting modes for a new user, we pick the \"similar user\" or users and determine the replaced mode\n", + " - In this, the traditional approach would only use demographics for the user features, we may experiment with some summaries of the trip data that will function as some level of \"fingerprint\" for the user. 
Ideally we would be able to show that this performs better than demographics alone\n", + " - Note also that the original method that you had outlined where the training set is a list of trips (O()) is a third approach which we will be comparing these two against" + ] + }, + { + "cell_type": "markdown", + "id": "c0c1ee88", + "metadata": {}, + "source": [ + "Target order:\n", + "\n", + "```\n", + "['p_micro', 'no_trip', 's_car', 'transit', 'car', 's_micro', 'ridehail', 'walk', 'unknown']\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21ef0f2e", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import random\n", + "import os\n", + "import pickle\n", + "import ast\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import r2_score, f1_score, log_loss\n", + "from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, KFold\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances\n", + "from enum import Enum\n", + "from scipy.stats import uniform\n", + "from typing import List, Dict, Union\n", + "from pandas.api.types import is_numeric_dtype\n", + "from sklearn.manifold import TSNE\n", + "from multiprocessing import cpu_count\n", + "\n", + "pd.set_option('display.max_columns', 100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fef98692", + "metadata": {}, + "outputs": [], + "source": [ + "SEED = 13210\n", + "\n", + "np.random.seed(SEED)\n", + "random.seed(SEED)\n", + "\n", + "SimilarityMetric = Enum('SimilarityMetric', ['COSINE', 'EUCLIDEAN', 'KNN', 'KMEANS'])\n", + "GroupType = Enum('GroupType', ['GROUPBY', 'CUT'])" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "79f8c51a", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('./data/filtered_data/preprocessed_data_Stage_database.csv')\n", + "# df = pd.read_csv('./data/filtered_data/preprocessed_data_openpath_prod_durham.csv')\n", + "# df = pd.read_csv('./data/filtered_data/preprocessed_data_openpath_prod_mm_masscec.csv')\n", + "# df = pd.read_csv('./data/filtered_data/preprocessed_data_openpath_prod_ride2own.csv')\n", + "# df = pd.read_csv('./data/filtered_data/preprocessed_data_openpath_prod_uprm_nicr.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "915e9d6f", + "metadata": {}, + "outputs": [], + "source": [ + "df.groupby('user_id')['target'].apply(lambda x: x.value_counts().idxmax()).unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72793473", + "metadata": {}, + "outputs": [], + "source": [ + "print(df.columns.tolist())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "765f08ff", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_tsne_plots(df: pd.DataFrame, **kwargs):\n", + " \n", + " df = df.copy()\n", + " \n", + " # Important - if not cast as a category, seaborn considers this as a numerical value.\n", + " df.target = df.target.astype('category')\n", + " \n", + " # print(\"Unique targets: \", df.target.unique())\n", + " \n", + " # According to the docs, > consider choosing a perplexity between 5 and 50.\n", + " tsne = TSNE(\n", + " n_components=2,\n", + " perplexity=kwargs.pop('perplexity', 5),\n", + " n_iter=kwargs.pop('n_iter', 2000),\n", + " metric=kwargs.pop('metric', 'cosine'),\n", + " random_state=SEED,\n", + " n_jobs=os.cpu_count()\n", + " )\n", + " \n", + " if df.index.name == 'user_id':\n", + " df.reset_index(drop=False, inplace=True)\n", + " \n", + " if 'user_id' in df.columns:\n", + " df.drop(columns=['user_id'], inplace=True)\n", + " \n", + " targets = df.target.values\n", + " 
def get_mode_coverage(df: pd.DataFrame):
    """Return, for every user, the share of trips made with each sensed mode.

    Args:
        df: Trip-level frame with ``user_id`` and ``section_mode_argmax``.

    Returns:
        A frame indexed by ``user_id`` with one ``coverage_<mode>`` column per
        mode; each row sums to 1.
    """
    counts = df.groupby(['user_id', 'section_mode_argmax']).size().unstack(fill_value=0)
    counts.columns = ['coverage_' + str(c) for c in counts.columns]

    # user_id is the index after unstack(), so every column is a mode count.
    # Normalising only iloc[:, 1:] left the first mode as a raw count and
    # divided the remaining modes by a total that excluded it.
    coverage_df = counts.div(counts.sum(axis=1), axis=0)

    # As a preventative measure (e.g. a 0/0 row).
    return coverage_df.fillna(0)
with percentiles - 0 is an actual value.\n", + " median = grouper.median().unstack(level=-1, fill_value=-1.)\n", + " median.columns = [f'{trip_feature}_median_' + str(c) for c in median.columns]\n", + " \n", + " iqr_df = grouper.quantile([0.25, 0.75]).unstack(level=-1)\n", + " iqr = (iqr_df[0.75] - iqr_df[0.25]).unstack(level=-1)\n", + " iqr.fillna(-1., inplace=True)\n", + " iqr.columns = [f'{trip_feature}_iqr_' + str(c) for c in iqr.columns]\n", + "\n", + " # Now merge.\n", + " merged = mean.copy()\n", + " merged = merged.merge(right=median, left_index=True, right_index=True)\n", + " merged = merged.merge(right=iqr, left_index=True, right_index=True)\n", + " \n", + " merged.fillna(-1., inplace=True)\n", + "\n", + " return merged\n", + " \n", + " # 0 is OK to indicate NaN values.\n", + " f_mode = grouper.apply(\n", + " lambda x: x.value_counts().idxmax()\n", + " ).unstack(fill_value=0.)\n", + " \n", + " f_mode.columns = [f'{trip_feature}_mode_' + str(c) for c in f_mode.columns]\n", + " f_mode.fillna(0., inplace=True)\n", + " \n", + " return f_mode\n", + " \n", + " assert group_key not in feature_list, \"Cannot perform grouping and summarization of the same feature.\"\n", + " \n", + " # Optional kwarg for number of cuts for numeric dtype grouping.\n", + " # Default is 3: short, medium, long trip types:\n", + " # For e.g., if the group key is 'section_duration', it will be cut into three equally-sized bins,\n", + " # However, an alternative is also present - we could use qcut() instead, which would ensure that\n", + " # each bin has roughly the same number of samples.\n", + " n_cuts = kwargs.pop('n_cuts', 3)\n", + " use_qcut = kwargs.pop('use_qcut', False)\n", + " \n", + " # This will be the dataframe that all subsequent features will join to.\n", + " feature_df = None\n", + " \n", + " for ix, feature in enumerate(feature_list):\n", + " is_ordinal = feature == 'start_local_dt_hour' or feature == 'end_local_dt_hour'\n", + " if ix == 0:\n", + " feature_df = 
def get_demographic_data(df: pd.DataFrame, **trip_kwargs):
    '''
    Build a one-row-per-user feature matrix.

    Without ``trip_features`` the result is indexed by ``user_id`` and holds the
    demographic and ``av_`` availability columns plus ``target`` (the user's most
    frequent replaced mode).

    With a non-empty ``trip_features`` list the result holds the trip summaries,
    the mode coverage, the mean ``duration``/``distance``, the ``av_`` columns and
    ``target``; ``user_id`` is then returned as a regular column.

    Keyword arguments:
        trip_features: list of trip columns to summarise (default None).
        trip_grouping: column the summaries are grouped by
            (default 'section_mode_argmax').
        use_qcut: forwarded to get_trip_summaries (default False).
        n_cuts: forwarded to get_trip_summaries (default 3).
    '''

    trip_features_to_use = trip_kwargs.pop('trip_features', None)
    trip_group_key = trip_kwargs.pop('trip_grouping', 'section_mode_argmax')

    demographics = [
        'has_drivers_license', 'is_student', 'is_paid', 'income_category', 'n_residence_members',
        'n_residents_u18', 'n_residents_with_license', 'n_motor_vehicles',
        'has_medical_condition', 'ft_job', 'multiple_jobs', 'n_working_residents',
        "highest_education_Bachelor's degree", 'highest_education_Graduate degree or professional degree',
        'highest_education_High school graduate or GED', 'highest_education_Less than a high school graduate',
        'highest_education_Prefer not to say', 'highest_education_Some college or associates degree',
        'primary_job_description_Clerical or administrative support', 'primary_job_description_Custodial',
        'primary_job_description_Education', 'primary_job_description_Food service',
        'primary_job_description_Linecook',
        'primary_job_description_Manufacturing, construction, maintenance, or farming',
        'primary_job_description_Medical/healthcare', 'primary_job_description_Non-profit program manager',
        'primary_job_description_Other', 'primary_job_description_Professional, managerial, or technical',
        'primary_job_description_Sales or service', 'primary_job_description_Self employed',
        'primary_job_description_food service', 'gender_Man', 'gender_Nonbinary/genderqueer/genderfluid',
        'gender_Prefer not to say', 'gender_Woman', 'gender_Woman;Nonbinary/genderqueer/genderfluid',
        'age_16___20_years_old', 'age_21___25_years_old', 'age_26___30_years_old', 'age_31___35_years_old',
        'age_36___40_years_old', 'age_41___45_years_old', 'age_46___50_years_old', 'age_51___55_years_old',
        'age_56___60_years_old', 'age_61___65_years_old', 'age___65_years_old', 'av_transit', 'av_no_trip',
        'av_p_micro', 'av_s_micro', 'av_ridehail', 'av_unknown', 'av_walk', 'av_car', 'av_s_car',
    ]

    # Per-user label: the replaced mode the user reported most often.
    targets = df.groupby('user_id')['target'].apply(lambda x: x.value_counts().idxmax())

    if trip_features_to_use is None or len(trip_features_to_use) == 0:
        # Retain only the first instance of each user and subset the columns.
        filtered = df.groupby('user_id').first()[demographics]
        return filtered.merge(right=targets, left_index=True, right_index=True)

    # -----------------------------------------------------------
    # Reaching here means that we need to include trip summaries
    # -----------------------------------------------------------
    # NOTE(review): the demographic columns are not joined in this branch, only
    # the av_ indicators are - confirm this is intended for "Experiment 2".

    # Re-use the preprocessed availability features.
    availability = df[['user_id'] + [c for c in df.columns if 'av_' in c]]
    availability = availability.groupby('user_id').first()

    # For every user, generate the global trip-level summaries.
    global_aggs = df.groupby('user_id').agg({'duration': 'mean', 'distance': 'mean'})

    coverage = get_mode_coverage(df)

    trip_features = get_trip_summaries(
        df=df,
        group_key=trip_group_key,
        feature_list=trip_features_to_use,
        n_cuts=trip_kwargs.pop('n_cuts', 3),
        use_qcut=trip_kwargs.pop('use_qcut', False)
    )

    # Every part is indexed by user_id, so all joins are plain index joins.
    for part in (coverage, global_aggs, availability, targets):
        trip_features = trip_features.merge(right=part, left_index=True, right_index=True)

    return trip_features.reset_index(drop=False)
def evaluate_using_similarity(test_df, train_df, metric=SimilarityMetric.COSINE, **metric_kwargs):
    '''
    Predict each test user's replaced mode from similar training users.

    Every user row is treated as a 'fingerprint' (embedding vector) and the test
    labels are used for scoring only. Depending on `metric`, the prediction is
    the target of the most cosine-similar training user, of the training user at
    the smallest L2 distance, of a distance-weighted KNN vote, or the most
    frequent training target in the K-Means cluster the test user falls into.

    Optional keyword arguments: `n_neighbors` and `knn_metric` (KNN),
    `n_clusters` and `max_iter` (KMEANS).

    Prints and returns the weighted F1 score on the test users.
    Raises NotImplementedError for an unknown metric.
    '''

    # user_id and target are dropped by name; the rest is the embedding.
    tr_targets = train_df.target.values
    tr = train_df.drop(columns=['target', 'user_id'], inplace=False).reset_index(drop=True, inplace=False)

    te_targets = test_df.target.values
    te = test_df.drop(columns=['target', 'user_id'], inplace=False).reset_index(drop=True, inplace=False)

    if metric == SimilarityMetric.COSINE:
        # (n_te, n_tr) similarity matrix; take the closest training user per row.
        sim = cosine_similarity(te.values, tr.values)
        y_test_pred = tr_targets[np.argmax(sim, axis=1)]

    elif metric == SimilarityMetric.EUCLIDEAN:
        # Here, we choose the embedding with the smallest L2 distance.
        distances = euclidean_distances(te.values, tr.values)
        y_test_pred = tr_targets[np.argmin(distances, axis=1)]

    elif metric == SimilarityMetric.KNN:
        # Build the KNN classifier. By default, let it be 3.
        knn = KNeighborsClassifier(
            n_neighbors=metric_kwargs.pop('n_neighbors', 3),
            weights='distance',
            metric=metric_kwargs.pop('knn_metric', 'cosine'),
            n_jobs=os.cpu_count()
        )
        knn.fit(tr, tr_targets)
        y_test_pred = knn.predict(te)

    elif metric == SimilarityMetric.KMEANS:
        kmeans = KMeans(
            n_clusters=metric_kwargs.pop('n_clusters', 8),
            max_iter=metric_kwargs.pop('max_iter', 300),
            n_init='auto',
            random_state=SEED
        )
        kmeans.fit(tr)

        # Most frequent training target per cluster, computed once per cluster.
        label_df = pd.DataFrame({'label': kmeans.labels_, 'target': tr_targets}, index=tr.index)
        cluster_target = label_df.groupby('label')['target'].agg(lambda s: s.value_counts().idxmax())

        y_test_pred = [cluster_target[label] for label in kmeans.predict(te)]

    else:
        raise NotImplementedError("Unknown similarity metric")

    # zero_division matches the trip-level script and avoids the undefined-metric warning.
    f1 = f1_score(y_true=te_targets, y_pred=y_test_pred, average='weighted', zero_division=0.)
    print(f"Test F1 score using {metric.name} = {f1}")

    return f1
def custom_nll_scorer(clf, X, y):
    '''
    Scorer for hyper-parameter search: the negative log loss of `clf` on (X, y),
    so that a larger value means a better model.
    '''

    # [[yp1, yp2, yp3, ...], [yp1, yp3, ...]] - one column per entry of clf.classes_.
    y_pred = clf.predict_proba(X)

    # The labels must describe the probability columns. Deriving them from `y`
    # breaks when a validation fold does not contain every class.
    return -log_loss(y_true=y, y_pred=y_pred, labels=clf.classes_)
\n", + "# clf = RandomizedSearchCV(\n", + "# rf, params, n_iter=n_iter, scoring=custom_nll_scorer, \n", + "# n_jobs=os.cpu_count(), cv=cv, random_state=SEED,\n", + "# verbose=0\n", + "# )\n", + " \n", + " clf = RandomizedSearchCV(\n", + " rf, params, n_iter=n_iter, scoring='f1_weighted', \n", + " n_jobs=cpu_count(), cv=cv, random_state=SEED,\n", + " verbose=0\n", + " )\n", + " \n", + " X_tr = train.drop(columns=['user_id', 'target'])\n", + " y_tr = train.target.values.ravel()\n", + " \n", + " scorer = clf.fit(X_tr, y_tr)\n", + " \n", + " best_model = scorer.best_estimator_\n", + " \n", + " print(f\"Best val score = {scorer.best_score_}\")\n", + " \n", + " X_te = test.drop(columns=['user_id', 'target'])\n", + " \n", + " # Use the best model to compute F1 on the test set.\n", + " test_f1 = f1_score(y_true=test.target.values, y_pred=best_model.predict(X_te), average='weighted')\n", + " \n", + " print(f\"Test F1 = {test_f1}\")\n", + " \n", + " return best_model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fab93ed", + "metadata": {}, + "outputs": [], + "source": [ + "model = estimate_using_model(train, test)" + ] + }, + { + "cell_type": "markdown", + "id": "2988c1b2", + "metadata": {}, + "source": [ + "Interesting! The model is slightly on par with K-Means!" + ] + }, + { + "cell_type": "markdown", + "id": "c6b77353", + "metadata": {}, + "source": [ + "## Experiment 2: Demographics with trip summaries" + ] + }, + { + "cell_type": "markdown", + "id": "bf7753d4", + "metadata": {}, + "source": [ + "Now that we've performed experiments with solely demographic data, let's expand the feature set by including \n", + "trip summary statistics. We would like this approach to do better than the aforementioned baselines." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d46ab0f", + "metadata": {}, + "outputs": [], + "source": [ + "demo_plus_trips = get_demographic_data(\n", + " df, \n", + " trip_features=['mph', 'section_duration_argmax', 'section_distance_argmax', 'start_local_dt_hour', 'end_local_dt_hour']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11c1ea2c", + "metadata": {}, + "outputs": [], + "source": [ + "demo_plus_trips.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6159c90a", + "metadata": {}, + "outputs": [], + "source": [ + "train = demo_plus_trips.loc[demo_plus_trips.user_id.isin(TRAIN_USERS), :]\n", + "test = demo_plus_trips.loc[demo_plus_trips.user_id.isin(TEST_USERS), :]\n", + "\n", + "print(train.shape[0], test.shape[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06e85bdd", + "metadata": {}, + "outputs": [], + "source": [ + "for metric in [\n", + " SimilarityMetric.COSINE, SimilarityMetric.EUCLIDEAN, SimilarityMetric.KNN, SimilarityMetric.KMEANS\n", + "]:\n", + " evaluate_using_similarity(test, train, metric, n_clusters=4)" + ] + }, + { + "cell_type": "markdown", + "id": "ba795489", + "metadata": {}, + "source": [ + "Great! Some improvement here and there.\n", + "\n", + "$allCEO$\n", + "```\n", + "Test F1 score using COSINE = 0.32098765432098775\n", + "Test F1 score using EUCLIDEAN = 0.36684303350970027\n", + "Test F1 score using KNN = 0.41269841269841273\n", + "Test F1 score using KMEANS = 0.4877344877344878\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9acd4b0b", + "metadata": {}, + "outputs": [], + "source": [ + "# Now, we try with the model\n", + "estimate_using_model(train, test)" + ] + }, + { + "cell_type": "markdown", + "id": "cd94c548", + "metadata": {}, + "source": [ + "Great! Compared to the previous model, we see definite improvements! 
I'm sure we can squeeze some more juice out of the models using fancy optimization, but as a baseline, these are good enough.\n", + "\n", + "\n", + "So, to recap:\n", + "$F1_{cosine} = 0.37$, $F1_{euclidean} = 0.33$, $F1_{knn} = 0.3$, $F1_{kmeans} = 0.36$, $F1_{RF} = 0.4215$" + ] + }, + { + "cell_type": "markdown", + "id": "8a8f6491", + "metadata": {}, + "source": [ + "### Different groupings." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ce90367", + "metadata": {}, + "outputs": [], + "source": [ + "# trip_features = ['mph', 'section_duration_argmax', 'section_distance_argmax', 'start:hour', 'end:hour']\n", + "\n", + "# for group_mode in ['section_mode_argmax', 'section_distance_argmax', 'section_duration_argmax', 'duration', 'distance']:\n", + " \n", + "# if group_mode in trip_features:\n", + "# _ = trip_features.pop(trip_features.index(group_mode))\n", + " \n", + "# exp_df = get_demographic_data(\n", + "# df, \n", + "# trip_grouping=group_mode,\n", + "# trip_features=trip_features,\n", + "# use_qcut=True\n", + "# )\n", + " \n", + "# train, test = train_test_split(exp_df, test_size=0.2, random_state=SEED)\n", + " \n", + "# for sim in [\n", + "# SimilarityMetric.COSINE, SimilarityMetric.EUCLIDEAN, SimilarityMetric.KNN, SimilarityMetric.KMEANS\n", + "# ]:\n", + "# evaluate_using_similarity(test, train, sim, n_clusters=3)\n", + " \n", + "# # estimate_using_model(train, test, n_iter=200)\n", + " \n", + "# print(50*'=')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d53f945", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "_ = generate_tsne_plots(demo_plus_trips, perplexity=6, n_iter=7500)" + ] + }, + { + "cell_type": "markdown", + "id": "c339fcc6", + "metadata": {}, + "source": [ + "# Multi-level modeling" + ] + }, + { + "cell_type": "markdown", + "id": "213676ec", + "metadata": {}, + "source": [ + "In this approach, we want to piece together the similarity search and modeling processes. 
Here's a rough sketch of how it should be implemented:\n", + "\n", + "1. For every user in the training set, build a model using their entire trip history.\n", + "2. Consolidate these user-level models in a data structure, preferably a dictionary.\n", + "3. Now, when we want to perform inference on a new user with no prior trips, we use the similarity search to get the user ID in the training set who is the most similar to the user in question.\n", + "4. We retrieve the model for this corresponding user and perform an inference. The hypothesis is that since the two users are similar, their trip substitution patterns are also similar." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c48ee430", + "metadata": {}, + "outputs": [], + "source": [ + "def drop_columns(df: pd.DataFrame):\n", + " to_drop = [\n", + " 'source', 'end_ts', 'end_fmt_time', 'end_loc', 'raw_trip', 'start_ts', \n", + " 'start_fmt_time', 'start_loc', 'duration', 'distance', 'start_place', \n", + " 'end_place', 'cleaned_trip', 'inferred_labels', 'inferred_trip', 'expectation',\n", + " 'confidence_threshold', 'expected_trip', 'user_input', 'start:year', 'start:month', \n", + " 'start:day', 'start_local_dt_minute', 'start_local_dt_second', \n", + " 'start_local_dt_weekday', 'start_local_dt_timezone', 'end:year', 'end:month', 'end:day', \n", + " 'end_local_dt_minute', 'end_local_dt_second', 'end_local_dt_weekday', \n", + " 'end_local_dt_timezone', '_id', 'metadata_write_ts', 'additions', \n", + " 'mode_confirm', 'purpose_confirm', 'Mode_confirm', 'Trip_purpose', \n", + " 'original_user_id', 'program', 'opcode', 'Timestamp', 'birth_year', \n", + " 'available_modes', 'section_coordinates_argmax', 'section_mode_argmax'\n", + " ]\n", + " \n", + " # Drop section_mode_argmax and available_modes.\n", + " return df.drop(\n", + " columns=to_drop, \n", + " inplace=False\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca9e6e6a", + "metadata": {}, + "outputs": 
[], + "source": [ + "def construct_model_dictionary(train: pd.DataFrame):\n", + " \n", + " def train_on_user(user_id: str):\n", + " '''\n", + " Given the training set and the user ID to query, filter the dataset and\n", + " retain only the relevant trips. Then, create folds and optimize a model for this user.\n", + " Return the trained model instance.\n", + " '''\n", + " \n", + " user_data = train.loc[train.user_id == user_id, :].reset_index(drop=True)\n", + " \n", + " # Split user trips into train-test folds.\n", + " u_train, u_test = train_test_split(user_data, test_size=0.2, shuffle=True, random_state=SEED)\n", + " \n", + " user_model = estimate_using_model(\n", + " u_train, u_test, \n", + " n_iter=100\n", + " )\n", + " \n", + " return user_model\n", + " \n", + " for user in train.user_id.unique():\n", + " MODEL_DICT[user]['warm_start'] = train_on_user(user)\n", + " print(50*'=')\n", + " \n", + " print(\"\\nDone!\")" + ] + }, + { + "cell_type": "markdown", + "id": "2a035c16", + "metadata": {}, + "source": [ + "## Warm start:\n", + "\n", + "If the queried user has prior trips, we know that we can harness the additional information. So if we encounter such a user, we will first find the most similar user (using only demographics). Once the most similar user is found, we query the trip model for the user and run inference through it.\n", + "\n", + "## Cold start:\n", + "\n", + "If the queried user has no prior trips, we will use the demo-only model. We first perform a similarity search and then run user inference through the demo-only model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "082c4e39", + "metadata": {}, + "outputs": [], + "source": [ + "class MultiLevelModel:\n", + " def __init__(self, model_dict: Dict, train: pd.DataFrame, test: pd.DataFrame, **model_kwargs):\n", + " \n", + " self._demographics = [\n", + " 'primary_job_commute_time', 'income_category', 'n_residence_members', 'n_residents_u18', \n", + " 'n_residents_with_license', 'n_motor_vehicles', 'available_modes', 'age', 'gender_Man', \n", + " 'gender_Man;Nonbinary/genderqueer/genderfluid', 'gender_Nonbinary/genderqueer/genderfluid', \n", + " 'gender_Prefer not to say', 'gender_Woman', 'gender_Woman;Nonbinary/genderqueer/genderfluid', \n", + " 'has_drivers_license_No', 'has_drivers_license_Prefer not to say', 'has_drivers_license_Yes', \n", + " 'has_multiple_jobs_No', 'has_multiple_jobs_Prefer not to say', 'has_multiple_jobs_Yes', \n", + " \"highest_education_Bachelor's degree\", 'highest_education_Graduate degree or professional degree', \n", + " 'highest_education_High school graduate or GED', 'highest_education_Less than a high school graduate', \n", + " 'highest_education_Prefer not to say', 'highest_education_Some college or associates degree', \n", + " 'primary_job_type_Full-time', 'primary_job_type_Part-time', 'primary_job_type_Prefer not to say', \n", + " 'primary_job_description_Clerical or administrative support', 'primary_job_description_Custodial', \n", + " 'primary_job_description_Education', 'primary_job_description_Food service', \n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming', \n", + " 'primary_job_description_Medical/healthcare', 'primary_job_description_Other', \n", + " 'primary_job_description_Professional, managerial, or technical', \n", + " 'primary_job_description_Sales or service', 'primary_job_commute_mode_Active transport', \n", + " 'primary_job_commute_mode_Car transport', 'primary_job_commute_mode_Hybrid', \n", + " 
'primary_job_commute_mode_Public transport', 'primary_job_commute_mode_Unknown', \n", + " 'primary_job_commute_mode_WFH', 'is_overnight_trip', 'n_working_residents'\n", + " ]\n", + " \n", + " assert all([c in test.columns for c in self._demographics]), \"[test] Demographic features are missing!\"\n", + " assert all([c in train.columns for c in self._demographics]), \"[train] Demographic features are missing!\"\n", + " \n", + " self._mdict = model_dict\n", + " self._train = train\n", + " self._test = test\n", + " self.metric = model_kwargs.pop('metric', SimilarityMetric.COSINE)\n", + " \n", + " \n", + " def _phase1(self):\n", + " \n", + " tr = self._train.copy()\n", + " te = self._test.copy()\n", + " \n", + " if tr.columns.isin(['user_id', 'target']).sum() == 2:\n", + " tr = tr.drop(columns=['user_id', 'target']).reset_index(drop=True)\n", + " \n", + " if te.columns.isin(['user_id', 'target']).sum() == 2:\n", + " te = te.drop(columns=['user_id', 'target']).reset_index(drop=True)\n", + "\n", + " te_users = self._test.user_id.tolist()\n", + "\n", + " if self.metric == SimilarityMetric.COSINE:\n", + "\n", + " sim = cosine_similarity(te.values, tr.values)\n", + "\n", + " # Compute the argmax across the train set.\n", + " argmax = np.argmax(sim, axis=1)\n", + "\n", + " # Retrieve the user_id at these indices.\n", + " train_users = self._train.loc[argmax, 'user_id']\n", + "\n", + " elif self.metric == SimilarityMetric.EUCLIDEAN:\n", + "\n", + " sim = euclidean_distances(te.values, tr.values)\n", + "\n", + " # Compute the argmin here!\n", + " argmin = np.argmin(sim, axis=1)\n", + "\n", + " # Retrieve the train user_ids.\n", + " train_users = self._train.loc[argmin, 'user_id']\n", + "\n", + " return pd.DataFrame({'test_user_id': te_users, 'train_user_id': train_users})\n", + " \n", + " \n", + " def _phase2(self, sim_df: pd.DataFrame, cold_start: bool):\n", + " \n", + " prediction_df = list()\n", + " \n", + " # Now, we use the sim_df to run inference based on whether \n", + 
" for ix, row in sim_df.iterrows():\n", + " train_user = row['train_user_id']\n", + " \n", + " # Retrieve the appropriate model.\n", + " user_models = self._mdict.get(train_user, None)\n", + " \n", + " start_type = 'cold_start' if cold_start else 'warm_start'\n", + " \n", + " # which specific model?\n", + " sp_model = user_models.get(start_type, None)\n", + " \n", + " # Now get the test user data.\n", + " test_user = row['test_user_id']\n", + " \n", + " if cold_start:\n", + " test_data = self._test.loc[self._test.user_id == test_user, self._demographics]\n", + " test_data = test_data.iloc[0, :]\n", + " else:\n", + " test_data = self._test.loc[self._test.user_id == test_user, :]\n", + " \n", + " predictions = sp_model.predict(test_data)\n", + " \n", + " print(f\"test: [{test_user}], predictions: {predictions}\")\n", + " \n", + " \n", + " def execute_pipeline(self, cold_start: bool = False):\n", + " # For each test user, get the most similar train user.\n", + " sim_df = self._phase1()\n", + " \n", + " predictions = self._phase2(sim_df, cold_start)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb63632d", + "metadata": {}, + "outputs": [], + "source": [ + "# FULL DATA.\n", + "train = df.loc[df.user_id.isin(TRAIN_USERS), :]\n", + "test = df.loc[df.user_id.isin(TEST_USERS), :]\n", + "\n", + "train_counts = train.user_id.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2528eaa", + "metadata": {}, + "outputs": [], + "source": [ + "## We only want to train on users who have a good number of trips.\n", + "good_users = train_counts[train_counts >= 100].index\n", + "\n", + "bad_users = train_counts[train_counts < 100].index\n", + "\n", + "print(f\"Number of users filtered out of training: {len(bad_users)}\")\n", + "\n", + "filtered_train = train.loc[train.user_id.isin(good_users), :]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bae55b21", + "metadata": {}, + "outputs": [], + 
"source": [ + "# Full data.\n", + "\n", + "train_df = drop_columns(filtered_train)\n", + "test_df = drop_columns(test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88d0e2d2", + "metadata": {}, + "outputs": [], + "source": [ + "print(train_df.shape, test_df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37febd6d", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "model_dict = construct_model_dictionary(train_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1249925", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "emission", + "language": "python", + "name": "emission" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/replacement_mode_modeling/04_FeatureClustering.ipynb b/replacement_mode_modeling/04_FeatureClustering.ipynb new file mode 100644 index 0000000..094d84c --- /dev/null +++ b/replacement_mode_modeling/04_FeatureClustering.ipynb @@ -0,0 +1,1108 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "789df947", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import random\n", + "import os\n", + "import itertools\n", + "import pickle\n", + "import ast\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.colors as mcolors\n", + "import seaborn as sns\n", + "\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances\n", + "from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score\n", + "from sklearn.preprocessing 
import MinMaxScaler, StandardScaler\n", + "from typing import List, Dict, Union\n", + "from pandas.api.types import is_numeric_dtype\n", + "from sklearn.cluster import DBSCAN, KMeans\n", + "from collections import Counter\n", + "\n", + "pd.set_option('display.max_columns', None)\n", + "\n", + "%matplotlib inline\n", + "\n", + "SEED = 13210\n", + "\n", + "np.random.seed(SEED)\n", + "random.seed(SEED)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aea4dda7", + "metadata": {}, + "outputs": [], + "source": [ + "DATA_SOURCES = [\n", + " ('./data/filtered_data/preprocessed_data_Stage_database.csv', 'allceo'),\n", + " ('./data/filtered_data/preprocessed_data_openpath_prod_durham.csv', 'durham'),\n", + " ('./data/filtered_data/preprocessed_data_openpath_prod_ride2own.csv', 'ride2own'),\n", + " ('./data/filtered_data/preprocessed_data_openpath_prod_mm_masscec.csv', 'masscec'),\n", + " ('./data/filtered_data/preprocessed_data_openpath_prod_uprm_nicr.csv', 'nicr')\n", + "]\n", + "\n", + "# Switch between 0-4\n", + "DB_NUMBER = 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33ef3275", + "metadata": {}, + "outputs": [], + "source": [ + "# Change this name to something unique\n", + "CURRENT_DB = DATA_SOURCES[DB_NUMBER][1]\n", + "PATH = DATA_SOURCES[DB_NUMBER][0]\n", + "\n", + "df = pd.read_csv(PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0d884a3", + "metadata": {}, + "outputs": [], + "source": [ + "df.target.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2281bdc", + "metadata": {}, + "outputs": [], + "source": [ + "df.rename(\n", + " columns={'end_local_dt_hour': 'end:hour', 'start_local_dt_hour': 'start:hour', 'replaced_mode': 'target'}, \n", + " inplace=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c22d6ac", + "metadata": {}, + "outputs": [], + "source": [ + "TARGETS = ['p_micro', 'no_trip', 's_car', 
'transit', 'car', 's_micro', 'ridehail', 'walk', 'unknown']\n", + "MAP = {ix+1: t for (ix, t) in enumerate(TARGETS)}\n", + "TARGET_MAP = {v:k for k, v in MAP.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "063f6124", + "metadata": {}, + "outputs": [], + "source": [ + "df.replace({'target': TARGET_MAP}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cef8d45b", + "metadata": {}, + "outputs": [], + "source": [ + "# % of trips per mode.\n", + "trip_percents = df.groupby(['user_id'])['section_mode_argmax'].apply(lambda x: x.value_counts(normalize=True)).unstack(level=-1)\n", + "trip_percents.fillna(0., inplace=True)\n", + "\n", + "trip_percents.columns = ['coverage_'+x for x in trip_percents.columns]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68c6af2d", + "metadata": {}, + "outputs": [], + "source": [ + "n_trips = pd.DataFrame(df.groupby('user_id').size(), columns=['n_trips'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eff378a7", + "metadata": {}, + "outputs": [], + "source": [ + "most_common_start = df.groupby('user_id')['start:hour'].apply(lambda x: x.value_counts().idxmax())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cffbd401", + "metadata": {}, + "outputs": [], + "source": [ + "most_common_end = df.groupby('user_id')['end:hour'].apply(lambda x: x.value_counts().idxmax())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1eb1633", + "metadata": {}, + "outputs": [], + "source": [ + "# % of distance in each primary sensed mode.\n", + "total_distance = df.groupby(['user_id', 'section_mode_argmax'])['section_distance_argmax'].sum().unstack(level=-1)\n", + "total_distance = total_distance.div(total_distance.sum(axis=1), axis=0)\n", + "total_distance.fillna(0., inplace=True)\n", + "total_distance.columns = ['pct_distance_' + x for x in total_distance.columns]" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "d9cc0a0f", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "figure1_df = trip_percents.merge(right=total_distance, left_index=True, right_index=True).merge(\n", + " right=n_trips, left_index=True, right_index=True\n", + ").merge(\n", + " right=most_common_start, left_index=True, right_index=True\n", + ").merge(right=most_common_end, left_index=True, right_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "750fbd0c", + "metadata": {}, + "outputs": [], + "source": [ + "# Normalize the last three columns.\n", + "\n", + "def min_max_normalize(col: pd.Series):\n", + " _max, _min = col.max(), col.min()\n", + " return pd.Series((col - _min)/(_max - _min))\n", + "\n", + "figure1_df['n_trips'] = min_max_normalize(figure1_df['n_trips'])\n", + "figure1_df['start:hour'] = np.sin(figure1_df['start:hour'].values)\n", + "figure1_df['end:hour'] = np.sin(figure1_df['end:hour'].values)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c3d1849", + "metadata": {}, + "outputs": [], + "source": [ + "figure1_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "598d82bc", + "metadata": {}, + "outputs": [], + "source": [ + "epsilons = np.linspace(1e-3, 1., 1000)\n", + "\n", + "best_eps = -np.inf\n", + "best_score = -np.inf\n", + "\n", + "for eps in epsilons:\n", + " model = DBSCAN(eps=eps).fit(figure1_df)\n", + " \n", + " if len(np.unique(model.labels_)) < 2:\n", + " continue\n", + " \n", + " score = silhouette_score(figure1_df, model.labels_)\n", + " if score > best_score:\n", + " best_eps = eps\n", + " best_score = score\n", + "\n", + "print(best_eps)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc89a42d", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "AlLCEO: eps=0.542\n", + "durham: eps=0.661\n", + "masscec: eps=0.64\n", + "'''\n", + "\n", + "clustering = 
DBSCAN(eps=0.8).fit(figure1_df)\n", + "\n", + "print(Counter(clustering.labels_))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05c9a7c4", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# After clustering, we would like to see what the replaced mode argmax distribution in each cluster is.\n", + "\n", + "labels = clustering.labels_\n", + "\n", + "for cix in np.unique(labels):\n", + " cluster_users = figure1_df.iloc[labels == cix,:].index\n", + " \n", + " print(f\"{len(cluster_users)} users in cluster {cix}\")\n", + " \n", + " # Now, for each user, look at the actual data and determine the replaced mode argmax distribution.\n", + " sub_df = df.loc[df.user_id.isin(cluster_users), :].reset_index(drop=True)\n", + " \n", + " sub_df['target'] = sub_df['target'].apply(lambda x: MAP[x])\n", + " \n", + " rm_argmax = sub_df.groupby('user_id')['target'].apply(lambda x: x.value_counts().idxmax())\n", + " fig, ax = plt.subplots()\n", + " rm_argmax.hist(ax=ax)\n", + " ax.set_title(f\"Replaced mode argmax distribution for users in cluster {cix}\")\n", + " ax.set_xlabel(\"Target\")\n", + " \n", + " plt.savefig(f'./outputs/{CURRENT_DB}__FIG1_cluster_{cix}_target_dist.png', dpi=300)\n", + " \n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2e8e117", + "metadata": {}, + "outputs": [], + "source": [ + "user_target_pct = pd.DataFrame()\n", + "\n", + "# For every user, compute the replaced mode distribution.\n", + "for user_id, user_data in df.groupby('user_id'):\n", + " \n", + " target_distribution = user_data['target'].value_counts(normalize=True)\n", + " target_distribution.rename(index=MAP, inplace=True)\n", + " user_target_pct = pd.concat([user_target_pct, target_distribution.to_frame(user_id).T])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99369dba", + "metadata": {}, + "outputs": [], + "source": [ + "user_target_pct.columns = ['pct_trips_' + str(x) for x 
in user_target_pct.columns]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6cca3671", + "metadata": {}, + "outputs": [], + "source": [ + "target_distance = pd.DataFrame()\n", + "\n", + "# For every user, compute the replaced mode distribution.\n", + "for user_id, user_data in df.groupby('user_id'):\n", + " \n", + " # total_distance = user_data['distance'].sum()\n", + " distance_per_target = user_data.groupby('target')['section_distance_argmax'].sum()\n", + " distance_per_target.rename(index=MAP, inplace=True)\n", + " row = distance_per_target.to_frame(user_id).T\n", + " target_distance = pd.concat([target_distance, row])\n", + " \n", + "target_distance.columns = ['distance_' + str(x) for x in target_distance.columns]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18093734", + "metadata": {}, + "outputs": [], + "source": [ + "target_duration = df.groupby(['user_id', 'target'])['section_duration_argmax'].sum().unstack()\n", + "target_duration.rename(columns=MAP, inplace=True)\n", + "target_duration.fillna(0., inplace=True)\n", + "target_duration.columns = ['duration_' + str(x) for x in target_duration.columns]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8001a140", + "metadata": {}, + "outputs": [], + "source": [ + "target_df = user_target_pct.merge(right=target_distance, left_index=True, right_index=True).merge(\n", + " right=target_duration, left_index=True, right_index=True\n", + ")\n", + "\n", + "target_df.fillna(0., inplace=True)\n", + "\n", + "target_df = pd.DataFrame(\n", + " MinMaxScaler().fit_transform(target_df),\n", + " columns=target_df.columns,\n", + " index=target_df.index\n", + ")\n", + "\n", + "display(target_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31fecc00", + "metadata": {}, + "outputs": [], + "source": [ + "epsilons = np.linspace(5e-3, 1., 1500)\n", + "best_score = -np.inf\n", + "best_eps = None\n", + "best_n = None\n", + "# alpha = 
0.7\n", + "beta = 0.05\n", + "\n", + "for eps in epsilons:\n", + " for n in range(2, 30):\n", + " labels = DBSCAN(eps=eps, min_samples=n).fit(target_df).labels_\n", + " \n", + " n_unique = np.unique(labels)\n", + " n_outliers = len(labels[labels == -1])\n", + " \n", + " if n_outliers == len(labels) or len(n_unique) < 2:\n", + " continue\n", + " \n", + " # Encourage more clustering and discourage more outliers.\n", + " score = silhouette_score(target_df, labels) + (len(labels) - n_outliers)/n_outliers\n", + " \n", + " if score > best_score:\n", + " best_score = score\n", + " best_eps = eps\n", + " best_n = n\n", + "\n", + "print(f\"{best_score=}, {best_n=}, {best_eps=}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e39b41ba", + "metadata": {}, + "outputs": [], + "source": [ + "# 0.35 is a good value\n", + "\n", + "'''\n", + "allCEO = DBSCAN(eps=0.52, min_samples=2)\n", + "durham: DBSCAN(eps=best_eps, min_samples=2)\n", + "masscec: min_samples=2, eps=0.986724482988659\n", + "'''\n", + "\n", + "cl2 = DBSCAN(eps=best_eps, min_samples=2).fit(target_df)\n", + "# cl2 = KMeans(n_clusters=5).fit(target_df)\n", + "\n", + "Counter(cl2.labels_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dbf8763", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.decomposition import PCA\n", + "\n", + "tsfm = PCA(n_components=2).fit_transform(target_df)\n", + "\n", + "fig, ax = plt.subplots()\n", + "sns.scatterplot(x=tsfm[:,0], y=tsfm[:,1], c=cl2.labels_)\n", + "ax.set(xlabel='Latent Dim 0', ylabel='Latent Dim 1')\n", + "plt.savefig(f'./outputs/{CURRENT_DB}__Fig2__PCA_w_colors.png', dpi=300)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e444316", + "metadata": {}, + "outputs": [], + "source": [ + "print(df.columns.tolist())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0bc09b9", + "metadata": {}, + "outputs": [], + "source": [ + "# Per-cluster 
users.\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "from sklearn.ensemble import IsolationForest\n", + "from sklearn.svm import OneClassSVM\n", + "from sklearn.neighbors import LocalOutlierFactor\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "\n", + "\n", + "demographic_cols = {\n", + " 'Stage_database': [\n", + " 'has_drivers_license', 'is_student', 'is_paid', \n", + " 'income_category', 'n_residence_members', 'n_residents_u18', 'n_residents_with_license', \n", + " 'n_motor_vehicles', 'has_medical_condition', 'ft_job', 'multiple_jobs', \n", + " 'n_working_residents', \"highest_education_Bachelor's degree\", \n", + " 'highest_education_Graduate degree or professional degree', \n", + " 'highest_education_High school graduate or GED', 'highest_education_Less than a high school graduate', \n", + " 'highest_education_Prefer not to say', 'highest_education_Some college or associates degree', \n", + " 'primary_job_description_Clerical or administrative support', 'primary_job_description_Custodial', \n", + " 'primary_job_description_Education', 'primary_job_description_Food service', \n", + " 'primary_job_description_Linecook', \n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming', \n", + " 'primary_job_description_Medical/healthcare', 'primary_job_description_Non-profit program manager', \n", + " 'primary_job_description_Other', 'primary_job_description_Professional, managerial, or technical', \n", + " 'primary_job_description_Sales or service', 'primary_job_description_Self employed', \n", + " 'primary_job_description_food service', 'gender_Man', 'gender_Nonbinary/genderqueer/genderfluid', \n", + " 'gender_Prefer not to say', 'gender_Woman', 'gender_Woman;Nonbinary/genderqueer/genderfluid', \n", + " 'av_transit', 'av_no_trip', 'av_p_micro', 'av_s_micro', 'av_ridehail', 'av_unknown', 'av_walk', 'av_car', \n", + " 'av_s_car'\n", + " ] + [c for c in 
df.columns if 'age' in c],\n", + " 'durham': [\n", + " 'is_student', 'is_paid', 'has_drivers_license', \n", + " 'n_residents_u18', 'n_residence_members', 'income_category',\n", + " 'n_residents_with_license', 'n_working_residents', 'n_motor_vehicles', 'has_medical_condition', \n", + " 'ft_job', 'multiple_jobs', 'highest_education_bachelor_s_degree', \n", + " 'highest_education_graduate_degree_or_professional_degree', \n", + " 'highest_education_high_school_graduate_or_ged', 'highest_education_less_than_a_high_school_graduate', \n", + " 'highest_education_some_college_or_associates_degree', \n", + " 'primary_job_description_Clerical or administrative support', \n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming', \n", + " 'primary_job_description_Other', 'primary_job_description_Professional, Manegerial, or Technical', \n", + " 'primary_job_description_Sales or service', 'gender_man', \n", + " 'gender_non_binary_genderqueer_gender_non_confor', 'gender_woman', \n", + " 'av_walk', 'av_unknown', 'av_no_trip', 'av_p_micro', 'av_transit', 'av_car', 'av_ridehail', \n", + " 'av_s_micro', 'av_s_car'\n", + " ] + [c for c in df.columns if 'age' in c],\n", + " 'masscec': [\n", + " 'is_student', 'is_paid', 'has_drivers_license', 'n_residents_u18', 'n_residence_members', \n", + " 'income_category', 'n_residents_with_license', 'n_working_residents', 'n_motor_vehicles', \n", + " 'has_medical_condition', 'ft_job', 'multiple_jobs', 'highest_education_bachelor_s_degree', \n", + " 'highest_education_graduate_degree_or_professional_degree', \n", + " 'highest_education_high_school_graduate_or_ged', 'highest_education_less_than_a_high_school_graduate', \n", + " 'highest_education_prefer_not_to_say', 'highest_education_some_college_or_associates_degree', \n", + " 'primary_job_description_Clerical or administrative support', \n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming', \n", + " 'primary_job_description_Other', 
'primary_job_description_Prefer not to say', \n", + " 'primary_job_description_Professional, Manegerial, or Technical', \n", + " 'primary_job_description_Sales or service', 'gender_man', 'gender_prefer_not_to_say', 'gender_woman', \n", + " 'av_p_micro', 'av_s_car', 'av_s_micro', 'av_transit', 'av_car', 'av_no_trip', 'av_unknown', \n", + " 'av_ridehail', 'av_walk'\n", + " ] + [c for c in df.columns if 'age' in c],\n", + "}\n", + "\n", + "\n", + "cluster_labels = cl2.labels_\n", + "demographics = df.groupby('user_id').first()[demographic_cols[CURRENT_DB]]\n", + "demographics = demographics.loc[target_df.index, :]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a3c6355", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "### DEMOGRAPHICS\n", + "\n", + "def entropy(x):\n", + " # Compute bincount, normalize over the entire size. Gives us probabilities.\n", + " p = np.unique(x, return_counts=True)[1]/len(x)\n", + " # Compute the entropy using the probabilities.\n", + " return -np.sum(p * np.log2(p))\n", + "\n", + "def preprocess_demo_data(df: pd.DataFrame):\n", + " return df\n", + "\n", + "\n", + "within_cluster_homogeneity = dict()\n", + "other_cluster_homogeneity = dict()\n", + "labels = cl2.labels_\n", + "\n", + "for cix in np.unique(labels):\n", + " within_cluster_homogeneity[cix] = dict()\n", + " users = target_df[labels == cix].index\n", + " data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", + " processed = preprocess_demo_data(data)\n", + " \n", + " for col in processed.columns:\n", + " # Numeric/ordinal values. Use std. 
to measure homogeneity.\n", + " if col in [\n", + " 'n_residence_members', 'n_residents_u18', 'n_working_residents', 'n_motor_vehicles',\n", + " 'n_residents_with_license', 'income_category'\n", + " ]:\n", + " within_cluster_homogeneity[cix][col] = processed[col].std()\n", + " else:\n", + " within_cluster_homogeneity[cix][col] = entropy(processed[col])\n", + "\n", + "# Compute average homogeneity across other clusters.\n", + "for cix in within_cluster_homogeneity.keys():\n", + " other_cluster_homogeneity[cix] = dict()\n", + " other_clusters = set(within_cluster_homogeneity.keys()) - set([cix])\n", + " for feature in within_cluster_homogeneity[cix].keys():\n", + " homogeneity_in_others = [within_cluster_homogeneity[x][feature] for x in other_clusters]\n", + " other_cluster_homogeneity[cix][feature] = np.mean(homogeneity_in_others)\n", + "\n", + " \n", + "# Compute contrastive homogeneity\n", + "# CH = homogeneity within cluster / average homogeneity across other clusters\n", + "for cix in within_cluster_homogeneity.keys():\n", + " ch_scores = list()\n", + " print(f\"For cluster {cix}:\")\n", + " for feature in within_cluster_homogeneity[cix].keys():\n", + " feature_ch = within_cluster_homogeneity[cix][feature]/(other_cluster_homogeneity[cix][feature] + 1e-6)\n", + " ch_scores.append((feature, feature_ch))\n", + " \n", + " ch_df = pd.DataFrame(ch_scores, columns=['feature', 'ch']).sort_values(by=['ch']).head(4)\n", + " \n", + " # Display actual values.\n", + " users = target_df[labels == cix].index\n", + " data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", + " processed = preprocess_demo_data(data)\n", + " \n", + " display(ch_df)\n", + " print()\n", + " filtered = processed.loc[:, processed.columns.isin(ch_df.feature)][ch_df.feature]\n", + " filtered_features = ch_df.feature.tolist()\n", + " \n", + " fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))\n", + " for i, a in enumerate(ax.flatten()):\n", + " 
sns.histplot(filtered[filtered_features[i]], ax=a, stat=\"percent\")\n", + " plt.tight_layout()\n", + " plt.savefig(f\"{CURRENT_DB}_{cix}_Demographic_consistency.png\", dpi=300)\n", + " plt.show()\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "580bbd86", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import iqr\n", + "\n", + "def get_trip_summary_df(users, df):\n", + " '''\n", + " 1. df = a huge dataframe of user-trips. Each row is a trip.\n", + " 2. every trip is divided into sections: [walk, transit, walk]\n", + " 3. Each section has a corresponding distance and duration: [m1, m2, m3], [t1, t2, t3], [d1, d2, d3]\n", + " 4. What we are doing is only considering the mode, distance, and duration of the section with the largest distance\n", + " '''\n", + " \n", + " costs = [c for c in df.columns if 'av_' in c]\n", + " \n", + " mode_coverage = df.groupby(['user_id', 'section_mode_argmax'])[\n", + " ['section_duration_argmax', 'section_distance_argmax', 'mph'] + costs\n", + " ].agg(['mean', 'median']).unstack()\n", + " \n", + " global_stats = df.groupby('user_id')[['duration', 'distance']].agg(\n", + " ['mean', 'median']\n", + " )\n", + "\n", + " mode_coverage.columns = mode_coverage.columns.map('_'.join)\n", + " global_stats.columns = global_stats.columns.map('_'.join)\n", + " \n", + " # return mode_coverage\n", + " return mode_coverage.merge(right=global_stats, left_index=True, right_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92ad2485", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "## TRIP SUMMARIES\n", + "\n", + "# Per-cluster users.\n", + "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n", + "from sklearn.ensemble import IsolationForest\n", + "from sklearn.svm import OneClassSVM\n", + "from sklearn.neighbors import LocalOutlierFactor\n", + "from sklearn.feature_selection import SelectKBest, mutual_info_classif\n", + 
"\n", + "labels = cl2.labels_\n", + "\n", + "def get_data(cix):\n", + " users = target_df.iloc[labels == cix, :].index\n", + " \n", + " # Compute trip summaries.\n", + " X = df.loc[df.user_id.isin(users), [\n", + " 'section_distance_argmax', 'duration', 'distance', 'section_mode_argmax',\n", + " 'section_duration_argmax', 'mph', 'target', 'user_id'\n", + " ] + [c for c in df.columns if 'cost_' in c]].reset_index(drop=True)\n", + " \n", + " # Compute the target distribution and select the argmax.\n", + " target_distribution = X.target.value_counts(ascending=False, normalize=True)\n", + " target_distribution.rename(index=MAP, inplace=True)\n", + " \n", + " # Caution - this summary df has NaNs. Use nanstd() to compute nan-aware std.\n", + " subset = get_trip_summary_df(users, X)\n", + " \n", + " norm_subset = pd.DataFrame(\n", + " MinMaxScaler().fit_transform(subset),\n", + " columns=subset.columns, index=subset.index\n", + " )\n", + " \n", + " return norm_subset, target_distribution\n", + "\n", + "\n", + "in_cluster_homogeneity = dict()\n", + "out_cluster_homogeneity = dict()\n", + "\n", + "for cluster_ix in np.unique(labels):\n", + " in_cluster_homogeneity[cluster_ix] = dict()\n", + " norm_subset, _ = get_data(cluster_ix)\n", + " for feature in norm_subset.columns:\n", + " in_cluster_homogeneity[cluster_ix][feature] = np.nanstd(norm_subset[feature])\n", + "\n", + "for cix in in_cluster_homogeneity.keys():\n", + " out_cluster_homogeneity[cix] = dict()\n", + " oix = set(labels) - set([cix])\n", + " for feature in norm_subset.columns:\n", + " out_cluster_homogeneity[cix][feature] = np.nanmean([in_cluster_homogeneity[x].get(feature, np.nan) for x in oix])\n", + "\n", + "# Now, compute the per-cluster homogeneity.\n", + "for cix in in_cluster_homogeneity.keys():\n", + " ch = list()\n", + " for feature in in_cluster_homogeneity[cix].keys():\n", + " if feature in in_cluster_homogeneity[cix] and feature in out_cluster_homogeneity[cix]:\n", + " ratio = 
in_cluster_homogeneity[cix][feature] / (out_cluster_homogeneity[cix][feature] + 1e-6)\n", + " ch.append([feature, ratio])\n", + " \n", + " ch_df = pd.DataFrame(ch, columns=['feature', 'ch']).sort_values(by=['ch']).head(4)\n", + " data, target_dist = get_data(cix)\n", + " \n", + " features = ch_df.feature.tolist()\n", + " \n", + " print(f\"For cluster {cix}:\")\n", + " display(target_dist)\n", + " display(ch_df)\n", + " \n", + " fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))\n", + " for i, a in enumerate(ax.flatten()):\n", + " sns.histplot(data[features[i]], ax=a, stat=\"percent\")\n", + " plt.tight_layout()\n", + " plt.savefig(f\"{CURRENT_DB}_{cix}_Trip_consistency.png\", dpi=300)\n", + " plt.show()\n", + " print()\n", + " \n", + " print(50*'=')" + ] + }, + { + "cell_type": "markdown", + "id": "4992ff45", + "metadata": {}, + "source": [ + "## Now check the combined homogeneity score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8723e3d", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "ic, oc = dict(), dict()\n", + "\n", + "labels = cl2.labels_\n", + "TOP_K = 3\n", + "\n", + "\n", + "for cix in np.unique(labels):\n", + " ic[cix] = dict()\n", + " \n", + " # Trip characteristics.\n", + " norm_subset, _ = get_data(cix)\n", + " for feature in norm_subset.columns:\n", + " ic[cix][feature] = np.nanstd(norm_subset[feature])\n", + " \n", + " # Demographics.\n", + " users = target_df[labels == cix].index\n", + " data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", + " processed = preprocess_demo_data(data)\n", + " \n", + " for col in processed.columns:\n", + " # Numeric/ordinal values. Use std. 
to measure homogeneity.\n", + " if col in [\n", + " 'n_residence_members', 'n_residents_u18', 'n_working_residents', 'n_motor_vehicles',\n", + " 'n_residents_with_license', 'income_category'\n", + " ]:\n", + " ic[cix][col] = np.nanstd(processed[col])\n", + " else:\n", + " ic[cix][col] = entropy(processed[col])\n", + "\n", + "for cix in ic.keys():\n", + " oc[cix] = dict()\n", + " oix = set(labels) - set([cix])\n", + " for feature in ic[cix].keys():\n", + " oc[cix][feature] = np.nanmean([ic[x].get(feature, np.nan) for x in oix])\n", + "\n", + "per_cluster_most_homogeneous = dict()\n", + "\n", + "# Now, compute the per-cluster homogeneity.\n", + "ax_ix = 0\n", + "for cix in ic.keys():\n", + "\n", + " print(f\"For cluster {cix}:\")\n", + "\n", + " # For each, cluster, we will have (TOP_K x n_clusters) figures.\n", + " fig, ax = plt.subplots(nrows=TOP_K, ncols=len(ic.keys()), figsize=(12, 8))\n", + "\n", + " other_ix = set(ic.keys()) - set([cix])\n", + " \n", + " ch = list()\n", + " for feature in ic[cix].keys():\n", + " if feature in oc[cix]:\n", + " ratio = ic[cix][feature] / (oc[cix][feature] + 1e-6)\n", + " ch.append([feature, ratio])\n", + " \n", + " # Just the top k.\n", + " ch_df = pd.DataFrame(ch, columns=['feature', 'ch']).sort_values(by=['ch']).reset_index(drop=True).head(TOP_K)\n", + "\n", + " figure_data = dict()\n", + " \n", + " # Get the actual trip summary data.\n", + " trip_summary_data, target_dist = get_data(cix)\n", + " \n", + " # Get the actual demographic data.\n", + " users = target_df[labels == cix].index\n", + " data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", + " processed = preprocess_demo_data(data)\n", + "\n", + " # Left-most subplot will be that of the current cluster's feature.\n", + " for row_ix, row in ch_df.iterrows():\n", + " if row.feature in trip_summary_data.columns:\n", + " sns.histplot(trip_summary_data[row.feature], ax=ax[row_ix][0], stat='percent').set_title(\"Current cluster\")\n", + " 
else:\n", + " sns.histplot(processed[row.feature], ax=ax[row_ix][0], stat='percent').set_title(\"Current cluster\")\n", + " ax[row_ix][0].set_xlabel(ax[row_ix][0].get_xlabel(), fontsize=8)\n", + " ax[row_ix][0].set_ylim(0., 100.)\n", + "\n", + " offset_col_ix = 1\n", + " ## Now, others.\n", + " for oix in other_ix:\n", + " # Get the actual trip summary data.\n", + " other_summary_data, _ = get_data(oix)\n", + " \n", + " # Get the actual demographic data.\n", + " users = target_df[labels == oix].index\n", + " data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", + " other_demo = preprocess_demo_data(data)\n", + "\n", + " for row_ix, row in ch_df.iterrows():\n", + " if row.feature in other_summary_data.columns:\n", + " sns.histplot(other_summary_data[row.feature], ax=ax[row_ix][offset_col_ix], stat='percent').set_title(f\"Cluster {oix}\")\n", + " else:\n", + " sns.histplot(other_demo[row.feature], ax=ax[row_ix][offset_col_ix], stat='percent').set_title(f\"Cluster {oix}\")\n", + " ax[row_ix][offset_col_ix].set_xlabel(ax[row_ix][offset_col_ix].get_xlabel(), fontsize=8)\n", + " ax[row_ix][offset_col_ix].set_ylim(0., 100.)\n", + " \n", + " offset_col_ix += 1\n", + " \n", + " plt.tight_layout()\n", + " plt.savefig(f\"./outputs/{CURRENT_DB}_cluster{cix}_combined_features.png\", dpi=300)\n", + " plt.show()\n", + " print(50 * '=')" + ] + }, + { + "cell_type": "markdown", + "id": "24a80f68", + "metadata": {}, + "source": [ + "## Try a different clustering technique? 
(Unexplored)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0288db8", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import AffinityPropagation\n", + "\n", + "best_score = -np.inf\n", + "best_params = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b14ad0c", + "metadata": {}, + "outputs": [], + "source": [ + "cls = AffinityPropagation(random_state=13210).fit(target_df)\n", + "labels = cls.labels_\n", + "\n", + "print(labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2562bbb6-66eb-4283-8c08-6e20a0b2ade5", + "metadata": {}, + "outputs": [], + "source": [ + "center_embeddings = cls.cluster_centers_\n", + "centers_proj = PCA(n_components=2).fit_transform(center_embeddings)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7aad38a", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots()\n", + "sns.scatterplot(x=tsfm[:,0], y=tsfm[:,1], c=cls.labels_, ax=ax)\n", + "ax.scatter(x=centers_proj[:,0], y=centers_proj[:,1], marker='X', c='red', alpha=0.5)\n", + "ax.set(xlabel='Latent Dim 0', ylabel='Latent Dim 1')\n", + "# plt.legend([str(x) for x in ap_labels], loc='best')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39ce0238-b3f2-4f46-a52f-13e3160cc52f", + "metadata": {}, + "outputs": [], + "source": [ + "def get_data2(cix, labels):\n", + " users = target_df.iloc[labels == cix, :].index\n", + " \n", + " # Compute trip summaries.\n", + " X = df.loc[df.user_id.isin(users), [\n", + " 'section_distance_argmax', 'section_duration_argmax',\n", + " 'section_mode_argmax', 'distance',\n", + " 'duration', 'mph', 'user_id', 'target'\n", + " ]]\n", + " \n", + " # Compute the target distribution and select the argmax.\n", + " target_distribution = X.target.value_counts(ascending=False, normalize=True)\n", + " target_distribution.rename(index=MAP, inplace=True)\n", + " \n", + " # Caution 
- this summary df has NaNs. Use nanstd() to compute nan-aware std.\n", + " subset = get_trip_summary_df(users, X)\n", + " \n", + " norm_subset = pd.DataFrame(\n", + " MinMaxScaler().fit_transform(subset),\n", + " columns=subset.columns, index=subset.index\n", + " )\n", + " \n", + " return norm_subset, target_distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec27cf29", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "## Analaysis for this data.\n", + "\n", + "ic, oc = dict(), dict()\n", + "labels = cls.labels_\n", + "\n", + "for cix in np.unique(labels):\n", + " users = target_df[labels == cix].index\n", + " \n", + " ic[cix] = dict()\n", + " \n", + " # Trip characteristics.\n", + " norm_subset, _ = get_data2(cix, labels)\n", + " for feature in norm_subset.columns:\n", + " ic[cix][feature] = np.nanstd(norm_subset[feature])\n", + " \n", + " # Demographics.\n", + " data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", + " processed = preprocess_demo_data(data)\n", + " \n", + " for col in processed.columns:\n", + " # Numeric/ordinal values. Use std. 
to measure homogeneity.\n", + " if col == 'age' or col == 'income_category' or col == 'n_working_residents':\n", + " ic[cix][col] = np.nanstd(processed[col])\n", + " else:\n", + " ic[cix][col] = entropy(processed[col])\n", + "\n", + "for cix in ic.keys():\n", + " oc[cix] = dict()\n", + " oix = set(labels) - set([cix])\n", + " for feature in ic[cix].keys():\n", + " oc[cix][feature] = np.nanmean([ic[x].get(feature, np.nan) for x in oix])\n", + "\n", + "# # Now, compute the per-cluster homogeneity.\n", + "# for cix in ic.keys():\n", + " \n", + "# users = users = target_df[labels == cix].index\n", + "# norm_subset, target_dist = get_data(cix, labels)\n", + "# data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", + "# processed = preprocess_demo_data(data)\n", + " \n", + "# concat = processed.merge(norm_subset, left_index=True, right_index=True)\n", + " \n", + "# ch = list()\n", + "# for feature in ic[cix].keys():\n", + "# ratio = ic[cix][feature] / (oc[cix][feature] + 1e-6)\n", + "# ch.append([feature, ratio])\n", + " \n", + "# ch_df = pd.DataFrame(ch, columns=['feature', 'ch']).sort_values(by=['ch']).head(TOP_K).reset_index(drop=True)\n", + "\n", + "\n", + "# Now, compute the per-cluster homogeneity.\n", + "ax_ix = 0\n", + "for cix in ic.keys():\n", + "\n", + " print(f\"For cluster {cix}:\")\n", + "\n", + " # For each, cluster, we will have (TOP_K x n_clusters) figures.\n", + " fig, ax = plt.subplots(nrows=5, ncols=len(ic.keys()), figsize=(12, 8))\n", + "\n", + " other_ix = set(ic.keys()) - set([cix])\n", + " \n", + " ch = list()\n", + " for feature in ic[cix].keys():\n", + " ratio = ic[cix][feature] / (oc[cix][feature] + 1e-6)\n", + " ch.append([feature, ratio])\n", + " \n", + " # Just the top k.\n", + " ch_df = pd.DataFrame(ch, columns=['feature', 'ch']).sort_values(by=['ch']).reset_index(drop=True).head(5)\n", + " figure_data = dict()\n", + " \n", + " # Get the actual trip summary data.\n", + " trip_summary_data, target_dist = 
get_data(cix)\n", + "\n", + " display(target_dist)\n", + " \n", + " # Get the actual demographic data.\n", + " users = target_df[labels == cix].index\n", + " data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", + " processed = preprocess_demo_data(data)\n", + "\n", + " # Left-most subplot will be that of the current cluster's feature.\n", + " for row_ix, row in ch_df.iterrows():\n", + " if row.feature in trip_summary_data.columns:\n", + " sns.histplot(trip_summary_data[row.feature], ax=ax[row_ix][0], stat='percent').set_title(\"Current cluster\")\n", + " else:\n", + " sns.histplot(processed[row.feature], ax=ax[row_ix][0], stat='percent').set_title(\"Current cluster\")\n", + " ax[row_ix][0].set_xlabel(ax[row_ix][0].get_xlabel(), fontsize=6)\n", + " ax[row_ix][0].set_ylim(0., 100.)\n", + "\n", + " offset_col_ix = 1\n", + " ## Now, others.\n", + " for oix in other_ix:\n", + " # Get the actual trip summary data.\n", + " other_summary_data, _ = get_data(oix)\n", + " \n", + " # Get the actual demographic data.\n", + " users = target_df[labels == oix].index\n", + " data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", + " other_demo = preprocess_demo_data(data)\n", + "\n", + " for row_ix, row in ch_df.iterrows():\n", + " if row.feature in other_summary_data.columns:\n", + " sns.histplot(other_summary_data[row.feature], ax=ax[row_ix][offset_col_ix], stat='percent').set_title(f\"Cluster {oix}\")\n", + " else:\n", + " sns.histplot(other_demo[row.feature], ax=ax[row_ix][offset_col_ix], stat='percent').set_title(f\"Cluster {oix}\")\n", + " ax[row_ix][offset_col_ix].set_xlabel(ax[row_ix][offset_col_ix].get_xlabel(), fontsize=6)\n", + " ax[row_ix][offset_col_ix].set_ylim(0., 100.)\n", + " \n", + " offset_col_ix += 1\n", + "\n", + " plt.tight_layout()\n", + " plt.show()\n", + " print(50 * '=')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0b642db", + "metadata": {}, + "outputs": [], + 
"source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "emission", + "language": "python", + "name": "emission" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/replacement_mode_modeling/README.md b/replacement_mode_modeling/README.md new file mode 100644 index 0000000..999722d --- /dev/null +++ b/replacement_mode_modeling/README.md @@ -0,0 +1,31 @@ +# Efforts towards predicting the replaced mode without user labels + +## Prerequisites: +- These experiments were conducted on top of the `emission` anaconda environment. Please ensure that this environment is available to you before re-running the code. +- In addition, the script uses `seaborn` for plotting and `pandarallel` for parallel pandas processing. +- Ensure you have the following data sources loaded in your MongoDB Docker container: + - Stage_database (All CEO) + - Durham + - Masscec + - Ride2own + - UPRM NICR +- Once these data sources are procured and loaded in your Mongo container, you will need to add the inferred sections to the data. To do this, please run the [add_sections_and_summaries_to_trips.py](https://github.com/e-mission/e-mission-server/blob/master/bin/historical/migrations/add_sections_and_summaries_to_trips.py) script. **NOTE**: If you see a lot of errors in the log, try to re-run the script by modifying the following line from: + +```language=python +# Before +eps.dispatch(split_lists, skip_if_no_new_data=False, target_fn=add_sections_to_trips) + +# After +eps.dispatch(split_lists, skip_if_no_new_data=False, target_fn=None) +``` + +This will trigger the intake pipeline for the current db and add the inferred section. + +- Note 2: The script above did not work for the All CEO data for me. 
Therefore, I obtained the section durations using the `get_section_durations` method I've written in `scaffolding.py` (you do not have to call this method, it is already handled in the notebooks). Please note that running this script takes a long time and it is advised to cache the generated output. + +## Running the experiments +The order in which the experiments are to be run are denoted by the preceding number. The following is a brief summary about each notebook: +1. `01_extract_db_data.ipynb`: This notebook extracts the data, performs the necessary preprocessing, updates availability indicators, computes cost estimates, and stores the preprocessed data in `data/filtered_data`. +2. `02_run_trip_level_models.py`: This script reads all the preprocessed data, fits trip-level models with different stratifications, generates the outputs, and stores them in `outputs/benchmark_results/`. +3. `03_user_level_models.ipynb`: This notebook explores user fingerprints, similarity searching, and naive user-level models. +4. `04_FeatureClustering.ipynb`: This notebook performs two functions: (a) Cluster users based on demographics/trip feature summaries and check for target distributions across clusters, and (b) Cluster users by grouping w.r.t. 
the target and checking for feature homogeneity within clusters diff --git a/replacement_mode_modeling/data/README.md b/replacement_mode_modeling/data/README.md new file mode 100644 index 0000000..6d2c55c --- /dev/null +++ b/replacement_mode_modeling/data/README.md @@ -0,0 +1 @@ +Temporary folder \ No newline at end of file diff --git a/replacement_mode_modeling/outputs/README.md b/replacement_mode_modeling/outputs/README.md new file mode 100644 index 0000000..6d2c55c --- /dev/null +++ b/replacement_mode_modeling/outputs/README.md @@ -0,0 +1 @@ +Temporary folder \ No newline at end of file From a1a4ef7b30cb6e0459830686e5271199e6ff4c60 Mon Sep 17 00:00:00 2001 From: Rahul Kulhalli Date: Mon, 29 Apr 2024 14:46:04 -0400 Subject: [PATCH 2/6] Updated README with specific package versions --- replacement_mode_modeling/README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/replacement_mode_modeling/README.md b/replacement_mode_modeling/README.md index 999722d..d6f3ee7 100644 --- a/replacement_mode_modeling/README.md +++ b/replacement_mode_modeling/README.md @@ -1,8 +1,16 @@ + # Efforts towards predicting the replaced mode without user labels ## Prerequisites: - These experiments were conducted on top of the `emission` anaconda environment. Please ensure that this environment is available to you before re-running the code. -- In addition, the script uses `seaborn` for plotting and `pandarallel` for parallel pandas processing. +- In addition, some notebooks use `seaborn` for plotting and `pandarallel` for parallel pandas processing. 
The packages can be installed in the following manner: + +``` +(After activating emission conda env) +pip3 install pandarallel==1.6.5 +pip3 install seaborn==0.12.2 +``` + - Ensure you have the following data sources loaded in your MongoDB Docker container: - Stage_database (All CEO) - Durham From 31eb26136b919a95c67a14a506645800d1e8ef3a Mon Sep 17 00:00:00 2001 From: Rahul Kulhalli Date: Mon, 29 Apr 2024 14:59:56 -0400 Subject: [PATCH 3/6] Removed scaffolding dependency; updated README --- .../01_extract_db_data.ipynb | 122 ++++++++++++------ replacement_mode_modeling/README.md | 5 +- 2 files changed, 84 insertions(+), 43 deletions(-) diff --git a/replacement_mode_modeling/01_extract_db_data.ipynb b/replacement_mode_modeling/01_extract_db_data.ipynb index 216b88b..1706837 100644 --- a/replacement_mode_modeling/01_extract_db_data.ipynb +++ b/replacement_mode_modeling/01_extract_db_data.ipynb @@ -206,10 +206,6 @@ " }\n", "}\n", "\n", - "SENSED_SECTION_DICT = {\n", - " \"openpath_prod_mm_masscec\": {'AIR_OR_HSR', 'BICYCLING', 'BUS', 'CAR', 'LIGHT_RAIL', 'SUBWAY', 'TRAIN', 'UNKNOWN', 'WALKING'}\n", - "}\n", - "\n", "SURVEY_DATA_DICT = {\n", " \"Stage_database\": {\n", " \"Unique User ID (auto-filled, do not edit)\": \"user_id\",\n", @@ -477,28 +473,26 @@ "metadata": {}, "outputs": [], "source": [ - "if CURRENT_DB != \"Stage_database\":\n", - "\n", - " ## Source: scaffolding.py\n", + "## Source: scaffolding.py\n", "\n", - " uuid_df = pd.json_normalize(list(edb.get_uuid_db().find()))\n", + "uuid_df = pd.json_normalize(list(edb.get_uuid_db().find()))\n", "\n", - " if not INCLUDE_TEST_USERS:\n", - " uuid_df = uuid_df.loc[~uuid_df.user_email.str.contains('_test_'), :]\n", + "if not INCLUDE_TEST_USERS:\n", + " uuid_df = uuid_df.loc[~uuid_df.user_email.str.contains('_test_'), :]\n", "\n", - " filtered = uuid_df.uuid.unique()\n", + "filtered = uuid_df.uuid.unique()\n", "\n", - " agg = esta.TimeSeries.get_aggregate_time_series()\n", - " all_ct = 
agg.get_data_df(\"analysis/confirmed_trip\", None)\n", + "agg = esta.TimeSeries.get_aggregate_time_series()\n", + "all_ct = agg.get_data_df(\"analysis/confirmed_trip\", None)\n", "\n", - " print(f\"Before filtering, length={len(all_ct)}\")\n", - " participant_ct_df = all_ct.loc[all_ct.user_id.isin(filtered), :]\n", - " print(f\"After filtering, length={len(participant_ct_df)}\")\n", + "print(f\"Before filtering, length={len(all_ct)}\")\n", + "participant_ct_df = all_ct.loc[all_ct.user_id.isin(filtered), :]\n", + "print(f\"After filtering, length={len(participant_ct_df)}\")\n", "\n", - " expanded_ct = expand_userinputs(participant_ct_df)\n", - " expanded_ct = data_quality_check(expanded_ct)\n", - " print(expanded_ct.columns.tolist())\n", - " expanded_ct['replaced_mode'] = expanded_ct['replaced_mode'].fillna('Unknown')" + "expanded_ct = expand_userinputs(participant_ct_df)\n", + "expanded_ct = data_quality_check(expanded_ct)\n", + "print(expanded_ct.columns.tolist())\n", + "expanded_ct['replaced_mode'] = expanded_ct['replaced_mode'].fillna('Unknown')" ] }, { @@ -510,27 +504,25 @@ "source": [ "# # Additional preprocessing for replaced mode (if any)\n", "\n", - "if CURRENT_DB != \"Stage_database\":\n", + "mode_counts = expanded_ct['replaced_mode'].value_counts()\n", + "drop_modes = mode_counts[mode_counts == 1].index.tolist()\n", "\n", - " mode_counts = expanded_ct['replaced_mode'].value_counts()\n", - " drop_modes = mode_counts[mode_counts == 1].index.tolist()\n", + "expanded_ct.drop(\n", + " index=expanded_ct.loc[expanded_ct.replaced_mode.isin(drop_modes)].index,\n", + " inplace=True\n", + ")\n", "\n", - " expanded_ct.drop(\n", - " index=expanded_ct.loc[expanded_ct.replaced_mode.isin(drop_modes)].index,\n", - " inplace=True\n", - " )\n", + "# Additional modes to drop.\n", + "expanded_ct.drop(\n", + " index=expanded_ct.loc[expanded_ct.replaced_mode.isin(\n", + " # Remove all rows with air, boat, or weird answers.\n", + " ['houseboat', 'gondola', 'airline_flight', 
'aircraft', 'zoo', 'air',\n", + " 'airplane', 'boat', 'flight', 'plane', 'meal', 'lunch']\n", + " )].index,\n", + " inplace=True\n", + ")\n", "\n", - " # Additional modes to drop.\n", - " expanded_ct.drop(\n", - " index=expanded_ct.loc[expanded_ct.replaced_mode.isin(\n", - " # Remove all rows with air, boat, or weird answers.\n", - " ['houseboat', 'gondola', 'airline_flight', 'aircraft', 'zoo', 'air',\n", - " 'airplane', 'boat', 'flight', 'plane', 'meal', 'lunch']\n", - " )].index,\n", - " inplace=True\n", - " )\n", - " \n", - " expanded_ct.replaced_mode = expanded_ct.replaced_mode.apply(lambda x: REPLACED_MODE_DICT[CURRENT_DB][x])" + "expanded_ct.replaced_mode = expanded_ct.replaced_mode.apply(lambda x: REPLACED_MODE_DICT[CURRENT_DB][x])" ] }, { @@ -590,10 +582,58 @@ " survey_data = pd.concat([survey_data, v], axis=0, ignore_index=True)\n", "else:\n", " # Read the demographics.\n", - " survey_data = pd.read_csv('./viz_scripts/Can Do Colorado eBike Program - en.csv')\n", + " # Ensure that you have access to this survey file and that it is placed in the given destination.\n", + " survey_data = pd.read_csv('../viz_scripts/Can Do Colorado eBike Program - en.csv')\n", " survey_data.rename(columns={'Unique User ID (auto-filled, do not edit)': 'user_id'}, inplace=True)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "1aaedf66", + "metadata": {}, + "outputs": [], + "source": [ + "def get_section_durations(confirmed_trips: pd.DataFrame):\n", + " \n", + " import pandarallel\n", + "\n", + " # Initialize the parallel processing.\n", + " pandarallel.initialize(progress_bar=False)\n", + "\n", + " \"\"\"\n", + " Extract section-wise durations from trips for every trips.\n", + " \"\"\"\n", + "\n", + " # the inner function has access to these variables.\n", + " primary_key = 'analysis/inferred_section'\n", + " fallback_key = 'analysis/cleaned_section'\n", + "\n", + " def get_durations(user_id, trip_id):\n", + "\n", + " inferred_sections = 
esdt.get_sections_for_trip(key = primary_key,\n", + " user_id = user_id, trip_id = trip_id)\n", + "\n", + " if inferred_sections and len(inferred_sections) > 0:\n", + " return [x.data.duration for x in inferred_sections]\n", + " \n", + " print(\"Falling back to confirmed trips...\")\n", + "\n", + " cleaned_sections = esdt.get_sections_for_trip(key = fallback_key,\n", + " user_id = user_id, trip_id = trip_id)\n", + " \n", + " if cleaned_sections and len(cleaned_sections) > 0:\n", + " return [x.data.duration for x in cleaned_sections]\n", + "\n", + " return []\n", + "\n", + " confirmed_trips['section_durations'] = confirmed_trips.parallel_apply(\n", + " lambda x: get_durations(x.user_id, x.cleaned_trip), axis=1\n", + " )\n", + "\n", + " return confirmed_trips" + ] + }, { "cell_type": "code", "execution_count": null, @@ -611,9 +651,7 @@ " else:\n", " ## NOTE: Run this cell only if the cached CSV is not already available. It will take a LOT of time.\n", " ## Benchmark timing: ~12 hours on a MacBook Pro (2017 model) with pandarallel, 4 workers.\n", - " \n", - " importlib.reload(scaffolding)\n", - " expanded_ct = scaffolding.get_section_durations(expanded_ct)\n", + " expanded_ct = get_section_durations(expanded_ct)\n", " expanded_ct.to_csv('./data/cached_allceo_data.csv', index=False)" ] }, diff --git a/replacement_mode_modeling/README.md b/replacement_mode_modeling/README.md index d6f3ee7..628fd43 100644 --- a/replacement_mode_modeling/README.md +++ b/replacement_mode_modeling/README.md @@ -17,6 +17,9 @@ pip3 install seaborn==0.12.2 - Masscec - Ride2own - UPRM NICR + +- Additionally, please also procure the CanBikeCO survey CSV file and place it in the `viz_scripts/` directory. + - Once these data sources are procured and loaded in your Mongo container, you will need to add the inferred sections to the data. 
To do this, please run the [add_sections_and_summaries_to_trips.py](https://github.com/e-mission/e-mission-server/blob/master/bin/historical/migrations/add_sections_and_summaries_to_trips.py) script. **NOTE**: If you see a lot of errors in the log, try to re-run the script by modifying the following line from: ```language=python @@ -29,7 +32,7 @@ eps.dispatch(split_lists, skip_if_no_new_data=False, target_fn=None) This will trigger the intake pipeline for the current db and add the inferred section. -- Note 2: The script above did not work for the All CEO data for me. Therefore, I obtained the section durations using the `get_section_durations` method I've written in `scaffolding.py` (you do not have to call this method, it is already handled in the notebooks). Please note that running this script takes a long time and it is advised to cache the generated output. +- Note 2: The script above did not work for the All CEO data for me. Therefore, I obtained the section durations using the `get_section_durations` method I've written in the first notebook. Please note that running this script takes a long time and it is advised to cache the generated output. ## Running the experiments The order in which the experiments are to be run are denoted by the preceding number. 
The following is a brief summary about each notebook: From 1644db0343a0f56fddb13fed9b52331e98e7977b Mon Sep 17 00:00:00 2001 From: Rahul Kulhalli Date: Tue, 30 Apr 2024 18:33:23 -0400 Subject: [PATCH 4/6] Updated bugs in notebooks --- .../01_extract_db_data.ipynb | 68 +- .../03_user_level_models.ipynb | 497 ++++-- .../04_FeatureClustering.ipynb | 1505 ++++++++++++++--- 3 files changed, 1597 insertions(+), 473 deletions(-) diff --git a/replacement_mode_modeling/01_extract_db_data.ipynb b/replacement_mode_modeling/01_extract_db_data.ipynb index 1706837..bef2545 100644 --- a/replacement_mode_modeling/01_extract_db_data.ipynb +++ b/replacement_mode_modeling/01_extract_db_data.ipynb @@ -43,11 +43,12 @@ "metadata": {}, "outputs": [], "source": [ - "# Add path to your emission server here.\n", - "emission_path = Path(os.getcwd()).parent.parent / 'my_emission_server' / 'e-mission-server'\n", - "sys.path.append(str(emission_path))\n", + "# Add path to your emission server here. Uncommented because the notebooks are run in the server.\n", + "# If running locally, you need to point this to the e-mission server repo.\n", + "# emission_path = Path(os.getcwd()).parent.parent.parent / 'my_emission_server' / 'e-mission-server'\n", + "# sys.path.append(str(emission_path))\n", "\n", - "# Also add the home (viz_scripts) to the path\n", + "# # Also add the home (viz_scripts) to the path\n", "sys.path.append('../viz_scripts')" ] }, @@ -58,7 +59,6 @@ "metadata": {}, "outputs": [], "source": [ - "import scaffolding\n", "import emission.core.get_database as edb\n", "import emission.storage.timeseries.abstract_timeseries as esta" ] @@ -75,7 +75,6 @@ " \"openpath_prod_durham\", # Has composite trips\n", " \"openpath_prod_mm_masscec\", # Has composite trips\n", " \"openpath_prod_ride2own\", # Has composite trips\n", - "# \"openpath_prod_uprm_civic\", # No replaced mode (Excluded)\n", " \"openpath_prod_uprm_nicr\" # Has composite trips\n", "]" ] @@ -590,7 +589,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "1aaedf66", + "id": "07922a00", "metadata": {}, "outputs": [], "source": [ @@ -683,16 +682,6 @@ "### Demographic data preprocessing" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "336508c2", - "metadata": {}, - "outputs": [], - "source": [ - "print(survey_data.columns.tolist())" - ] - }, { "cell_type": "code", "execution_count": null, @@ -714,9 +703,11 @@ "survey_data.loc[\n", " survey_data.n_motor_vehicles.isin(\n", " ['prefer_not_to_say', 'Prefer not to say / Prefiero no decir.']\n", - " ), 'n_motor_vehicles'\n", - "] = 0\n", - "survey_data.loc[survey_data.n_motor_vehicles.isin(['more_than_3', '4+', 'more_than_4']), 'n_motor_vehicles'] = 4\n", + " ), 'n_motor_vehicles'] = 0\n", + "\n", + "survey_data.loc[survey_data.n_motor_vehicles.isin(\n", + " ['more_than_3', '4+', 'more_than_4', 'more_than_3']\n", + "), 'n_motor_vehicles'] = 4\n", "survey_data.n_motor_vehicles = survey_data.n_motor_vehicles.astype(int)\n", "\n", "# gtg\n", @@ -724,22 +715,20 @@ " lambda x: 1 if str(x).lower() == 'yes' else 0\n", ")\n", "\n", - "survey_data.loc[survey_data.n_residents_u18 == 'prefer_not_to_say'] = 0\n", + "survey_data.loc[survey_data.n_residents_u18 == 'prefer_not_to_say', 'n_residents_u18'] = 0\n", "survey_data.n_residents_u18 = survey_data.n_residents_u18.astype(int)\n", "\n", - "survey_data.loc[survey_data.n_residence_members == 'prefer_not_to_say'] = 0\n", + "survey_data.loc[survey_data.n_residence_members == 'prefer_not_to_say', 'n_residence_members'] = 0\n", "survey_data.n_residence_members = survey_data.n_residence_members.astype(int)\n", "\n", "survey_data.loc[survey_data.n_residents_with_license == 'prefer_not_to_say'] = 0\n", - "survey_data.loc[survey_data.n_residents_with_license == 'more_than_4'] = 4\n", + "survey_data.loc[survey_data.n_residents_with_license == 'more_than_4', 'n_residents_with_license'] = 4\n", "survey_data.n_residents_with_license = survey_data.n_residents_with_license.astype(int)\n", "\n", - 
"# In allCEO, we see 50 & 9999. What??\n", + "# Handle abnormal inputs.\n", "survey_data = survey_data[\n", - " (survey_data.n_residence_members < 10) & (survey_data.n_residents_u18 < 10) & \n", - " (survey_data.n_residents_with_license < 10) & \n", - " (survey_data.n_residence_members - survey_data.n_residents_with_license > 0) &\n", - " (survey_data.n_residence_members - survey_data.n_residents_u18 > 0)\n", + " (survey_data.n_residence_members - survey_data.n_residents_with_license >= 0) &\n", + " (survey_data.n_residence_members - survey_data.n_residents_u18 >= 0)\n", "].reset_index(drop=True)\n", "\n", "# gtg\n", @@ -837,7 +826,7 @@ " \"professional__managerial__or_technical\": \"Professional, Manegerial, or Technical\",\n", " \"manufacturing__construction__maintenance\": \"Manufacturing, construction, maintenance, or farming\",\n", " \"clerical_or_administrative_support\": \"Clerical or administrative support\",\n", - " \"prefer_not_to_say\": \"Prefer not to say\",\n", + " \"prefer_not_to_say\": \"Prefer not to say\"\n", " }\n", " \n", " df.primary_job_description = df.primary_job_description.apply(\n", @@ -1616,7 +1605,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"Done processing for {CURRENT_DB=}\")" + "print(f\"Done processing for {CURRENT_DB=}, Number of unique users: {len(filtered_trips.user_id.unique())}\")" ] }, { @@ -1633,16 +1622,6 @@ "filtered_trips.replace({'target': {t: ix+1 for ix, t in enumerate(targets)}}, inplace=True)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "50d3eaec", - "metadata": {}, - "outputs": [], - "source": [ - "display(filtered_trips.target.unique())" - ] - }, { "cell_type": "code", "execution_count": null, @@ -1650,6 +1629,7 @@ "metadata": {}, "outputs": [], "source": [ + "# savepath = Path('./data/filtered_data')\n", "savepath = Path('./data/filtered_data')\n", "\n", "if not savepath.exists():\n", @@ -1657,6 +1637,14 @@ "\n", "filtered_trips.to_csv(savepath / 
f'preprocessed_data_{CURRENT_DB}.csv', index=False)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f16fb354", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/replacement_mode_modeling/03_user_level_models.ipynb b/replacement_mode_modeling/03_user_level_models.ipynb index da06468..616cd5e 100644 --- a/replacement_mode_modeling/03_user_level_models.ipynb +++ b/replacement_mode_modeling/03_user_level_models.ipynb @@ -83,29 +83,61 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv('./data/filtered_data/preprocessed_data_Stage_database.csv')\n", - "# df = pd.read_csv('./data/filtered_data/preprocessed_data_openpath_prod_durham.csv')\n", - "# df = pd.read_csv('./data/filtered_data/preprocessed_data_openpath_prod_mm_masscec.csv')\n", - "# df = pd.read_csv('./data/filtered_data/preprocessed_data_openpath_prod_ride2own.csv')\n", - "# df = pd.read_csv('./data/filtered_data/preprocessed_data_openpath_prod_uprm_nicr.csv')" + "DATA_SOURCE = [\n", + " ('./data/filtered_data/preprocessed_data_Stage_database.csv', 'allceo'),\n", + " ('./data/filtered_data/preprocessed_data_openpath_prod_durham.csv', 'durham'),\n", + " ('./data/filtered_data/preprocessed_data_openpath_prod_mm_masscec.csv', 'masscec'),\n", + " ('./data/filtered_data/preprocessed_data_openpath_prod_ride2own.csv', 'ride2own'),\n", + " ('./data/filtered_data/preprocessed_data_openpath_prod_uprm_nicr.csv', 'nicr')\n", + "]" ] }, { "cell_type": "code", "execution_count": null, - "id": "915e9d6f", + "id": "e3d9c5bd", "metadata": {}, "outputs": [], "source": [ - "df.groupby('user_id')['target'].apply(lambda x: x.value_counts().idxmax()).unique()" + "## CHANGE THE DB INDEX HERE.\n", + "DB_NUMBER = 0\n", + "\n", + "PATH = DATA_SOURCE[DB_NUMBER][0]\n", + "CURRENT_DB = DATA_SOURCE[DB_NUMBER][1]" ] }, { "cell_type": "code", "execution_count": null, - "id": "72793473", + "id": "e37f8922", + "metadata": {}, + "outputs": [], + "source": [ + "df = 
pd.read_csv(PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5bfa6843", "metadata": {}, "outputs": [], + "source": [ + "not_needed = ['deprecatedID', 'data.key']\n", + "\n", + "for col in not_needed:\n", + " if col in df.columns:\n", + " df.drop(columns=[col], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72793473", + "metadata": { + "scrolled": true + }, + "outputs": [], "source": [ "print(df.columns.tolist())" ] @@ -276,30 +308,124 @@ " trip_features_to_use = trip_kwargs.pop('trip_features', None)\n", " trip_group_key = trip_kwargs.pop('trip_grouping', 'section_mode_argmax')\n", " \n", - " demographics = [ \n", - " 'has_drivers_license', 'is_student', 'is_paid', 'income_category', 'n_residence_members', \n", - " 'n_residents_u18', 'n_residents_with_license', 'n_motor_vehicles',\n", - " 'has_medical_condition', 'ft_job', 'multiple_jobs', 'n_working_residents', \n", - " \"highest_education_Bachelor's degree\", 'highest_education_Graduate degree or professional degree', \n", - " 'highest_education_High school graduate or GED', 'highest_education_Less than a high school graduate', \n", - " 'highest_education_Prefer not to say', 'highest_education_Some college or associates degree', \n", - " 'primary_job_description_Clerical or administrative support', 'primary_job_description_Custodial', \n", - " 'primary_job_description_Education', 'primary_job_description_Food service', \n", - " 'primary_job_description_Linecook', \n", - " 'primary_job_description_Manufacturing, construction, maintenance, or farming', \n", - " 'primary_job_description_Medical/healthcare', 'primary_job_description_Non-profit program manager', \n", - " 'primary_job_description_Other', 'primary_job_description_Professional, managerial, or technical', \n", - " 'primary_job_description_Sales or service', 'primary_job_description_Self employed', \n", - " 'primary_job_description_food service', 'gender_Man', 
'gender_Nonbinary/genderqueer/genderfluid', \n", - " 'gender_Prefer not to say', 'gender_Woman', 'gender_Woman;Nonbinary/genderqueer/genderfluid', \n", - " 'age_16___20_years_old', 'age_21___25_years_old', 'age_26___30_years_old', 'age_31___35_years_old', \n", - " 'age_36___40_years_old', 'age_41___45_years_old', 'age_46___50_years_old', 'age_51___55_years_old', \n", - " 'age_56___60_years_old', 'age_61___65_years_old', 'age___65_years_old', 'av_transit', 'av_no_trip', \n", - " 'av_p_micro', 'av_s_micro', 'av_ridehail', 'av_unknown', 'av_walk', 'av_car', 'av_s_car', \n", - " ]\n", + " demographics = {\n", + " 'allceo': [\n", + " 'has_drivers_license', 'is_student', 'is_paid', 'income_category',\n", + " 'n_residence_members', 'n_residents_u18', 'n_residents_with_license',\n", + " 'n_motor_vehicles', 'has_medical_condition',\n", + " 'ft_job', 'multiple_jobs', 'n_working_residents',\n", + " \"highest_education_Bachelor's degree\",\n", + " 'highest_education_Graduate degree or professional degree',\n", + " 'highest_education_High school graduate or GED',\n", + " 'highest_education_Less than a high school graduate',\n", + " 'highest_education_Prefer not to say',\n", + " 'highest_education_Some college or associates degree',\n", + " 'primary_job_description_Clerical or administrative support',\n", + " 'primary_job_description_Custodial',\n", + " 'primary_job_description_Education',\n", + " 'primary_job_description_Food service',\n", + " 'primary_job_description_Linecook',\n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming',\n", + " 'primary_job_description_Medical/healthcare',\n", + " 'primary_job_description_Non-profit program manager',\n", + " 'primary_job_description_Other',\n", + " 'primary_job_description_Professional, managerial, or technical',\n", + " 'primary_job_description_Sales or service',\n", + " 'primary_job_description_Self employed',\n", + " 'primary_job_description_food service', 'gender_Man',\n", + " 
'gender_Nonbinary/genderqueer/genderfluid', 'gender_Prefer not to say',\n", + " 'gender_Woman', 'gender_Woman;Nonbinary/genderqueer/genderfluid',\n", + " 'age_16___20_years_old', 'age_21___25_years_old',\n", + " 'age_26___30_years_old', 'age_31___35_years_old',\n", + " 'age_36___40_years_old', 'age_41___45_years_old',\n", + " 'age_46___50_years_old', 'age_51___55_years_old',\n", + " 'age_56___60_years_old', 'age_61___65_years_old', 'age___65_years_old',\n", + " 'av_transit', 'av_no_trip', 'av_p_micro', 'av_s_micro', 'av_ridehail',\n", + " 'av_unknown', 'av_walk', 'av_car', 'av_s_car'\n", + " ],\n", + " 'durham': [\n", + " 'is_student', 'is_paid', 'has_drivers_license', 'n_residents_u18',\n", + " 'n_residence_members', 'income_category',\n", + " 'n_residents_with_license', 'n_working_residents', 'n_motor_vehicles',\n", + " 'has_medical_condition', 'ft_job', 'multiple_jobs',\n", + " 'highest_education_bachelor_s_degree',\n", + " 'highest_education_graduate_degree_or_professional_degree',\n", + " 'highest_education_high_school_graduate_or_ged',\n", + " 'highest_education_less_than_a_high_school_graduate',\n", + " 'highest_education_some_college_or_associates_degree',\n", + " 'primary_job_description_Clerical or administrative support',\n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming',\n", + " 'primary_job_description_Other',\n", + " 'primary_job_description_Professional, Manegerial, or Technical',\n", + " 'primary_job_description_Sales or service', 'gender_man',\n", + " 'gender_non_binary_genderqueer_gender_non_confor', 'gender_woman',\n", + " 'age_16___20_years_old', 'age_21___25_years_old',\n", + " 'age_26___30_years_old', 'age_31___35_years_old',\n", + " 'age_36___40_years_old', 'age_41___45_years_old',\n", + " 'age_51___55_years_old', 'age_56___60_years_old', 'av_walk',\n", + " 'av_unknown', 'av_no_trip', 'av_p_micro', 'av_transit', 'av_car',\n", + " 'av_ridehail', 'av_s_micro', 'av_s_car'\n", + " ],\n", + " 'nicr': [\n", + " 
'is_student', 'is_paid',\n", + " 'has_drivers_license', 'n_residents_u18', 'n_residence_members',\n", + " 'income_category', 'n_residents_with_license',\n", + " 'n_working_residents', 'n_motor_vehicles', 'has_medical_condition',\n", + " 'ft_job', 'multiple_jobs',\n", + " 'highest_education_high_school_graduate_or_ged',\n", + " 'highest_education_prefer_not_to_say', 'primary_job_description_Other',\n", + " 'gender_man', 'gender_woman', 'age_16___20_years_old', 'av_p_micro',\n", + " 'av_car', 'av_transit', 'av_ridehail', 'av_no_trip', 'av_s_car',\n", + " 'av_s_micro', 'av_unknown', 'av_walk'\n", + " ],\n", + " 'masscec': [\n", + " 'is_student', 'is_paid',\n", + " 'has_drivers_license', 'n_residents_u18', 'n_residence_members',\n", + " 'income_category', 'n_residents_with_license',\n", + " 'n_working_residents', 'n_motor_vehicles', 'has_medical_condition',\n", + " 'ft_job', 'multiple_jobs', 'highest_education_bachelor_s_degree',\n", + " 'highest_education_graduate_degree_or_professional_degree',\n", + " 'highest_education_high_school_graduate_or_ged',\n", + " 'highest_education_less_than_a_high_school_graduate',\n", + " 'highest_education_prefer_not_to_say',\n", + " 'highest_education_some_college_or_associates_degree',\n", + " 'primary_job_description_Clerical or administrative support',\n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming',\n", + " 'primary_job_description_Other',\n", + " 'primary_job_description_Prefer not to say',\n", + " 'primary_job_description_Professional, Manegerial, or Technical',\n", + " 'primary_job_description_Sales or service', 'gender_man',\n", + " 'gender_prefer_not_to_say', 'gender_woman', 'age_16___20_years_old',\n", + " 'age_21___25_years_old', 'age_26___30_years_old',\n", + " 'age_31___35_years_old', 'age_36___40_years_old',\n", + " 'age_41___45_years_old', 'age_46___50_years_old',\n", + " 'age_51___55_years_old', 'age_56___60_years_old',\n", + " 'age_61___65_years_old', 'age___65_years_old', 
'av_p_micro', 'av_s_car',\n", + " 'av_s_micro', 'av_transit', 'av_car', 'av_no_trip', 'av_unknown',\n", + " 'av_ridehail', 'av_walk'\n", + " ],\n", + " 'ride2own': [\n", + " 'has_drivers_license', 'is_student',\n", + " 'is_paid', 'income_category', 'n_residence_members',\n", + " 'n_working_residents', 'n_residents_u18', 'n_residents_with_license',\n", + " 'n_motor_vehicles', 'has_medical_condition',\n", + " 'ft_job', 'multiple_jobs',\n", + " 'highest_education_bachelor_s_degree',\n", + " 'highest_education_high_school_graduate_or_ged',\n", + " 'highest_education_less_than_a_high_school_graduate',\n", + " 'highest_education_some_college_or_associates_degree',\n", + " 'primary_job_description_Other',\n", + " 'primary_job_description_Professional, Manegerial, or Technical',\n", + " 'gender_man', 'gender_woman', 'age_31___35_years_old',\n", + " 'age_36___40_years_old', 'age_41___45_years_old',\n", + " 'age_51___55_years_old', 'av_no_trip', 'av_s_micro', 'av_transit',\n", + " 'av_car', 'av_ridehail', 'av_p_micro', 'av_s_car', 'av_walk',\n", + " 'av_unknown'\n", + " ]\n", + " }\n", " \n", " # Retain only the first instance of each user and subset the columns.\n", - " filtered = df.groupby('user_id').first()[demographics]\n", + " filtered = df.groupby('user_id').first()[demographics[CURRENT_DB]]\n", " \n", " # Get the targets.\n", " targets = df.groupby('user_id')['target'].apply(lambda x: x.value_counts().idxmax())\n", @@ -315,10 +441,6 @@ " # Reaching here means that we need to include trip summaries\n", " # -----------------------------------------------------------\n", " \n", - " # If trip summaries are to be used, then re-use the preprocessed availability features.\n", - " availability = df[['user_id'] + [c for c in df.columns if 'av_' in c]]\n", - " availability = availability.groupby('user_id').first()\n", - " \n", " # For every user, generate the global trip-level summaries.\n", " global_aggs = df.groupby('user_id').agg({'duration': 'mean', 'distance': 
'mean'})\n", " \n", @@ -339,7 +461,6 @@ " trip_features = trip_features.merge(right=global_aggs, left_index=True, right_index=True)\n", " \n", " # Finally, join with availability indicators and targets.\n", - " trip_features = trip_features.merge(right=availability, left_index=True, right_on='user_id')\n", " trip_features = trip_features.merge(right=targets, left_index=True, right_index=True)\n", " \n", " return trip_features.reset_index(drop=False)" @@ -387,12 +508,12 @@ "outputs": [], "source": [ "tsne_kwargs = {\n", - " 'perplexity': 6,\n", + " 'perplexity': min(len(demo_df)-1, 6),\n", " 'n_iter': 7500,\n", " 'metric': 'cosine'\n", "}\n", "\n", - "## PLOT BY THE WAY IN WHICH PEOPLE USE THE SAME REPLACED MODE AND CHECK THE SIMILARITY.\n", + "# ## PLOT BY THE WAY IN WHICH PEOPLE USE THE SAME REPLACED MODE AND CHECK THE SIMILARITY.\n", "\n", "projections = generate_tsne_plots(demo_df, **tsne_kwargs)" ] @@ -482,9 +603,14 @@ " \n", " elif metric == SimilarityMetric.KNN:\n", " \n", + " n_neighbors = metric_kwargs.pop('n_neighbors', 3)\n", + " \n", + " if n_neighbors >= len(tr):\n", + " return -1.\n", + " \n", " # Build the KNN classifier. 
By default, let it be 3.\n", " knn = KNeighborsClassifier(\n", - " n_neighbors=metric_kwargs.pop('n_neighbors', 3),\n", + " n_neighbors=n_neighbors,\n", " weights='distance',\n", " metric=metric_kwargs.pop('knn_metric', 'cosine'),\n", " n_jobs=os.cpu_count()\n", @@ -497,9 +623,14 @@ " \n", " elif metric == SimilarityMetric.KMEANS:\n", " \n", + " n_clusters = metric_kwargs.pop('n_clusters', 8)\n", + " \n", + " if n_clusters >= len(tr):\n", + " return -1\n", + " \n", " # Build the model.\n", " kmeans = KMeans(\n", - " n_clusters=metric_kwargs.pop('n_clusters', 8),\n", + " n_clusters=n_clusters,\n", " max_iter=metric_kwargs.pop('max_iter', 300),\n", " n_init='auto',\n", " random_state=SEED\n", @@ -632,6 +763,14 @@ " return best_model" ] }, + { + "cell_type": "markdown", + "id": "45fef6d1", + "metadata": {}, + "source": [ + "### Uncomment to run the model " + ] + }, { "cell_type": "code", "execution_count": null, @@ -639,7 +778,7 @@ "metadata": {}, "outputs": [], "source": [ - "model = estimate_using_model(train, test)" + "# model = estimate_using_model(train, test)" ] }, { @@ -677,7 +816,9 @@ "demo_plus_trips = get_demographic_data(\n", " df, \n", " trip_features=['mph', 'section_duration_argmax', 'section_distance_argmax', 'start_local_dt_hour', 'end_local_dt_hour']\n", - ")" + ")\n", + "\n", + "demo_plus_trips.fillna(0., inplace=True)" ] }, { @@ -732,6 +873,14 @@ "```" ] }, + { + "cell_type": "markdown", + "id": "85483fc4", + "metadata": {}, + "source": [ + "### Uncomment this to run the model" + ] + }, { "cell_type": "code", "execution_count": null, @@ -740,7 +889,7 @@ "outputs": [], "source": [ "# Now, we try with the model\n", - "estimate_using_model(train, test)" + "# estimate_using_model(train, test)" ] }, { @@ -805,7 +954,11 @@ }, "outputs": [], "source": [ - "_ = generate_tsne_plots(demo_plus_trips, perplexity=6, n_iter=7500)" + "_ = generate_tsne_plots(\n", + " demo_plus_trips, \n", + " perplexity=min(len(demo_plus_trips)-1, 6), \n", + " n_iter=7500\n", + 
")" ] }, { @@ -813,7 +966,9 @@ "id": "c339fcc6", "metadata": {}, "source": [ - "# Multi-level modeling" + "# (Experimental) Multi-level modeling\n", + "\n", + "## The code below onwards is not tested." ] }, { @@ -836,26 +991,26 @@ "metadata": {}, "outputs": [], "source": [ - "def drop_columns(df: pd.DataFrame):\n", - " to_drop = [\n", - " 'source', 'end_ts', 'end_fmt_time', 'end_loc', 'raw_trip', 'start_ts', \n", - " 'start_fmt_time', 'start_loc', 'duration', 'distance', 'start_place', \n", - " 'end_place', 'cleaned_trip', 'inferred_labels', 'inferred_trip', 'expectation',\n", - " 'confidence_threshold', 'expected_trip', 'user_input', 'start:year', 'start:month', \n", - " 'start:day', 'start_local_dt_minute', 'start_local_dt_second', \n", - " 'start_local_dt_weekday', 'start_local_dt_timezone', 'end:year', 'end:month', 'end:day', \n", - " 'end_local_dt_minute', 'end_local_dt_second', 'end_local_dt_weekday', \n", - " 'end_local_dt_timezone', '_id', 'metadata_write_ts', 'additions', \n", - " 'mode_confirm', 'purpose_confirm', 'Mode_confirm', 'Trip_purpose', \n", - " 'original_user_id', 'program', 'opcode', 'Timestamp', 'birth_year', \n", - " 'available_modes', 'section_coordinates_argmax', 'section_mode_argmax'\n", - " ]\n", - " \n", - " # Drop section_mode_argmax and available_modes.\n", - " return df.drop(\n", - " columns=to_drop, \n", - " inplace=False\n", - " )" + "# def drop_columns(df: pd.DataFrame):\n", + "# to_drop = [\n", + "# 'source', 'end_ts', 'end_fmt_time', 'end_loc', 'raw_trip', 'start_ts', \n", + "# 'start_fmt_time', 'start_loc', 'duration', 'distance', 'start_place', \n", + "# 'end_place', 'cleaned_trip', 'inferred_labels', 'inferred_trip', 'expectation',\n", + "# 'confidence_threshold', 'expected_trip', 'user_input', 'start:year', 'start:month', \n", + "# 'start:day', 'start_local_dt_minute', 'start_local_dt_second', \n", + "# 'start_local_dt_weekday', 'start_local_dt_timezone', 'end:year', 'end:month', 'end:day', \n", + "# 'end_local_dt_minute', 
'end_local_dt_second', 'end_local_dt_weekday', \n", + "# 'end_local_dt_timezone', '_id', 'metadata_write_ts', 'additions', \n", + "# 'mode_confirm', 'purpose_confirm', 'Mode_confirm', 'Trip_purpose', \n", + "# 'original_user_id', 'program', 'opcode', 'Timestamp', 'birth_year', \n", + "# 'available_modes', 'section_coordinates_argmax', 'section_mode_argmax'\n", + "# ]\n", + " \n", + "# # Drop section_mode_argmax and available_modes.\n", + "# return df.drop(\n", + "# columns=to_drop, \n", + "# inplace=False\n", + "# )" ] }, { @@ -865,32 +1020,32 @@ "metadata": {}, "outputs": [], "source": [ - "def construct_model_dictionary(train: pd.DataFrame):\n", - " \n", - " def train_on_user(user_id: str):\n", - " '''\n", - " Given the training set and the user ID to query, filter the dataset and\n", - " retain only the relevant trips. Then, create folds and optimize a model for this user.\n", - " Return the trained model instance.\n", - " '''\n", + "# def construct_model_dictionary(train: pd.DataFrame):\n", + " \n", + "# def train_on_user(user_id: str):\n", + "# '''\n", + "# Given the training set and the user ID to query, filter the dataset and\n", + "# retain only the relevant trips. 
Then, create folds and optimize a model for this user.\n", + "# Return the trained model instance.\n", + "# '''\n", " \n", - " user_data = train.loc[train.user_id == user_id, :].reset_index(drop=True)\n", + "# user_data = train.loc[train.user_id == user_id, :].reset_index(drop=True)\n", " \n", - " # Split user trips into train-test folds.\n", - " u_train, u_test = train_test_split(user_data, test_size=0.2, shuffle=True, random_state=SEED)\n", + "# # Split user trips into train-test folds.\n", + "# u_train, u_test = train_test_split(user_data, test_size=0.2, shuffle=True, random_state=SEED)\n", " \n", - " user_model = estimate_using_model(\n", - " u_train, u_test, \n", - " n_iter=100\n", - " )\n", + "# user_model = estimate_using_model(\n", + "# u_train, u_test, \n", + "# n_iter=100\n", + "# )\n", " \n", - " return user_model\n", + "# return user_model\n", " \n", - " for user in train.user_id.unique():\n", - " MODEL_DICT[user]['warm_start'] = train_on_user(user)\n", - " print(50*'=')\n", + "# for user in train.user_id.unique():\n", + "# MODEL_DICT[user]['warm_start'] = train_on_user(user)\n", + "# print(50*'=')\n", " \n", - " print(\"\\nDone!\")" + "# print(\"\\nDone!\")" ] }, { @@ -914,111 +1069,111 @@ "metadata": {}, "outputs": [], "source": [ - "class MultiLevelModel:\n", - " def __init__(self, model_dict: Dict, train: pd.DataFrame, test: pd.DataFrame, **model_kwargs):\n", + "# class MultiLevelModel:\n", + "# def __init__(self, model_dict: Dict, train: pd.DataFrame, test: pd.DataFrame, **model_kwargs):\n", " \n", - " self._demographics = [\n", - " 'primary_job_commute_time', 'income_category', 'n_residence_members', 'n_residents_u18', \n", - " 'n_residents_with_license', 'n_motor_vehicles', 'available_modes', 'age', 'gender_Man', \n", - " 'gender_Man;Nonbinary/genderqueer/genderfluid', 'gender_Nonbinary/genderqueer/genderfluid', \n", - " 'gender_Prefer not to say', 'gender_Woman', 'gender_Woman;Nonbinary/genderqueer/genderfluid', \n", - " 
'has_drivers_license_No', 'has_drivers_license_Prefer not to say', 'has_drivers_license_Yes', \n", - " 'has_multiple_jobs_No', 'has_multiple_jobs_Prefer not to say', 'has_multiple_jobs_Yes', \n", - " \"highest_education_Bachelor's degree\", 'highest_education_Graduate degree or professional degree', \n", - " 'highest_education_High school graduate or GED', 'highest_education_Less than a high school graduate', \n", - " 'highest_education_Prefer not to say', 'highest_education_Some college or associates degree', \n", - " 'primary_job_type_Full-time', 'primary_job_type_Part-time', 'primary_job_type_Prefer not to say', \n", - " 'primary_job_description_Clerical or administrative support', 'primary_job_description_Custodial', \n", - " 'primary_job_description_Education', 'primary_job_description_Food service', \n", - " 'primary_job_description_Manufacturing, construction, maintenance, or farming', \n", - " 'primary_job_description_Medical/healthcare', 'primary_job_description_Other', \n", - " 'primary_job_description_Professional, managerial, or technical', \n", - " 'primary_job_description_Sales or service', 'primary_job_commute_mode_Active transport', \n", - " 'primary_job_commute_mode_Car transport', 'primary_job_commute_mode_Hybrid', \n", - " 'primary_job_commute_mode_Public transport', 'primary_job_commute_mode_Unknown', \n", - " 'primary_job_commute_mode_WFH', 'is_overnight_trip', 'n_working_residents'\n", - " ]\n", + "# self._demographics = [\n", + "# 'primary_job_commute_time', 'income_category', 'n_residence_members', 'n_residents_u18', \n", + "# 'n_residents_with_license', 'n_motor_vehicles', 'available_modes', 'age', 'gender_Man', \n", + "# 'gender_Man;Nonbinary/genderqueer/genderfluid', 'gender_Nonbinary/genderqueer/genderfluid', \n", + "# 'gender_Prefer not to say', 'gender_Woman', 'gender_Woman;Nonbinary/genderqueer/genderfluid', \n", + "# 'has_drivers_license_No', 'has_drivers_license_Prefer not to say', 'has_drivers_license_Yes', \n", + "# 
'has_multiple_jobs_No', 'has_multiple_jobs_Prefer not to say', 'has_multiple_jobs_Yes', \n", + "# \"highest_education_Bachelor's degree\", 'highest_education_Graduate degree or professional degree', \n", + "# 'highest_education_High school graduate or GED', 'highest_education_Less than a high school graduate', \n", + "# 'highest_education_Prefer not to say', 'highest_education_Some college or associates degree', \n", + "# 'primary_job_type_Full-time', 'primary_job_type_Part-time', 'primary_job_type_Prefer not to say', \n", + "# 'primary_job_description_Clerical or administrative support', 'primary_job_description_Custodial', \n", + "# 'primary_job_description_Education', 'primary_job_description_Food service', \n", + "# 'primary_job_description_Manufacturing, construction, maintenance, or farming', \n", + "# 'primary_job_description_Medical/healthcare', 'primary_job_description_Other', \n", + "# 'primary_job_description_Professional, managerial, or technical', \n", + "# 'primary_job_description_Sales or service', 'primary_job_commute_mode_Active transport', \n", + "# 'primary_job_commute_mode_Car transport', 'primary_job_commute_mode_Hybrid', \n", + "# 'primary_job_commute_mode_Public transport', 'primary_job_commute_mode_Unknown', \n", + "# 'primary_job_commute_mode_WFH', 'is_overnight_trip', 'n_working_residents'\n", + "# ]\n", " \n", - " assert all([c in test.columns for c in self._demographics]), \"[test] Demographic features are missing!\"\n", - " assert all([c in train.columns for c in self._demographics]), \"[train] Demographic features are missing!\"\n", + "# assert all([c in test.columns for c in self._demographics]), \"[test] Demographic features are missing!\"\n", + "# assert all([c in train.columns for c in self._demographics]), \"[train] Demographic features are missing!\"\n", " \n", - " self._mdict = model_dict\n", - " self._train = train\n", - " self._test = test\n", - " self.metric = model_kwargs.pop('metric', SimilarityMetric.COSINE)\n", + "# 
self._mdict = model_dict\n", + "# self._train = train\n", + "# self._test = test\n", + "# self.metric = model_kwargs.pop('metric', SimilarityMetric.COSINE)\n", " \n", " \n", - " def _phase1(self):\n", + "# def _phase1(self):\n", " \n", - " tr = self._train.copy()\n", - " te = self._test.copy()\n", + "# tr = self._train.copy()\n", + "# te = self._test.copy()\n", " \n", - " if tr.columns.isin(['user_id', 'target']).sum() == 2:\n", - " tr = tr.drop(columns=['user_id', 'target']).reset_index(drop=True)\n", + "# if tr.columns.isin(['user_id', 'target']).sum() == 2:\n", + "# tr = tr.drop(columns=['user_id', 'target']).reset_index(drop=True)\n", " \n", - " if te.columns.isin(['user_id', 'target']).sum() == 2:\n", - " te = te.drop(columns=['user_id', 'target']).reset_index(drop=True)\n", + "# if te.columns.isin(['user_id', 'target']).sum() == 2:\n", + "# te = te.drop(columns=['user_id', 'target']).reset_index(drop=True)\n", "\n", - " te_users = self._test.user_id.tolist()\n", + "# te_users = self._test.user_id.tolist()\n", "\n", - " if self.metric == SimilarityMetric.COSINE:\n", + "# if self.metric == SimilarityMetric.COSINE:\n", "\n", - " sim = cosine_similarity(te.values, tr.values)\n", + "# sim = cosine_similarity(te.values, tr.values)\n", "\n", - " # Compute the argmax across the train set.\n", - " argmax = np.argmax(sim, axis=1)\n", + "# # Compute the argmax across the train set.\n", + "# argmax = np.argmax(sim, axis=1)\n", "\n", - " # Retrieve the user_id at these indices.\n", - " train_users = self._train.loc[argmax, 'user_id']\n", + "# # Retrieve the user_id at these indices.\n", + "# train_users = self._train.loc[argmax, 'user_id']\n", "\n", - " elif self.metric == SimilarityMetric.EUCLIDEAN:\n", + "# elif self.metric == SimilarityMetric.EUCLIDEAN:\n", "\n", - " sim = euclidean_distances(te.values, tr.values)\n", + "# sim = euclidean_distances(te.values, tr.values)\n", "\n", - " # Compute the argmin here!\n", - " argmin = np.argmin(sim, axis=1)\n", + "# # Compute 
the argmin here!\n", + "# argmin = np.argmin(sim, axis=1)\n", "\n", - " # Retrieve the train user_ids.\n", - " train_users = self._train.loc[argmin, 'user_id']\n", + "# # Retrieve the train user_ids.\n", + "# train_users = self._train.loc[argmin, 'user_id']\n", "\n", - " return pd.DataFrame({'test_user_id': te_users, 'train_user_id': train_users})\n", + "# return pd.DataFrame({'test_user_id': te_users, 'train_user_id': train_users})\n", " \n", " \n", - " def _phase2(self, sim_df: pd.DataFrame, cold_start: bool):\n", + "# def _phase2(self, sim_df: pd.DataFrame, cold_start: bool):\n", " \n", - " prediction_df = list()\n", + "# prediction_df = list()\n", " \n", - " # Now, we use the sim_df to run inference based on whether \n", - " for ix, row in sim_df.iterrows():\n", - " train_user = row['train_user_id']\n", + "# # Now, we use the sim_df to run inference based on whether \n", + "# for ix, row in sim_df.iterrows():\n", + "# train_user = row['train_user_id']\n", " \n", - " # Retrieve the appropriate model.\n", - " user_models = self._mdict.get(train_user, None)\n", + "# # Retrieve the appropriate model.\n", + "# user_models = self._mdict.get(train_user, None)\n", " \n", - " start_type = 'cold_start' if cold_start else 'warm_start'\n", + "# start_type = 'cold_start' if cold_start else 'warm_start'\n", " \n", - " # which specific model?\n", - " sp_model = user_models.get(start_type, None)\n", + "# # which specific model?\n", + "# sp_model = user_models.get(start_type, None)\n", " \n", - " # Now get the test user data.\n", - " test_user = row['test_user_id']\n", + "# # Now get the test user data.\n", + "# test_user = row['test_user_id']\n", " \n", - " if cold_start:\n", - " test_data = self._test.loc[self._test.user_id == test_user, self._demographics]\n", - " test_data = test_data.iloc[0, :]\n", - " else:\n", - " test_data = self._test.loc[self._test.user_id == test_user, :]\n", + "# if cold_start:\n", + "# test_data = self._test.loc[self._test.user_id == test_user, 
self._demographics]\n", + "# test_data = test_data.iloc[0, :]\n", + "# else:\n", + "# test_data = self._test.loc[self._test.user_id == test_user, :]\n", " \n", - " predictions = sp_model.predict(test_data)\n", + "# predictions = sp_model.predict(test_data)\n", " \n", - " print(f\"test: [{test_user}], predictions: {predictions}\")\n", + "# print(f\"test: [{test_user}], predictions: {predictions}\")\n", " \n", " \n", - " def execute_pipeline(self, cold_start: bool = False):\n", - " # For each test user, get the most similar train user.\n", - " sim_df = self._phase1()\n", + "# def execute_pipeline(self, cold_start: bool = False):\n", + "# # For each test user, get the most similar train user.\n", + "# sim_df = self._phase1()\n", " \n", - " predictions = self._phase2(sim_df, cold_start)" + "# predictions = self._phase2(sim_df, cold_start)" ] }, { @@ -1028,11 +1183,11 @@ "metadata": {}, "outputs": [], "source": [ - "# FULL DATA.\n", - "train = df.loc[df.user_id.isin(TRAIN_USERS), :]\n", - "test = df.loc[df.user_id.isin(TEST_USERS), :]\n", + "# # FULL DATA.\n", + "# train = df.loc[df.user_id.isin(TRAIN_USERS), :]\n", + "# test = df.loc[df.user_id.isin(TEST_USERS), :]\n", "\n", - "train_counts = train.user_id.value_counts()" + "# train_counts = train.user_id.value_counts()" ] }, { @@ -1042,14 +1197,14 @@ "metadata": {}, "outputs": [], "source": [ - "## We only want to train on users who have a good number of trips.\n", - "good_users = train_counts[train_counts >= 100].index\n", + "# ## We only want to train on users who have a good number of trips.\n", + "# good_users = train_counts[train_counts >= 100].index\n", "\n", - "bad_users = train_counts[train_counts < 100].index\n", + "# bad_users = train_counts[train_counts < 100].index\n", "\n", - "print(f\"Number of users filtered out of training: {len(bad_users)}\")\n", + "# print(f\"Number of users filtered out of training: {len(bad_users)}\")\n", "\n", - "filtered_train = train.loc[train.user_id.isin(good_users), :]" + "# 
filtered_train = train.loc[train.user_id.isin(good_users), :]" ] }, { @@ -1059,10 +1214,10 @@ "metadata": {}, "outputs": [], "source": [ - "# Full data.\n", + "# # Full data.\n", "\n", - "train_df = drop_columns(filtered_train)\n", - "test_df = drop_columns(test)" + "# train_df = drop_columns(filtered_train)\n", + "# test_df = drop_columns(test)" ] }, { @@ -1072,7 +1227,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(train_df.shape, test_df.shape)" + "# print(train_df.shape, test_df.shape)" ] }, { @@ -1084,7 +1239,7 @@ }, "outputs": [], "source": [ - "model_dict = construct_model_dictionary(train_df)" + "# model_dict = construct_model_dictionary(train_df)" ] }, { diff --git a/replacement_mode_modeling/04_FeatureClustering.ipynb b/replacement_mode_modeling/04_FeatureClustering.ipynb index 094d84c..1ee33f6 100644 --- a/replacement_mode_modeling/04_FeatureClustering.ipynb +++ b/replacement_mode_modeling/04_FeatureClustering.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "789df947", "metadata": {}, "outputs": [], @@ -39,16 +39,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "aea4dda7", "metadata": {}, "outputs": [], "source": [ - "DATA_SOURCES = [\n", + "DATA_SOURCE = [\n", " ('./data/filtered_data/preprocessed_data_Stage_database.csv', 'allceo'),\n", " ('./data/filtered_data/preprocessed_data_openpath_prod_durham.csv', 'durham'),\n", - " ('./data/filtered_data/preprocessed_data_openpath_prod_ride2own.csv', 'ride2own'),\n", " ('./data/filtered_data/preprocessed_data_openpath_prod_mm_masscec.csv', 'masscec'),\n", + " ('./data/filtered_data/preprocessed_data_openpath_prod_ride2own.csv', 'ride2own'),\n", " ('./data/filtered_data/preprocessed_data_openpath_prod_uprm_nicr.csv', 'nicr')\n", "]\n", "\n", @@ -58,31 +58,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "33ef3275", "metadata": {}, "outputs": [], "source": [ "# 
Change this name to something unique\n", - "CURRENT_DB = DATA_SOURCES[DB_NUMBER][1]\n", - "PATH = DATA_SOURCES[DB_NUMBER][0]\n", + "PATH = DATA_SOURCE[DB_NUMBER][0]\n", + "CURRENT_DB = DATA_SOURCE[DB_NUMBER][1]\n", "\n", "df = pd.read_csv(PATH)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "d0d884a3", + "execution_count": 4, + "id": "d6f69976", "metadata": {}, "outputs": [], "source": [ - "df.target.value_counts()" + "df.dropna(inplace=True)\n", + "\n", + "not_needed = ['deprecatedID', 'data.key']\n", + "\n", + "for col in not_needed:\n", + " if col in df.columns:\n", + " df.drop(columns=[col], inplace=True)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "b2281bdc", "metadata": {}, "outputs": [], @@ -95,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "9c22d6ac", "metadata": {}, "outputs": [], @@ -107,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "063f6124", "metadata": {}, "outputs": [], @@ -117,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "cef8d45b", "metadata": {}, "outputs": [], @@ -131,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "68c6af2d", "metadata": {}, "outputs": [], @@ -141,7 +147,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "eff378a7", "metadata": {}, "outputs": [], @@ -151,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "cffbd401", "metadata": {}, "outputs": [], @@ -161,7 +167,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "f1eb1633", "metadata": {}, "outputs": [], @@ -175,7 +181,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "d9cc0a0f", "metadata": { "scrolled": true @@ -191,7 +197,7 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 14, "id": "750fbd0c", "metadata": {}, "outputs": [], @@ -209,46 +215,270 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "1c3d1849", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
coverage_bicyclingcoverage_carcoverage_transitcoverage_unknowncoverage_walkingpct_distance_bicyclingpct_distance_carpct_distance_transitpct_distance_unknownpct_distance_walkingn_tripsstart:hourend:hour
user_id
0600d3df-c1aa-4ca2-83f2-1f6b8931280d0.0500000.8500000.00.0000000.1000000.0092820.9849540.00.0000000.0057640.0833330.4121180.650288
44eda4da-9223-4bb0-afd4-e7dd19fc6b270.0650410.7804880.00.0650410.0894310.0381800.9126930.00.0331710.0159570.7435900.1498770.149877
4c5436e9-4840-4872-9e8f-5d46ba81fe520.0000000.7142860.00.0000000.2857140.0000000.8472470.00.0000000.1527530.0000000.6502880.650288
7479810c-c602-4508-8ae2-da0bed87558d0.1162790.4534880.00.1279070.3023260.0155300.9264890.00.0270850.0308960.5064100.1498770.990607
7f7c9d3b-84ed-4c14-be8a-aa256daaed010.0175440.7719300.00.0350880.1754390.0051570.8543270.00.1100330.0304830.3205130.9906070.990607
\n", + "
" + ], + "text/plain": [ + " coverage_bicycling coverage_car \\\n", + "user_id \n", + "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.050000 0.850000 \n", + "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.065041 0.780488 \n", + "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.000000 0.714286 \n", + "7479810c-c602-4508-8ae2-da0bed87558d 0.116279 0.453488 \n", + "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.017544 0.771930 \n", + "\n", + " coverage_transit coverage_unknown \\\n", + "user_id \n", + "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.0 0.000000 \n", + "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.0 0.065041 \n", + "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.0 0.000000 \n", + "7479810c-c602-4508-8ae2-da0bed87558d 0.0 0.127907 \n", + "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.0 0.035088 \n", + "\n", + " coverage_walking \\\n", + "user_id \n", + "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.100000 \n", + "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.089431 \n", + "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.285714 \n", + "7479810c-c602-4508-8ae2-da0bed87558d 0.302326 \n", + "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.175439 \n", + "\n", + " pct_distance_bicycling \\\n", + "user_id \n", + "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.009282 \n", + "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.038180 \n", + "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.000000 \n", + "7479810c-c602-4508-8ae2-da0bed87558d 0.015530 \n", + "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.005157 \n", + "\n", + " pct_distance_car pct_distance_transit \\\n", + "user_id \n", + "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.984954 0.0 \n", + "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.912693 0.0 \n", + "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.847247 0.0 \n", + "7479810c-c602-4508-8ae2-da0bed87558d 0.926489 0.0 \n", + "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.854327 0.0 \n", + "\n", + " pct_distance_unknown \\\n", + "user_id \n", + "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.000000 \n", + "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.033171 \n", + "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.000000 \n", + 
"7479810c-c602-4508-8ae2-da0bed87558d 0.027085 \n", + "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.110033 \n", + "\n", + " pct_distance_walking n_trips \\\n", + "user_id \n", + "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.005764 0.083333 \n", + "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.015957 0.743590 \n", + "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.152753 0.000000 \n", + "7479810c-c602-4508-8ae2-da0bed87558d 0.030896 0.506410 \n", + "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.030483 0.320513 \n", + "\n", + " start:hour end:hour \n", + "user_id \n", + "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.412118 0.650288 \n", + "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.149877 0.149877 \n", + "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.650288 0.650288 \n", + "7479810c-c602-4508-8ae2-da0bed87558d 0.149877 0.990607 \n", + "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.990607 0.990607 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "figure1_df.head()" ] }, + { + "cell_type": "markdown", + "id": "aa9f5a04", + "metadata": {}, + "source": [ + "### Uncomment the following if you want to find the best eps." 
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "598d82bc", "metadata": {}, "outputs": [], "source": [ - "epsilons = np.linspace(1e-3, 1., 1000)\n", + "# epsilons = np.linspace(1e-3, 1., 1000)\n", "\n", - "best_eps = -np.inf\n", - "best_score = -np.inf\n", + "# best_eps = -np.inf\n", + "# best_score = -np.inf\n", "\n", - "for eps in epsilons:\n", - " model = DBSCAN(eps=eps).fit(figure1_df)\n", + "# for eps in epsilons:\n", + "# model = DBSCAN(eps=eps).fit(figure1_df)\n", " \n", - " if len(np.unique(model.labels_)) < 2:\n", - " continue\n", + "# if len(np.unique(model.labels_)) < 2:\n", + "# continue\n", " \n", - " score = silhouette_score(figure1_df, model.labels_)\n", - " if score > best_score:\n", - " best_eps = eps\n", - " best_score = score\n", + "# score = silhouette_score(figure1_df, model.labels_)\n", + "# if score > best_score:\n", + "# best_eps = eps\n", + "# best_score = score\n", "\n", - "print(best_eps)" + "# print(best_eps)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "bc89a42d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Counter({0: 8, -1: 4})\n" + ] + } + ], "source": [ "'''\n", "AlLCEO: eps=0.542\n", @@ -263,12 +493,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "05c9a7c4", "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4 users in cluster -1\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8 users in cluster 0\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# After clustering, we would like to see what the replaced mode argmax distribution in each cluster is.\n", "\n", @@ -297,7 +562,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "f2e8e117", "metadata": {}, "outputs": [], @@ -314,7 +579,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "99369dba", "metadata": {}, "outputs": [], @@ -324,7 +589,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "6cca3671", "metadata": {}, "outputs": [], @@ -345,7 +610,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "18093734", "metadata": {}, "outputs": [], @@ -358,10 +623,200 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "8001a140", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pct_trips_unknownpct_trips_cardistance_unknowndistance_carduration_carduration_unknown
0600d3df-c1aa-4ca2-83f2-1f6b8931280d1.00.00.1669780.00.00.031289
44eda4da-9223-4bb0-afd4-e7dd19fc6b271.00.00.3763080.00.00.362037
4c5436e9-4840-4872-9e8f-5d46ba81fe521.00.00.0000000.00.00.000000
7479810c-c602-4508-8ae2-da0bed87558d1.00.00.8021320.00.00.447344
7f7c9d3b-84ed-4c14-be8a-aa256daaed011.00.00.2093200.00.00.172709
892088f9-4a27-4f39-91fb-0f5e48d189821.00.00.9825190.00.00.705049
993af3be-5011-44ad-b9cd-d4df7f0e67ad1.00.00.6593890.00.01.000000
c8158323-957d-43c7-bde6-193b99ee72b51.00.00.1004480.00.00.030035
cbed6b7b-555d-43a0-aadc-4a42540a024e1.00.00.3736100.00.00.228214
de83c290-7708-4f8b-8ca3-656072164ef60.01.00.5359491.01.00.700681
f3b93934-09ca-4b90-9089-51b5777bb9e71.00.01.0000000.00.00.740508
f8260067-8ba9-44ea-9c39-cd3e1bd003dd1.00.00.2326130.00.00.250613
\n", + "
" + ], + "text/plain": [ + " pct_trips_unknown pct_trips_car \\\n", + "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 1.0 0.0 \n", + "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 1.0 0.0 \n", + "4c5436e9-4840-4872-9e8f-5d46ba81fe52 1.0 0.0 \n", + "7479810c-c602-4508-8ae2-da0bed87558d 1.0 0.0 \n", + "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 1.0 0.0 \n", + "892088f9-4a27-4f39-91fb-0f5e48d18982 1.0 0.0 \n", + "993af3be-5011-44ad-b9cd-d4df7f0e67ad 1.0 0.0 \n", + "c8158323-957d-43c7-bde6-193b99ee72b5 1.0 0.0 \n", + "cbed6b7b-555d-43a0-aadc-4a42540a024e 1.0 0.0 \n", + "de83c290-7708-4f8b-8ca3-656072164ef6 0.0 1.0 \n", + "f3b93934-09ca-4b90-9089-51b5777bb9e7 1.0 0.0 \n", + "f8260067-8ba9-44ea-9c39-cd3e1bd003dd 1.0 0.0 \n", + "\n", + " distance_unknown distance_car \\\n", + "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.166978 0.0 \n", + "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.376308 0.0 \n", + "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.000000 0.0 \n", + "7479810c-c602-4508-8ae2-da0bed87558d 0.802132 0.0 \n", + "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.209320 0.0 \n", + "892088f9-4a27-4f39-91fb-0f5e48d18982 0.982519 0.0 \n", + "993af3be-5011-44ad-b9cd-d4df7f0e67ad 0.659389 0.0 \n", + "c8158323-957d-43c7-bde6-193b99ee72b5 0.100448 0.0 \n", + "cbed6b7b-555d-43a0-aadc-4a42540a024e 0.373610 0.0 \n", + "de83c290-7708-4f8b-8ca3-656072164ef6 0.535949 1.0 \n", + "f3b93934-09ca-4b90-9089-51b5777bb9e7 1.000000 0.0 \n", + "f8260067-8ba9-44ea-9c39-cd3e1bd003dd 0.232613 0.0 \n", + "\n", + " duration_car duration_unknown \n", + "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.0 0.031289 \n", + "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.0 0.362037 \n", + "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.0 0.000000 \n", + "7479810c-c602-4508-8ae2-da0bed87558d 0.0 0.447344 \n", + "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.0 0.172709 \n", + "892088f9-4a27-4f39-91fb-0f5e48d18982 0.0 0.705049 \n", + "993af3be-5011-44ad-b9cd-d4df7f0e67ad 0.0 1.000000 \n", + "c8158323-957d-43c7-bde6-193b99ee72b5 0.0 0.030035 \n", + "cbed6b7b-555d-43a0-aadc-4a42540a024e 
0.0 0.228214 \n", + "de83c290-7708-4f8b-8ca3-656072164ef6 1.0 0.700681 \n", + "f3b93934-09ca-4b90-9089-51b5777bb9e7 0.0 0.740508 \n", + "f8260067-8ba9-44ea-9c39-cd3e1bd003dd 0.0 0.250613 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "target_df = user_target_pct.merge(right=target_distance, left_index=True, right_index=True).merge(\n", " right=target_duration, left_index=True, right_index=True\n", @@ -378,47 +833,66 @@ "display(target_df)" ] }, + { + "cell_type": "markdown", + "id": "eba4f246", + "metadata": {}, + "source": [ + "### Uncomment if you want to find the best eps" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "31fecc00", "metadata": {}, "outputs": [], "source": [ - "epsilons = np.linspace(5e-3, 1., 1500)\n", - "best_score = -np.inf\n", - "best_eps = None\n", - "best_n = None\n", - "# alpha = 0.7\n", - "beta = 0.05\n", - "\n", - "for eps in epsilons:\n", - " for n in range(2, 30):\n", - " labels = DBSCAN(eps=eps, min_samples=n).fit(target_df).labels_\n", + "# epsilons = np.linspace(5e-3, 1., 1500)\n", + "# best_score = -np.inf\n", + "# best_eps = None\n", + "# best_n = None\n", + "# # alpha = 0.7\n", + "# beta = 0.05\n", + "\n", + "# for eps in epsilons:\n", + "# for n in range(2, 30):\n", + "# labels = DBSCAN(eps=eps, min_samples=n).fit(target_df).labels_\n", " \n", - " n_unique = np.unique(labels)\n", - " n_outliers = len(labels[labels == -1])\n", + "# n_unique = np.unique(labels)\n", + "# n_outliers = len(labels[labels == -1])\n", " \n", - " if n_outliers == len(labels) or len(n_unique) < 2:\n", - " continue\n", + "# if n_outliers == len(labels) or len(n_unique) < 2:\n", + "# continue\n", " \n", - " # Encourage more clustering and discourage more outliers.\n", - " score = silhouette_score(target_df, labels) + (len(labels) - n_outliers)/n_outliers\n", + "# # Encourage more clustering and discourage more outliers.\n", + "# score = silhouette_score(target_df, labels) + 
(len(labels) - n_outliers)/n_outliers\n", " \n", - " if score > best_score:\n", - " best_score = score\n", - " best_eps = eps\n", - " best_n = n\n", + "# if score > best_score:\n", + "# best_score = score\n", + "# best_eps = eps\n", + "# best_n = n\n", "\n", - "print(f\"{best_score=}, {best_n=}, {best_eps=}\")" + "# print(f\"{best_score=}, {best_n=}, {best_eps=}\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "e39b41ba", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({0: 11, -1: 1})" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# 0.35 is a good value\n", "\n", @@ -428,7 +902,7 @@ "masscec: min_samples=2, eps=0.986724482988659\n", "'''\n", "\n", - "cl2 = DBSCAN(eps=best_eps, min_samples=2).fit(target_df)\n", + "cl2 = DBSCAN(eps=0.6, min_samples=2).fit(target_df)\n", "# cl2 = KMeans(n_clusters=5).fit(target_df)\n", "\n", "Counter(cl2.labels_)" @@ -436,10 +910,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "1dbf8763", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from sklearn.decomposition import PCA\n", "\n", @@ -454,17 +939,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "id": "1e444316", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['duration', 'distance', 'start:hour', 'end:hour', 'user_id', 'target', 'section_mode_argmax', 'section_distance_argmax', 'section_duration_argmax', 'is_student', 'is_paid', 'has_drivers_license', 'n_residents_u18', 'n_residence_members', 'income_category', 'available_modes', 'n_residents_with_license', 'n_working_residents', 'n_motor_vehicles', 'has_medical_condition', 'ft_job', 'multiple_jobs', 'highest_education_bachelor_s_degree', 'highest_education_high_school_graduate_or_ged', 'highest_education_prefer_not_to_say', 'highest_education_some_college_or_associates_degree', 'primary_job_description_Clerical or administrative support', 'primary_job_description_Other', 'gender_man', 'gender_woman', 'age_16___20_years_old', 'age_21___25_years_old', 'age_26___30_years_old', 'av_ridehail', 'av_p_micro', 'av_walk', 'av_transit', 'av_car', 'av_s_micro', 'av_s_car', 'av_unknown', 'av_no_trip', 'cost_ridehail', 'cost_p_micro', 'cost_walk', 'cost_transit', 'cost_car', 'cost_s_micro', 'cost_s_car', 'cost_unknown', 'cost_no_trip', 'mph']\n" + ] + } + ], "source": [ "print(df.columns.tolist())" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "id": "f0bc09b9", "metadata": {}, "outputs": [], @@ -479,57 +972,119 @@ "\n", "\n", "demographic_cols = {\n", - " 'Stage_database': [\n", - " 'has_drivers_license', 'is_student', 'is_paid', \n", - " 'income_category', 'n_residence_members', 'n_residents_u18', 'n_residents_with_license', \n", - " 'n_motor_vehicles', 'has_medical_condition', 'ft_job', 'multiple_jobs', \n", - " 'n_working_residents', \"highest_education_Bachelor's degree\", \n", - " 
'highest_education_Graduate degree or professional degree', \n", - " 'highest_education_High school graduate or GED', 'highest_education_Less than a high school graduate', \n", - " 'highest_education_Prefer not to say', 'highest_education_Some college or associates degree', \n", - " 'primary_job_description_Clerical or administrative support', 'primary_job_description_Custodial', \n", - " 'primary_job_description_Education', 'primary_job_description_Food service', \n", - " 'primary_job_description_Linecook', \n", - " 'primary_job_description_Manufacturing, construction, maintenance, or farming', \n", - " 'primary_job_description_Medical/healthcare', 'primary_job_description_Non-profit program manager', \n", - " 'primary_job_description_Other', 'primary_job_description_Professional, managerial, or technical', \n", - " 'primary_job_description_Sales or service', 'primary_job_description_Self employed', \n", - " 'primary_job_description_food service', 'gender_Man', 'gender_Nonbinary/genderqueer/genderfluid', \n", - " 'gender_Prefer not to say', 'gender_Woman', 'gender_Woman;Nonbinary/genderqueer/genderfluid', \n", - " 'av_transit', 'av_no_trip', 'av_p_micro', 'av_s_micro', 'av_ridehail', 'av_unknown', 'av_walk', 'av_car', \n", - " 'av_s_car'\n", - " ] + [c for c in df.columns if 'age' in c],\n", - " 'durham': [\n", - " 'is_student', 'is_paid', 'has_drivers_license', \n", - " 'n_residents_u18', 'n_residence_members', 'income_category',\n", - " 'n_residents_with_license', 'n_working_residents', 'n_motor_vehicles', 'has_medical_condition', \n", - " 'ft_job', 'multiple_jobs', 'highest_education_bachelor_s_degree', \n", - " 'highest_education_graduate_degree_or_professional_degree', \n", - " 'highest_education_high_school_graduate_or_ged', 'highest_education_less_than_a_high_school_graduate', \n", - " 'highest_education_some_college_or_associates_degree', \n", - " 'primary_job_description_Clerical or administrative support', \n", - " 'primary_job_description_Manufacturing, 
construction, maintenance, or farming', \n", - " 'primary_job_description_Other', 'primary_job_description_Professional, Manegerial, or Technical', \n", - " 'primary_job_description_Sales or service', 'gender_man', \n", - " 'gender_non_binary_genderqueer_gender_non_confor', 'gender_woman', \n", - " 'av_walk', 'av_unknown', 'av_no_trip', 'av_p_micro', 'av_transit', 'av_car', 'av_ridehail', \n", - " 'av_s_micro', 'av_s_car'\n", - " ] + [c for c in df.columns if 'age' in c],\n", - " 'masscec': [\n", - " 'is_student', 'is_paid', 'has_drivers_license', 'n_residents_u18', 'n_residence_members', \n", - " 'income_category', 'n_residents_with_license', 'n_working_residents', 'n_motor_vehicles', \n", - " 'has_medical_condition', 'ft_job', 'multiple_jobs', 'highest_education_bachelor_s_degree', \n", - " 'highest_education_graduate_degree_or_professional_degree', \n", - " 'highest_education_high_school_graduate_or_ged', 'highest_education_less_than_a_high_school_graduate', \n", - " 'highest_education_prefer_not_to_say', 'highest_education_some_college_or_associates_degree', \n", - " 'primary_job_description_Clerical or administrative support', \n", - " 'primary_job_description_Manufacturing, construction, maintenance, or farming', \n", - " 'primary_job_description_Other', 'primary_job_description_Prefer not to say', \n", - " 'primary_job_description_Professional, Manegerial, or Technical', \n", - " 'primary_job_description_Sales or service', 'gender_man', 'gender_prefer_not_to_say', 'gender_woman', \n", - " 'av_p_micro', 'av_s_car', 'av_s_micro', 'av_transit', 'av_car', 'av_no_trip', 'av_unknown', \n", - " 'av_ridehail', 'av_walk'\n", - " ] + [c for c in df.columns if 'age' in c],\n", + " 'allceo': [\n", + " 'has_drivers_license', 'is_student', 'is_paid', 'income_category',\n", + " 'n_residence_members', 'n_residents_u18', 'n_residents_with_license',\n", + " 'n_motor_vehicles', 'has_medical_condition',\n", + " 'ft_job', 'multiple_jobs', 'n_working_residents',\n", + " 
\"highest_education_Bachelor's degree\",\n", + " 'highest_education_Graduate degree or professional degree',\n", + " 'highest_education_High school graduate or GED',\n", + " 'highest_education_Less than a high school graduate',\n", + " 'highest_education_Prefer not to say',\n", + " 'highest_education_Some college or associates degree',\n", + " 'primary_job_description_Clerical or administrative support',\n", + " 'primary_job_description_Custodial',\n", + " 'primary_job_description_Education',\n", + " 'primary_job_description_Food service',\n", + " 'primary_job_description_Linecook',\n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming',\n", + " 'primary_job_description_Medical/healthcare',\n", + " 'primary_job_description_Non-profit program manager',\n", + " 'primary_job_description_Other',\n", + " 'primary_job_description_Professional, managerial, or technical',\n", + " 'primary_job_description_Sales or service',\n", + " 'primary_job_description_Self employed',\n", + " 'primary_job_description_food service', 'gender_Man',\n", + " 'gender_Nonbinary/genderqueer/genderfluid', 'gender_Prefer not to say',\n", + " 'gender_Woman', 'gender_Woman;Nonbinary/genderqueer/genderfluid',\n", + " 'age_16___20_years_old', 'age_21___25_years_old',\n", + " 'age_26___30_years_old', 'age_31___35_years_old',\n", + " 'age_36___40_years_old', 'age_41___45_years_old',\n", + " 'age_46___50_years_old', 'age_51___55_years_old',\n", + " 'age_56___60_years_old', 'age_61___65_years_old', 'age___65_years_old',\n", + " 'av_transit', 'av_no_trip', 'av_p_micro', 'av_s_micro', 'av_ridehail',\n", + " 'av_unknown', 'av_walk', 'av_car', 'av_s_car'\n", + " ],\n", + " 'durham': [\n", + " 'is_student', 'is_paid', 'has_drivers_license', 'n_residents_u18',\n", + " 'n_residence_members', 'income_category',\n", + " 'n_residents_with_license', 'n_working_residents', 'n_motor_vehicles',\n", + " 'has_medical_condition', 'ft_job', 'multiple_jobs',\n", + " 
'highest_education_bachelor_s_degree',\n", + " 'highest_education_graduate_degree_or_professional_degree',\n", + " 'highest_education_high_school_graduate_or_ged',\n", + " 'highest_education_less_than_a_high_school_graduate',\n", + " 'highest_education_some_college_or_associates_degree',\n", + " 'primary_job_description_Clerical or administrative support',\n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming',\n", + " 'primary_job_description_Other',\n", + " 'primary_job_description_Professional, Manegerial, or Technical',\n", + " 'primary_job_description_Sales or service', 'gender_man',\n", + " 'gender_non_binary_genderqueer_gender_non_confor', 'gender_woman',\n", + " 'age_16___20_years_old', 'age_21___25_years_old',\n", + " 'age_26___30_years_old', 'age_31___35_years_old',\n", + " 'age_36___40_years_old', 'age_41___45_years_old',\n", + " 'age_51___55_years_old', 'age_56___60_years_old', 'av_walk',\n", + " 'av_unknown', 'av_no_trip', 'av_p_micro', 'av_transit', 'av_car',\n", + " 'av_ridehail', 'av_s_micro', 'av_s_car'\n", + " ],\n", + " 'nicr': [\n", + " 'is_student', 'is_paid',\n", + " 'has_drivers_license', 'n_residents_u18', 'n_residence_members',\n", + " 'income_category', 'n_residents_with_license',\n", + " 'n_working_residents', 'n_motor_vehicles', 'has_medical_condition',\n", + " 'ft_job', 'multiple_jobs',\n", + " 'highest_education_high_school_graduate_or_ged',\n", + " 'highest_education_prefer_not_to_say', 'primary_job_description_Other',\n", + " 'gender_man', 'gender_woman', 'age_16___20_years_old', 'av_p_micro',\n", + " 'av_car', 'av_transit', 'av_ridehail', 'av_no_trip', 'av_s_car',\n", + " 'av_s_micro', 'av_unknown', 'av_walk'\n", + " ],\n", + " 'masscec': [\n", + " 'is_student', 'is_paid',\n", + " 'has_drivers_license', 'n_residents_u18', 'n_residence_members',\n", + " 'income_category', 'n_residents_with_license',\n", + " 'n_working_residents', 'n_motor_vehicles', 'has_medical_condition',\n", + " 'ft_job', 
'multiple_jobs', 'highest_education_bachelor_s_degree',\n", + " 'highest_education_graduate_degree_or_professional_degree',\n", + " 'highest_education_high_school_graduate_or_ged',\n", + " 'highest_education_less_than_a_high_school_graduate',\n", + " 'highest_education_prefer_not_to_say',\n", + " 'highest_education_some_college_or_associates_degree',\n", + " 'primary_job_description_Clerical or administrative support',\n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming',\n", + " 'primary_job_description_Other',\n", + " 'primary_job_description_Prefer not to say',\n", + " 'primary_job_description_Professional, Manegerial, or Technical',\n", + " 'primary_job_description_Sales or service', 'gender_man',\n", + " 'gender_prefer_not_to_say', 'gender_woman', 'age_16___20_years_old',\n", + " 'age_21___25_years_old', 'age_26___30_years_old',\n", + " 'age_31___35_years_old', 'age_36___40_years_old',\n", + " 'age_41___45_years_old', 'age_46___50_years_old',\n", + " 'age_51___55_years_old', 'age_56___60_years_old',\n", + " 'age_61___65_years_old', 'age___65_years_old', 'av_p_micro', 'av_s_car',\n", + " 'av_s_micro', 'av_transit', 'av_car', 'av_no_trip', 'av_unknown',\n", + " 'av_ridehail', 'av_walk'\n", + " ],\n", + " 'ride2own': [\n", + " 'has_drivers_license', 'is_student',\n", + " 'is_paid', 'income_category', 'n_residence_members',\n", + " 'n_working_residents', 'n_residents_u18', 'n_residents_with_license',\n", + " 'n_motor_vehicles', 'has_medical_condition',\n", + " 'ft_job', 'multiple_jobs',\n", + " 'highest_education_bachelor_s_degree',\n", + " 'highest_education_high_school_graduate_or_ged',\n", + " 'highest_education_less_than_a_high_school_graduate',\n", + " 'highest_education_some_college_or_associates_degree',\n", + " 'primary_job_description_Other',\n", + " 'primary_job_description_Professional, Manegerial, or Technical',\n", + " 'gender_man', 'gender_woman', 'age_31___35_years_old',\n", + " 'age_36___40_years_old', 
'age_41___45_years_old',\n", + " 'age_51___55_years_old', 'av_no_trip', 'av_s_micro', 'av_transit',\n", + " 'av_car', 'av_ridehail', 'av_p_micro', 'av_s_car', 'av_walk',\n", + " 'av_unknown'\n", + " ]\n", "}\n", "\n", "\n", @@ -540,12 +1095,191 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "id": "5a3c6355", "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "For cluster -1:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
featurech
0is_student-0.0
24av_s_micro-0.0
23av_s_car-0.0
22av_no_trip-0.0
\n", + "
" + ], + "text/plain": [ + " feature ch\n", + "0 is_student -0.0\n", + "24 av_s_micro -0.0\n", + "23 av_s_car -0.0\n", + "22 av_no_trip -0.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "For cluster 0:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
featurech
0is_student-0.0
23av_s_car-0.0
22av_no_trip-0.0
21av_ridehail-0.0
\n", + "
" + ], + "text/plain": [ + " feature ch\n", + "0 is_student -0.0\n", + "23 av_s_car -0.0\n", + "22 av_no_trip -0.0\n", + "21 av_ridehail -0.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ "### DEMOGRAPHICS\n", "\n", @@ -620,7 +1354,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "id": "580bbd86", "metadata": {}, "outputs": [], @@ -629,10 +1363,10 @@ "\n", "def get_trip_summary_df(users, df):\n", " '''\n", - " 1. df = a huge dataframe of user-trips. Each row is a trip.\n", - " 2. every trip is divided into sections: [walk, transit, walk]\n", - " 3. Each section has a corresponding distance and duration: [m1, m2, m3], [t1, t2, t3], [d1, d2, d3]\n", - " 4. What we are doing is only considering the mode, distance, and duration of the section with the largest distance\n", + " Group the trips by user ID and argmax_mode and compute trip summaries. Additional\n", + " statistics that could be incorporated: IQR.\n", + " \n", + " mode_coverage computes trips summaries for the sections with the most-traveled distance.\n", " '''\n", " \n", " costs = [c for c in df.columns if 'av_' in c]\n", @@ -654,12 +1388,208 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "id": "92ad2485", "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "For cluster -1:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/4x/l9lw50rn7qvf79m01f21x70mlpd6gh/T/ipykernel_35596/1105737326.py:49: RuntimeWarning: Mean of empty slice\n", + " out_cluster_homogeneity[cix][feature] = np.nanmean([in_cluster_homogeneity[x].get(feature, np.nan) for x in oix])\n" + ] + }, + { + "data": { + "text/plain": [ + "unknown 0.986577\n", + "car 0.013423\n", + "Name: target, dtype: float64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
featurech
0section_duration_argmax_mean_bicycling0.0
25duration_median0.0
24duration_mean0.0
23mph_median_walking0.0
\n", + "
" + ], + "text/plain": [ + " feature ch\n", + "0 section_duration_argmax_mean_bicycling 0.0\n", + "25 duration_median 0.0\n", + "24 duration_mean 0.0\n", + "23 mph_median_walking 0.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "==================================================\n", + "For cluster 0:\n" + ] + }, + { + "data": { + "text/plain": [ + "unknown 1.0\n", + "Name: target, dtype: float64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
featurech
31duration_median250175.825182
1section_duration_argmax_mean_car262552.009065
11section_distance_argmax_mean_car263103.437553
24mph_mean_walking264091.869648
\n", + "
" + ], + "text/plain": [ + " feature ch\n", + "31 duration_median 250175.825182\n", + "1 section_duration_argmax_mean_car 262552.009065\n", + "11 section_distance_argmax_mean_car 263103.437553\n", + "24 mph_mean_walking 264091.869648" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "==================================================\n" + ] + } + ], "source": [ "## TRIP SUMMARIES\n", "\n", @@ -749,12 +1679,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "a8723e3d", "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "For cluster -1:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/4x/l9lw50rn7qvf79m01f21x70mlpd6gh/T/ipykernel_35596/2042025115.py:34: RuntimeWarning: Mean of empty slice\n", + " oc[cix][feature] = np.nanmean([ic[x].get(feature, np.nan) for x in oix])\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================================================\n", + "For cluster 0:\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================================================\n" + ] + } + ], "source": [ "ic, oc = dict(), dict()\n", "\n", @@ -869,210 +1850,210 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "d0288db8", "metadata": {}, "outputs": [], "source": [ - "from sklearn.cluster import AffinityPropagation\n", + "# from sklearn.cluster import AffinityPropagation\n", "\n", - "best_score = -np.inf\n", - "best_params = None" + "# best_score = -np.inf\n", + "# best_params = None" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "id": "1b14ad0c", "metadata": {}, "outputs": [], "source": [ - "cls = AffinityPropagation(random_state=13210).fit(target_df)\n", - "labels = cls.labels_\n", + "# cls = AffinityPropagation(random_state=13210).fit(target_df)\n", + "# labels = cls.labels_\n", "\n", - "print(labels)" + "# print(labels)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "id": "2562bbb6-66eb-4283-8c08-6e20a0b2ade5", "metadata": {}, "outputs": [], "source": [ - "center_embeddings = cls.cluster_centers_\n", - "centers_proj = PCA(n_components=2).fit_transform(center_embeddings)" + "# center_embeddings = cls.cluster_centers_\n", + "# centers_proj = PCA(n_components=2).fit_transform(center_embeddings)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "id": "c7aad38a", "metadata": {}, "outputs": [], "source": [ - "fig, ax = plt.subplots()\n", - "sns.scatterplot(x=tsfm[:,0], y=tsfm[:,1], c=cls.labels_, ax=ax)\n", - "ax.scatter(x=centers_proj[:,0], y=centers_proj[:,1], marker='X', c='red', alpha=0.5)\n", - "ax.set(xlabel='Latent Dim 0', ylabel='Latent Dim 1')\n", - "# plt.legend([str(x) for x in ap_labels], loc='best')\n", - "plt.show()" + "# fig, ax = plt.subplots()\n", + "# sns.scatterplot(x=tsfm[:,0], y=tsfm[:,1], c=cls.labels_, 
ax=ax)\n", + "# ax.scatter(x=centers_proj[:,0], y=centers_proj[:,1], marker='X', c='red', alpha=0.5)\n", + "# ax.set(xlabel='Latent Dim 0', ylabel='Latent Dim 1')\n", + "# # plt.legend([str(x) for x in ap_labels], loc='best')\n", + "# plt.show()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "id": "39ce0238-b3f2-4f46-a52f-13e3160cc52f", "metadata": {}, "outputs": [], "source": [ - "def get_data2(cix, labels):\n", - " users = target_df.iloc[labels == cix, :].index\n", + "# def get_data2(cix, labels):\n", + "# users = target_df.iloc[labels == cix, :].index\n", " \n", - " # Compute trip summaries.\n", - " X = df.loc[df.user_id.isin(users), [\n", - " 'section_distance_argmax', 'section_duration_argmax',\n", - " 'section_mode_argmax', 'distance',\n", - " 'duration', 'mph', 'user_id', 'target'\n", - " ]]\n", + "# # Compute trip summaries.\n", + "# X = df.loc[df.user_id.isin(users), [\n", + "# 'section_distance_argmax', 'section_duration_argmax',\n", + "# 'section_mode_argmax', 'distance',\n", + "# 'duration', 'mph', 'user_id', 'target'\n", + "# ]]\n", " \n", - " # Compute the target distribution and select the argmax.\n", - " target_distribution = X.target.value_counts(ascending=False, normalize=True)\n", - " target_distribution.rename(index=MAP, inplace=True)\n", + "# # Compute the target distribution and select the argmax.\n", + "# target_distribution = X.target.value_counts(ascending=False, normalize=True)\n", + "# target_distribution.rename(index=MAP, inplace=True)\n", " \n", - " # Caution - this summary df has NaNs. Use nanstd() to compute nan-aware std.\n", - " subset = get_trip_summary_df(users, X)\n", + "# # Caution - this summary df has NaNs. 
Use nanstd() to compute nan-aware std.\n", + "# subset = get_trip_summary_df(users, X)\n", " \n", - " norm_subset = pd.DataFrame(\n", - " MinMaxScaler().fit_transform(subset),\n", - " columns=subset.columns, index=subset.index\n", - " )\n", + "# norm_subset = pd.DataFrame(\n", + "# MinMaxScaler().fit_transform(subset),\n", + "# columns=subset.columns, index=subset.index\n", + "# )\n", " \n", - " return norm_subset, target_distribution" + "# return norm_subset, target_distribution" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "ec27cf29", "metadata": { "scrolled": false }, "outputs": [], "source": [ - "## Analaysis for this data.\n", + "# ## Analaysis for this data.\n", "\n", - "ic, oc = dict(), dict()\n", - "labels = cls.labels_\n", + "# ic, oc = dict(), dict()\n", + "# labels = cls.labels_\n", "\n", - "for cix in np.unique(labels):\n", - " users = target_df[labels == cix].index\n", + "# for cix in np.unique(labels):\n", + "# users = target_df[labels == cix].index\n", " \n", - " ic[cix] = dict()\n", + "# ic[cix] = dict()\n", " \n", - " # Trip characteristics.\n", - " norm_subset, _ = get_data2(cix, labels)\n", - " for feature in norm_subset.columns:\n", - " ic[cix][feature] = np.nanstd(norm_subset[feature])\n", + "# # Trip characteristics.\n", + "# norm_subset, _ = get_data2(cix, labels)\n", + "# for feature in norm_subset.columns:\n", + "# ic[cix][feature] = np.nanstd(norm_subset[feature])\n", " \n", - " # Demographics.\n", - " data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", - " processed = preprocess_demo_data(data)\n", + "# # Demographics.\n", + "# data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", + "# processed = preprocess_demo_data(data)\n", " \n", - " for col in processed.columns:\n", - " # Numeric/ordinal values. Use std. 
to measure homogeneity.\n", - " if col == 'age' or col == 'income_category' or col == 'n_working_residents':\n", - " ic[cix][col] = np.nanstd(processed[col])\n", - " else:\n", - " ic[cix][col] = entropy(processed[col])\n", - "\n", - "for cix in ic.keys():\n", - " oc[cix] = dict()\n", - " oix = set(labels) - set([cix])\n", - " for feature in ic[cix].keys():\n", - " oc[cix][feature] = np.nanmean([ic[x].get(feature, np.nan) for x in oix])\n", + "# for col in processed.columns:\n", + "# # Numeric/ordinal values. Use std. to measure homogeneity.\n", + "# if col == 'age' or col == 'income_category' or col == 'n_working_residents':\n", + "# ic[cix][col] = np.nanstd(processed[col])\n", + "# else:\n", + "# ic[cix][col] = entropy(processed[col])\n", "\n", - "# # Now, compute the per-cluster homogeneity.\n", "# for cix in ic.keys():\n", + "# oc[cix] = dict()\n", + "# oix = set(labels) - set([cix])\n", + "# for feature in ic[cix].keys():\n", + "# oc[cix][feature] = np.nanmean([ic[x].get(feature, np.nan) for x in oix])\n", + "\n", + "# # # Now, compute the per-cluster homogeneity.\n", + "# # for cix in ic.keys():\n", " \n", - "# users = users = target_df[labels == cix].index\n", - "# norm_subset, target_dist = get_data(cix, labels)\n", - "# data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", - "# processed = preprocess_demo_data(data)\n", + "# # users = users = target_df[labels == cix].index\n", + "# # norm_subset, target_dist = get_data(cix, labels)\n", + "# # data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", + "# # processed = preprocess_demo_data(data)\n", " \n", - "# concat = processed.merge(norm_subset, left_index=True, right_index=True)\n", + "# # concat = processed.merge(norm_subset, left_index=True, right_index=True)\n", " \n", - "# ch = list()\n", - "# for feature in ic[cix].keys():\n", - "# ratio = ic[cix][feature] / (oc[cix][feature] + 1e-6)\n", - "# ch.append([feature, ratio])\n", + "# # ch = 
list()\n", + "# # for feature in ic[cix].keys():\n", + "# # ratio = ic[cix][feature] / (oc[cix][feature] + 1e-6)\n", + "# # ch.append([feature, ratio])\n", " \n", - "# ch_df = pd.DataFrame(ch, columns=['feature', 'ch']).sort_values(by=['ch']).head(TOP_K).reset_index(drop=True)\n", + "# # ch_df = pd.DataFrame(ch, columns=['feature', 'ch']).sort_values(by=['ch']).head(TOP_K).reset_index(drop=True)\n", "\n", "\n", - "# Now, compute the per-cluster homogeneity.\n", - "ax_ix = 0\n", - "for cix in ic.keys():\n", + "# # Now, compute the per-cluster homogeneity.\n", + "# ax_ix = 0\n", + "# for cix in ic.keys():\n", "\n", - " print(f\"For cluster {cix}:\")\n", + "# print(f\"For cluster {cix}:\")\n", "\n", - " # For each, cluster, we will have (TOP_K x n_clusters) figures.\n", - " fig, ax = plt.subplots(nrows=5, ncols=len(ic.keys()), figsize=(12, 8))\n", + "# # For each, cluster, we will have (TOP_K x n_clusters) figures.\n", + "# fig, ax = plt.subplots(nrows=5, ncols=len(ic.keys()), figsize=(12, 8))\n", "\n", - " other_ix = set(ic.keys()) - set([cix])\n", + "# other_ix = set(ic.keys()) - set([cix])\n", " \n", - " ch = list()\n", - " for feature in ic[cix].keys():\n", - " ratio = ic[cix][feature] / (oc[cix][feature] + 1e-6)\n", - " ch.append([feature, ratio])\n", + "# ch = list()\n", + "# for feature in ic[cix].keys():\n", + "# ratio = ic[cix][feature] / (oc[cix][feature] + 1e-6)\n", + "# ch.append([feature, ratio])\n", " \n", - " # Just the top k.\n", - " ch_df = pd.DataFrame(ch, columns=['feature', 'ch']).sort_values(by=['ch']).reset_index(drop=True).head(5)\n", - " figure_data = dict()\n", + "# # Just the top k.\n", + "# ch_df = pd.DataFrame(ch, columns=['feature', 'ch']).sort_values(by=['ch']).reset_index(drop=True).head(5)\n", + "# figure_data = dict()\n", " \n", - " # Get the actual trip summary data.\n", - " trip_summary_data, target_dist = get_data(cix)\n", + "# # Get the actual trip summary data.\n", + "# trip_summary_data, target_dist = get_data(cix)\n", "\n", - " 
display(target_dist)\n", + "# display(target_dist)\n", " \n", - " # Get the actual demographic data.\n", - " users = target_df[labels == cix].index\n", - " data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", - " processed = preprocess_demo_data(data)\n", - "\n", - " # Left-most subplot will be that of the current cluster's feature.\n", - " for row_ix, row in ch_df.iterrows():\n", - " if row.feature in trip_summary_data.columns:\n", - " sns.histplot(trip_summary_data[row.feature], ax=ax[row_ix][0], stat='percent').set_title(\"Current cluster\")\n", - " else:\n", - " sns.histplot(processed[row.feature], ax=ax[row_ix][0], stat='percent').set_title(\"Current cluster\")\n", - " ax[row_ix][0].set_xlabel(ax[row_ix][0].get_xlabel(), fontsize=6)\n", - " ax[row_ix][0].set_ylim(0., 100.)\n", + "# # Get the actual demographic data.\n", + "# users = target_df[labels == cix].index\n", + "# data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", + "# processed = preprocess_demo_data(data)\n", "\n", - " offset_col_ix = 1\n", - " ## Now, others.\n", - " for oix in other_ix:\n", - " # Get the actual trip summary data.\n", - " other_summary_data, _ = get_data(oix)\n", + "# # Left-most subplot will be that of the current cluster's feature.\n", + "# for row_ix, row in ch_df.iterrows():\n", + "# if row.feature in trip_summary_data.columns:\n", + "# sns.histplot(trip_summary_data[row.feature], ax=ax[row_ix][0], stat='percent').set_title(\"Current cluster\")\n", + "# else:\n", + "# sns.histplot(processed[row.feature], ax=ax[row_ix][0], stat='percent').set_title(\"Current cluster\")\n", + "# ax[row_ix][0].set_xlabel(ax[row_ix][0].get_xlabel(), fontsize=6)\n", + "# ax[row_ix][0].set_ylim(0., 100.)\n", + "\n", + "# offset_col_ix = 1\n", + "# ## Now, others.\n", + "# for oix in other_ix:\n", + "# # Get the actual trip summary data.\n", + "# other_summary_data, _ = get_data(oix)\n", " \n", - " # Get the actual demographic 
data.\n", - " users = target_df[labels == oix].index\n", - " data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", - " other_demo = preprocess_demo_data(data)\n", - "\n", - " for row_ix, row in ch_df.iterrows():\n", - " if row.feature in other_summary_data.columns:\n", - " sns.histplot(other_summary_data[row.feature], ax=ax[row_ix][offset_col_ix], stat='percent').set_title(f\"Cluster {oix}\")\n", - " else:\n", - " sns.histplot(other_demo[row.feature], ax=ax[row_ix][offset_col_ix], stat='percent').set_title(f\"Cluster {oix}\")\n", - " ax[row_ix][offset_col_ix].set_xlabel(ax[row_ix][offset_col_ix].get_xlabel(), fontsize=6)\n", - " ax[row_ix][offset_col_ix].set_ylim(0., 100.)\n", + "# # Get the actual demographic data.\n", + "# users = target_df[labels == oix].index\n", + "# data = demographics.loc[demographics.index.isin(users), :].reset_index(drop=True)\n", + "# other_demo = preprocess_demo_data(data)\n", + "\n", + "# for row_ix, row in ch_df.iterrows():\n", + "# if row.feature in other_summary_data.columns:\n", + "# sns.histplot(other_summary_data[row.feature], ax=ax[row_ix][offset_col_ix], stat='percent').set_title(f\"Cluster {oix}\")\n", + "# else:\n", + "# sns.histplot(other_demo[row.feature], ax=ax[row_ix][offset_col_ix], stat='percent').set_title(f\"Cluster {oix}\")\n", + "# ax[row_ix][offset_col_ix].set_xlabel(ax[row_ix][offset_col_ix].get_xlabel(), fontsize=6)\n", + "# ax[row_ix][offset_col_ix].set_ylim(0., 100.)\n", " \n", - " offset_col_ix += 1\n", + "# offset_col_ix += 1\n", "\n", - " plt.tight_layout()\n", - " plt.show()\n", - " print(50 * '=')" + "# plt.tight_layout()\n", + "# plt.show()\n", + "# print(50 * '=')" ] }, { From ad2321670e9bc03382ed77bde4ef3908cae8b162 Mon Sep 17 00:00:00 2001 From: Rahul Kulhalli Date: Wed, 1 May 2024 14:58:59 -0400 Subject: [PATCH 5/6] Fixed transit bug in 02_run_trip_models.py; added biogeme notebook --- .../01_extract_db_data.ipynb | 96 +- .../02_run_trip_level_models.py | 33 +- 
.../03_user_level_models.ipynb | 176 +-- .../04_FeatureClustering.ipynb | 1205 ++--------------- .../05_biogeme_modeling.ipynb | 929 +++++++++++++ 5 files changed, 1229 insertions(+), 1210 deletions(-) create mode 100644 replacement_mode_modeling/05_biogeme_modeling.ipynb diff --git a/replacement_mode_modeling/01_extract_db_data.ipynb b/replacement_mode_modeling/01_extract_db_data.ipynb index bef2545..eea9d64 100644 --- a/replacement_mode_modeling/01_extract_db_data.ipynb +++ b/replacement_mode_modeling/01_extract_db_data.ipynb @@ -45,7 +45,7 @@ "source": [ "# Add path to your emission server here. Uncommented because the notebooks are run in the server.\n", "# If running locally, you need to point this to the e-mission server repo.\n", - "# emission_path = Path(os.getcwd()).parent.parent.parent / 'my_emission_server' / 'e-mission-server'\n", + "# emission_path = Path(os.getcwd()).parent.parent / 'my_emission_server' / 'e-mission-server'\n", "# sys.path.append(str(emission_path))\n", "\n", "# # Also add the home (viz_scripts) to the path\n", @@ -63,34 +63,6 @@ "import emission.storage.timeseries.abstract_timeseries as esta" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "e171e277", - "metadata": {}, - "outputs": [], - "source": [ - "DB_SOURCE = [\n", - " \"Stage_database\", # Does NOT have composite trips BUT has section modes and distances\n", - " \"openpath_prod_durham\", # Has composite trips\n", - " \"openpath_prod_mm_masscec\", # Has composite trips\n", - " \"openpath_prod_ride2own\", # Has composite trips\n", - " \"openpath_prod_uprm_nicr\" # Has composite trips\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "70fa3112", - "metadata": {}, - "outputs": [], - "source": [ - "CURRENT_DB = DB_SOURCE[0]\n", - "\n", - "assert CURRENT_DB in DB_SOURCE" - ] - }, { "cell_type": "code", "execution_count": null, @@ -369,6 +341,34 @@ "}" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "e171e277", + 
"metadata": {}, + "outputs": [], + "source": [ + "DB_SOURCE = [\n", + " \"Stage_database\", # Does NOT have composite trips BUT has section modes and distances\n", + " \"openpath_prod_durham\", # Has composite trips\n", + " \"openpath_prod_mm_masscec\", # Has composite trips\n", + " \"openpath_prod_ride2own\", # Has composite trips\n", + " \"openpath_prod_uprm_nicr\" # Has composite trips\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70fa3112", + "metadata": {}, + "outputs": [], + "source": [ + "CURRENT_DB = DB_SOURCE[4]\n", + "\n", + "assert CURRENT_DB in DB_SOURCE" + ] + }, { "cell_type": "code", "execution_count": null, @@ -693,11 +693,13 @@ "survey_data['ft_job'] = survey_data.primary_job_type.apply(\n", " lambda x: 1 if str(x).lower() == 'full_time' else 0\n", ")\n", + "survey_data.loc[~survey_data.ft_job.isin([0, 1]), 'ft_job'] = 0\n", "\n", "# gtg\n", "survey_data['multiple_jobs'] = survey_data.has_multiple_jobs.apply(\n", " lambda x: 1 if str(x).lower() == 'yes' else 0\n", ")\n", + "survey_data.loc[~survey_data.multiple_jobs.isin([0, 1]), 'multiple_jobs'] = 0\n", "\n", "# gtg\n", "survey_data.loc[\n", @@ -714,6 +716,7 @@ "survey_data.has_drivers_license = survey_data.has_drivers_license.apply(\n", " lambda x: 1 if str(x).lower() == 'yes' else 0\n", ")\n", + "survey_data.loc[~survey_data.has_drivers_license.isin([0, 1]), 'has_drivers_license'] = 0\n", "\n", "survey_data.loc[survey_data.n_residents_u18 == 'prefer_not_to_say', 'n_residents_u18'] = 0\n", "survey_data.n_residents_u18 = survey_data.n_residents_u18.astype(int)\n", @@ -743,11 +746,14 @@ "\n", "# gtg\n", "survey_data.is_paid = survey_data.is_paid.apply(lambda x: 1 if x == 'Yes' else 0)\n", + "survey_data.loc[~survey_data.is_paid.isin([0, 1]), 'is_paid'] = 0\n", "\n", "# gtg\n", "survey_data.has_medical_condition = survey_data.has_medical_condition.apply(\n", " lambda x: 1 if str(x).lower() == 'yes' else 0\n", ")\n", + 
"survey_data.loc[~survey_data.has_medical_condition.isin([0, 1]), 'has_medical_condition'] = 0\n", + "\n", "\n", "## gtg\n", "survey_data.is_student.replace({\n", @@ -761,7 +767,10 @@ " 'Yes - Part-Time College/University': 1,\n", " 'Taking prerequisites missing for grad program ': 1, \n", " 'Graduate': 1,\n", + " 'Fire Fighter 2 Training': 0,\n", + " 'By hours ': 0,\n", " 'Custodian': 0, \n", + " 'taking classes toward early childhood licensure': 0,\n", " 'Work at csu': 0,\n", " 'not_a_student': 0, \n", " 'yes___vocation_technical_trade_school': 1,\n", @@ -769,7 +778,9 @@ " 'prefer_not_to_say': 0, \n", " 'yes___k_12th_grade_including_ged': 1,\n", " 'yes___full_time_college_university': 1\n", - "}, inplace=True)" + "}, inplace=True)\n", + "\n", + "survey_data.loc[~survey_data.is_student.isin([0, 1]), 'is_student'] = 0" ] }, { @@ -798,7 +809,8 @@ " survey_data['age'] = new_col\n", " \n", " survey_data.loc[survey_data.age.isin([\n", - " '66___70_years_old', '76___80_years_old', '81___85_years_old'\n", + " '66___70_years_old', '71___75_years_old', '76___80_years_old', '81___85_years_old',\n", + " '151___155_years_old', \n", " ]), 'age'] = '__65_years_old'\n", " \n", " survey_data.drop(columns=['birth_year'], inplace=True)\n", @@ -894,7 +906,14 @@ " ]), 'primary_job_description'\n", " ] = 'Manufacturing, construction, maintenance, or farming'\n", "\n", - " df.loc[df.primary_job_description.isna(), 'primary_job_description'] = 'Other'\n", + " # All others in Other\n", + " df.loc[\n", + " (df.primary_job_description.isna()) | (~df.primary_job_description.isin(\n", + " ['Education', 'Custodial', 'Clerical or administrative support', 'Sales or service'\n", + " 'Food service', 'Medical/healthcare', 'Manufacturing, construction, maintenance, or farming',\n", + " 'Other'])), \n", + " 'primary_job_description'\n", + " ] = 'Other'\n", "\n", " return df" ] @@ -1622,6 +1641,16 @@ "filtered_trips.replace({'target': {t: ix+1 for ix, t in enumerate(targets)}}, inplace=True)" ] }, + 
{ + "cell_type": "code", + "execution_count": null, + "id": "fe5a1909", + "metadata": {}, + "outputs": [], + "source": [ + "display(filtered_trips.info())" + ] + }, { "cell_type": "code", "execution_count": null, @@ -1629,7 +1658,6 @@ "metadata": {}, "outputs": [], "source": [ - "# savepath = Path('./data/filtered_data')\n", "savepath = Path('./data/filtered_data')\n", "\n", "if not savepath.exists():\n", @@ -1641,7 +1669,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f16fb354", + "id": "065c1911", "metadata": {}, "outputs": [], "source": [] diff --git a/replacement_mode_modeling/02_run_trip_level_models.py b/replacement_mode_modeling/02_run_trip_level_models.py index 3976ee1..95f77ab 100644 --- a/replacement_mode_modeling/02_run_trip_level_models.py +++ b/replacement_mode_modeling/02_run_trip_level_models.py @@ -186,6 +186,10 @@ def get_duration_estimate(df: pd.DataFrame, dset: SPLIT, model_dict: dict): X = section_data[X_features] Y = section_data[['section_duration_argmax']] + if section_mode not in model_dict.keys(): + print(f"Inference for section={section_mode} could not be done due to lack of samples. Skipping...") + continue + y_pred = model_dict[section_mode]['model'].predict(X) r2 = r2_score(y_pred=y_pred, y_true=Y.values.ravel()) print(f"\t-> Test R2 for {section_mode}: {r2}") @@ -196,6 +200,12 @@ def get_duration_estimate(df: pd.DataFrame, dset: SPLIT, model_dict: dict): df['temp'] = 0 for section in df.section_mode_argmax.unique(): + + # Cannot predict because the mode is present in test but not in train. + if section not in model_dict.keys(): + df.loc[df.section_mode_argmax == section, 'temp'] = 0. + continue + X_section = df.loc[df.section_mode_argmax == section, X_features] # broadcast to all columns. 
@@ -436,8 +446,8 @@ def save_metadata(dir_name: Path, **kwargs): if __name__ == "__main__": - - datasets = sorted(list(Path('./data/filtered_data').glob('preprocessed_data_*.csv'))) + + datasets = sorted(list(Path('../data/filtered_data').glob('preprocessed_data_*.csv'))) start = perf_counter() @@ -447,28 +457,35 @@ def save_metadata(dir_name: Path, **kwargs): print(f"Starting modeling for dataset = {name}") data = pd.read_csv(dataset) - data.drop_duplicates(inplace=True) - data.dropna(inplace=True) if 'deprecatedID' in data.columns: data.drop(columns=['deprecatedID'], inplace=True) if 'data.key' in data.columns: data.drop(columns=['data.key'], inplace=True) - # These two lines make all the difference. - data.sort_values(by=['user_id'], ascending=True, inplace=True) - data = data[sorted(data.columns.tolist())] + print(f"# Samples found: {len(data)}, # unique users: {len(data.user_id.unique())}") print("Beginning sweeps.") # args = parse_args() sweep_number = 1 - root = Path('./outputs/benchmark_results') + root = Path('../outputs/benchmark_results') if not root.exists(): root.mkdir() + + + if 'section_mode_argmax' in data.columns and (data.section_mode_argmax.value_counts() < 2).any(): + # Find which mode. 
+ counts = data.section_mode_argmax.value_counts() + modes = counts[counts < 2].index.tolist() + print(f"Dropping {modes} because of sparsity (<2 samples)") + + data = data.loc[~data.section_mode_argmax.isin(modes), :].reset_index(drop=True) + for split in [SPLIT_TYPE.INTER_USER, SPLIT_TYPE.INTRA_USER, SPLIT_TYPE.TARGET, SPLIT_TYPE.MODE, SPLIT_TYPE.HIDE_USER]: + kwargs = { 'dataset': name, 'split': split diff --git a/replacement_mode_modeling/03_user_level_models.ipynb b/replacement_mode_modeling/03_user_level_models.ipynb index 616cd5e..1f17a91 100644 --- a/replacement_mode_modeling/03_user_level_models.ipynb +++ b/replacement_mode_modeling/03_user_level_models.ipynb @@ -309,118 +309,86 @@ " trip_group_key = trip_kwargs.pop('trip_grouping', 'section_mode_argmax')\n", " \n", " demographics = {\n", - " 'allceo': [\n", - " 'has_drivers_license', 'is_student', 'is_paid', 'income_category',\n", - " 'n_residence_members', 'n_residents_u18', 'n_residents_with_license',\n", - " 'n_motor_vehicles', 'has_medical_condition',\n", - " 'ft_job', 'multiple_jobs', 'n_working_residents',\n", - " \"highest_education_Bachelor's degree\",\n", - " 'highest_education_Graduate degree or professional degree',\n", - " 'highest_education_High school graduate or GED',\n", - " 'highest_education_Less than a high school graduate',\n", - " 'highest_education_Prefer not to say',\n", - " 'highest_education_Some college or associates degree',\n", - " 'primary_job_description_Clerical or administrative support',\n", - " 'primary_job_description_Custodial',\n", - " 'primary_job_description_Education',\n", - " 'primary_job_description_Food service',\n", - " 'primary_job_description_Linecook',\n", - " 'primary_job_description_Manufacturing, construction, maintenance, or farming',\n", - " 'primary_job_description_Medical/healthcare',\n", - " 'primary_job_description_Non-profit program manager',\n", - " 'primary_job_description_Other',\n", - " 'primary_job_description_Professional, managerial, or 
technical',\n", - " 'primary_job_description_Sales or service',\n", - " 'primary_job_description_Self employed',\n", - " 'primary_job_description_food service', 'gender_Man',\n", - " 'gender_Nonbinary/genderqueer/genderfluid', 'gender_Prefer not to say',\n", - " 'gender_Woman', 'gender_Woman;Nonbinary/genderqueer/genderfluid',\n", - " 'age_16___20_years_old', 'age_21___25_years_old',\n", - " 'age_26___30_years_old', 'age_31___35_years_old',\n", - " 'age_36___40_years_old', 'age_41___45_years_old',\n", - " 'age_46___50_years_old', 'age_51___55_years_old',\n", - " 'age_56___60_years_old', 'age_61___65_years_old', 'age___65_years_old',\n", - " 'av_transit', 'av_no_trip', 'av_p_micro', 'av_s_micro', 'av_ridehail',\n", - " 'av_unknown', 'av_walk', 'av_car', 'av_s_car'\n", + " 'allceo': [ \n", + " 'has_drivers_license', 'is_student', 'is_paid', 'income_category', 'n_residence_members', \n", + " 'n_residents_u18', 'n_residents_with_license', 'n_motor_vehicles',\n", + " 'has_medical_condition', 'ft_job', 'multiple_jobs', 'n_working_residents', \n", + " \"highest_education_Bachelor's degree\", 'highest_education_Graduate degree or professional degree', \n", + " 'highest_education_High school graduate or GED', 'highest_education_Less than a high school graduate', \n", + " 'highest_education_Prefer not to say', 'highest_education_Some college or associates degree', \n", + " 'primary_job_description_Clerical or administrative support', 'primary_job_description_Custodial', \n", + " 'primary_job_description_Education', \n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming', \n", + " 'primary_job_description_Medical/healthcare', 'primary_job_description_Other', 'gender_Man', \n", + " 'gender_Man;Nonbinary/genderqueer/genderfluid', 'gender_Nonbinary/genderqueer/genderfluid', \n", + " 'gender_Prefer not to say', 'gender_Test', 'gender_Woman', 'gender_Woman;Nonbinary/genderqueer/genderfluid', \n", + " 'age_16___20_years_old', 'age_1___5_years_old', 
'age_21___25_years_old', 'age_26___30_years_old', \n", + " 'age_31___35_years_old', 'age_36___40_years_old', 'age_41___45_years_old', 'age_46___50_years_old', \n", + " 'age_51___55_years_old', 'age_56___60_years_old', 'age_61___65_years_old', 'age___65_years_old', \n", + " 'av_s_car', 'av_walk', 'av_ridehail', 'av_s_micro', 'av_transit', 'av_no_trip', 'av_car', 'av_unknown', \n", + " 'av_p_micro'\n", " ],\n", " 'durham': [\n", - " 'is_student', 'is_paid', 'has_drivers_license', 'n_residents_u18',\n", - " 'n_residence_members', 'income_category',\n", - " 'n_residents_with_license', 'n_working_residents', 'n_motor_vehicles',\n", - " 'has_medical_condition', 'ft_job', 'multiple_jobs',\n", - " 'highest_education_bachelor_s_degree',\n", - " 'highest_education_graduate_degree_or_professional_degree',\n", - " 'highest_education_high_school_graduate_or_ged',\n", - " 'highest_education_less_than_a_high_school_graduate',\n", - " 'highest_education_some_college_or_associates_degree',\n", - " 'primary_job_description_Clerical or administrative support',\n", - " 'primary_job_description_Manufacturing, construction, maintenance, or farming',\n", - " 'primary_job_description_Other',\n", - " 'primary_job_description_Professional, Manegerial, or Technical',\n", - " 'primary_job_description_Sales or service', 'gender_man',\n", - " 'gender_non_binary_genderqueer_gender_non_confor', 'gender_woman',\n", - " 'age_16___20_years_old', 'age_21___25_years_old',\n", - " 'age_26___30_years_old', 'age_31___35_years_old',\n", - " 'age_36___40_years_old', 'age_41___45_years_old',\n", - " 'age_51___55_years_old', 'age_56___60_years_old', 'av_walk',\n", - " 'av_unknown', 'av_no_trip', 'av_p_micro', 'av_transit', 'av_car',\n", - " 'av_ridehail', 'av_s_micro', 'av_s_car'\n", + " 'is_student', 'is_paid', 'has_drivers_license', 'n_residents_u18', 'n_residence_members', 'income_category',\n", + " 'n_residents_with_license', 'n_working_residents', 'n_motor_vehicles', 'has_medical_condition', 
'ft_job',\n", + " 'multiple_jobs', 'highest_education_bachelor_s_degree', 'highest_education_graduate_degree_or_professional_degree',\n", + " 'highest_education_high_school_graduate_or_ged', 'highest_education_less_than_a_high_school_graduate',\n", + " 'highest_education_some_college_or_associates_degree', 'primary_job_description_Clerical or administrative support',\n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming',\n", + " 'primary_job_description_Other', 'primary_job_description_Prefer not to say', 'primary_job_description_Professional, Manegerial, or Technical',\n", + " 'primary_job_description_Sales or service', 'gender_man', 'gender_non_binary_genderqueer_gender_non_confor',\n", + " 'gender_prefer_not_to_say', 'gender_woman', 'age_16___20_years_old', 'age_21___25_years_old', 'age_26___30_years_old',\n", + " 'age_31___35_years_old', 'age_36___40_years_old', 'age_41___45_years_old', 'age_46___50_years_old',\n", + " 'age_51___55_years_old', 'age_56___60_years_old', 'av_unknown', 'av_no_trip', 'av_s_micro', 'av_s_car', 'av_car',\n", + " 'av_p_micro', 'av_walk', 'av_transit', 'av_ridehail'\n", " ],\n", " 'nicr': [\n", - " 'is_student', 'is_paid',\n", - " 'has_drivers_license', 'n_residents_u18', 'n_residence_members',\n", - " 'income_category', 'n_residents_with_license',\n", - " 'n_working_residents', 'n_motor_vehicles', 'has_medical_condition',\n", - " 'ft_job', 'multiple_jobs',\n", - " 'highest_education_high_school_graduate_or_ged',\n", - " 'highest_education_prefer_not_to_say', 'primary_job_description_Other',\n", - " 'gender_man', 'gender_woman', 'age_16___20_years_old', 'av_p_micro',\n", - " 'av_car', 'av_transit', 'av_ridehail', 'av_no_trip', 'av_s_car',\n", - " 'av_s_micro', 'av_unknown', 'av_walk'\n", + " \n", + " 'is_student', 'is_paid', 'has_drivers_license', 'n_residents_u18', 'n_residence_members', \n", + " 'income_category', 'n_residents_with_license', 'n_working_residents', 'n_motor_vehicles', \n", + " 
'has_medical_condition', 'ft_job', 'multiple_jobs', 'highest_education_bachelor_s_degree', \n", + " 'highest_education_high_school_graduate_or_ged', 'highest_education_prefer_not_to_say', \n", + " 'highest_education_some_college_or_associates_degree', \n", + " 'primary_job_description_Clerical or administrative support', 'primary_job_description_Other', \n", + " 'gender_man', 'gender_woman', 'age_16___20_years_old', 'age_21___25_years_old', 'age_26___30_years_old', \n", + " 'av_s_car', 'av_no_trip', 'av_s_micro', 'av_walk', 'av_unknown', 'av_p_micro', 'av_transit', 'av_car', \n", + " 'av_ridehail'\n", " ],\n", " 'masscec': [\n", - " 'is_student', 'is_paid',\n", - " 'has_drivers_license', 'n_residents_u18', 'n_residence_members',\n", - " 'income_category', 'n_residents_with_license',\n", - " 'n_working_residents', 'n_motor_vehicles', 'has_medical_condition',\n", - " 'ft_job', 'multiple_jobs', 'highest_education_bachelor_s_degree',\n", - " 'highest_education_graduate_degree_or_professional_degree',\n", - " 'highest_education_high_school_graduate_or_ged',\n", - " 'highest_education_less_than_a_high_school_graduate',\n", - " 'highest_education_prefer_not_to_say',\n", - " 'highest_education_some_college_or_associates_degree',\n", - " 'primary_job_description_Clerical or administrative support',\n", - " 'primary_job_description_Manufacturing, construction, maintenance, or farming',\n", - " 'primary_job_description_Other',\n", - " 'primary_job_description_Prefer not to say',\n", - " 'primary_job_description_Professional, Manegerial, or Technical',\n", - " 'primary_job_description_Sales or service', 'gender_man',\n", - " 'gender_prefer_not_to_say', 'gender_woman', 'age_16___20_years_old',\n", - " 'age_21___25_years_old', 'age_26___30_years_old',\n", - " 'age_31___35_years_old', 'age_36___40_years_old',\n", - " 'age_41___45_years_old', 'age_46___50_years_old',\n", - " 'age_51___55_years_old', 'age_56___60_years_old',\n", - " 'age_61___65_years_old', 'age___65_years_old', 
'av_p_micro', 'av_s_car',\n", - " 'av_s_micro', 'av_transit', 'av_car', 'av_no_trip', 'av_unknown',\n", - " 'av_ridehail', 'av_walk'\n", + " 'is_student', 'is_paid', 'has_drivers_license', 'n_residents_u18', 'n_residence_members', \n", + " 'income_category', 'n_residents_with_license', 'n_working_residents', \n", + " 'n_motor_vehicles', 'has_medical_condition', 'ft_job', 'multiple_jobs', \n", + " 'highest_education_bachelor_s_degree', 'highest_education_graduate_degree_or_professional_degree',\n", + " 'highest_education_high_school_graduate_or_ged', \n", + " 'highest_education_less_than_a_high_school_graduate', 'highest_education_prefer_not_to_say', \n", + " 'highest_education_some_college_or_associates_degree', \n", + " 'primary_job_description_Clerical or administrative support', \n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming', \n", + " 'primary_job_description_Other', 'primary_job_description_Prefer not to say', \n", + " 'primary_job_description_Professional, Manegerial, or Technical', \n", + " 'primary_job_description_Sales or service', 'gender_man', \n", + " 'gender_non_binary_genderqueer_gender_non_confor', 'gender_prefer_not_to_say', 'gender_woman', \n", + " 'age_16___20_years_old', 'age_21___25_years_old', 'age_26___30_years_old', \n", + " 'age_31___35_years_old', 'age_36___40_years_old', 'age_41___45_years_old', \n", + " 'age_46___50_years_old', 'age_51___55_years_old', 'age_56___60_years_old', \n", + " 'age_61___65_years_old', 'age___65_years_old', 'av_no_trip', 'av_transit', \n", + " 'av_ridehail', 'av_walk', 'av_car', 'av_p_micro', 'av_unknown', 'av_s_micro', 'av_s_car'\n", " ],\n", " 'ride2own': [\n", - " 'has_drivers_license', 'is_student',\n", - " 'is_paid', 'income_category', 'n_residence_members',\n", - " 'n_working_residents', 'n_residents_u18', 'n_residents_with_license',\n", - " 'n_motor_vehicles', 'has_medical_condition',\n", - " 'ft_job', 'multiple_jobs',\n", - " 
'highest_education_bachelor_s_degree',\n", - " 'highest_education_high_school_graduate_or_ged',\n", - " 'highest_education_less_than_a_high_school_graduate',\n", - " 'highest_education_some_college_or_associates_degree',\n", - " 'primary_job_description_Other',\n", - " 'primary_job_description_Professional, Manegerial, or Technical',\n", - " 'gender_man', 'gender_woman', 'age_31___35_years_old',\n", - " 'age_36___40_years_old', 'age_41___45_years_old',\n", - " 'age_51___55_years_old', 'av_no_trip', 'av_s_micro', 'av_transit',\n", - " 'av_car', 'av_ridehail', 'av_p_micro', 'av_s_car', 'av_walk',\n", - " 'av_unknown'\n", + " 'has_drivers_license', 'is_student', 'is_paid', 'income_category', 'n_residence_members', \n", + " 'n_working_residents', 'n_residents_u18', 'n_residents_with_license', 'n_motor_vehicles', \n", + " 'has_medical_condition', 'ft_job', 'multiple_jobs', 'highest_education_bachelor_s_degree', \n", + " 'highest_education_graduate_degree_or_professional_degree', \n", + " 'highest_education_high_school_graduate_or_ged', \n", + " 'highest_education_less_than_a_high_school_graduate', \n", + " 'highest_education_some_college_or_associates_degree', 'primary_job_description_Other', \n", + " 'primary_job_description_Professional, Manegerial, or Technical', \n", + " 'primary_job_description_Sales or service', 'gender_man', \n", + " 'gender_non_binary_genderqueer_gender_non_confor', 'gender_woman', 'age_16___20_years_old', \n", + " 'age_21___25_years_old', 'age_26___30_years_old', 'age_31___35_years_old', \n", + " 'age_36___40_years_old', 'age_41___45_years_old', 'age_51___55_years_old', \n", + " 'age_56___60_years_old', 'age___65_years_old', 'av_p_micro', 'av_s_car', 'av_car', \n", + " 'av_ridehail', 'av_walk', 'av_transit', 'av_no_trip', 'av_s_micro', 'av_unknown'\n", " ]\n", " }\n", " \n", diff --git a/replacement_mode_modeling/04_FeatureClustering.ipynb b/replacement_mode_modeling/04_FeatureClustering.ipynb index 1ee33f6..0c222fc 100644 --- 
a/replacement_mode_modeling/04_FeatureClustering.ipynb +++ b/replacement_mode_modeling/04_FeatureClustering.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "789df947", "metadata": {}, "outputs": [], @@ -18,6 +18,7 @@ "import matplotlib.colors as mcolors\n", "import seaborn as sns\n", "\n", + "from pathlib import Path\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances\n", "from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score\n", @@ -39,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "aea4dda7", "metadata": {}, "outputs": [], @@ -58,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "33ef3275", "metadata": {}, "outputs": [], @@ -67,18 +68,21 @@ "PATH = DATA_SOURCE[DB_NUMBER][0]\n", "CURRENT_DB = DATA_SOURCE[DB_NUMBER][1]\n", "\n", + "OUTPUT_DIR = Path('./outputs')\n", + "\n", + "if not OUTPUT_DIR.exists():\n", + " OUTPUT_DIR.mkdir()\n", + "\n", "df = pd.read_csv(PATH)" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "d6f69976", "metadata": {}, "outputs": [], "source": [ - "df.dropna(inplace=True)\n", - "\n", "not_needed = ['deprecatedID', 'data.key']\n", "\n", "for col in not_needed:\n", @@ -88,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "b2281bdc", "metadata": {}, "outputs": [], @@ -101,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "9c22d6ac", "metadata": {}, "outputs": [], @@ -113,7 +117,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "063f6124", "metadata": {}, "outputs": [], @@ -123,7 +127,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "cef8d45b", "metadata": {}, "outputs": [], 
@@ -137,7 +141,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "68c6af2d", "metadata": {}, "outputs": [], @@ -147,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "eff378a7", "metadata": {}, "outputs": [], @@ -157,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "cffbd401", "metadata": {}, "outputs": [], @@ -167,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "f1eb1633", "metadata": {}, "outputs": [], @@ -181,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "d9cc0a0f", "metadata": { "scrolled": true @@ -197,7 +201,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "750fbd0c", "metadata": {}, "outputs": [], @@ -210,223 +214,17 @@ "\n", "figure1_df['n_trips'] = min_max_normalize(figure1_df['n_trips'])\n", "figure1_df['start:hour'] = np.sin(figure1_df['start:hour'].values)\n", - "figure1_df['end:hour'] = np.sin(figure1_df['end:hour'].values)" + "figure1_df['end:hour'] = np.sin(figure1_df['end:hour'].values)\n", + "\n", + "figure1_df.fillna(0., inplace=True)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "1c3d1849", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
coverage_bicyclingcoverage_carcoverage_transitcoverage_unknowncoverage_walkingpct_distance_bicyclingpct_distance_carpct_distance_transitpct_distance_unknownpct_distance_walkingn_tripsstart:hourend:hour
user_id
0600d3df-c1aa-4ca2-83f2-1f6b8931280d0.0500000.8500000.00.0000000.1000000.0092820.9849540.00.0000000.0057640.0833330.4121180.650288
44eda4da-9223-4bb0-afd4-e7dd19fc6b270.0650410.7804880.00.0650410.0894310.0381800.9126930.00.0331710.0159570.7435900.1498770.149877
4c5436e9-4840-4872-9e8f-5d46ba81fe520.0000000.7142860.00.0000000.2857140.0000000.8472470.00.0000000.1527530.0000000.6502880.650288
7479810c-c602-4508-8ae2-da0bed87558d0.1162790.4534880.00.1279070.3023260.0155300.9264890.00.0270850.0308960.5064100.1498770.990607
7f7c9d3b-84ed-4c14-be8a-aa256daaed010.0175440.7719300.00.0350880.1754390.0051570.8543270.00.1100330.0304830.3205130.9906070.990607
\n", - "
" - ], - "text/plain": [ - " coverage_bicycling coverage_car \\\n", - "user_id \n", - "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.050000 0.850000 \n", - "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.065041 0.780488 \n", - "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.000000 0.714286 \n", - "7479810c-c602-4508-8ae2-da0bed87558d 0.116279 0.453488 \n", - "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.017544 0.771930 \n", - "\n", - " coverage_transit coverage_unknown \\\n", - "user_id \n", - "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.0 0.000000 \n", - "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.0 0.065041 \n", - "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.0 0.000000 \n", - "7479810c-c602-4508-8ae2-da0bed87558d 0.0 0.127907 \n", - "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.0 0.035088 \n", - "\n", - " coverage_walking \\\n", - "user_id \n", - "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.100000 \n", - "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.089431 \n", - "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.285714 \n", - "7479810c-c602-4508-8ae2-da0bed87558d 0.302326 \n", - "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.175439 \n", - "\n", - " pct_distance_bicycling \\\n", - "user_id \n", - "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.009282 \n", - "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.038180 \n", - "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.000000 \n", - "7479810c-c602-4508-8ae2-da0bed87558d 0.015530 \n", - "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.005157 \n", - "\n", - " pct_distance_car pct_distance_transit \\\n", - "user_id \n", - "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.984954 0.0 \n", - "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.912693 0.0 \n", - "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.847247 0.0 \n", - "7479810c-c602-4508-8ae2-da0bed87558d 0.926489 0.0 \n", - "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.854327 0.0 \n", - "\n", - " pct_distance_unknown \\\n", - "user_id \n", - "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.000000 \n", - "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.033171 \n", - "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.000000 \n", - 
"7479810c-c602-4508-8ae2-da0bed87558d 0.027085 \n", - "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.110033 \n", - "\n", - " pct_distance_walking n_trips \\\n", - "user_id \n", - "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.005764 0.083333 \n", - "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.015957 0.743590 \n", - "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.152753 0.000000 \n", - "7479810c-c602-4508-8ae2-da0bed87558d 0.030896 0.506410 \n", - "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.030483 0.320513 \n", - "\n", - " start:hour end:hour \n", - "user_id \n", - "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.412118 0.650288 \n", - "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.149877 0.149877 \n", - "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.650288 0.650288 \n", - "7479810c-c602-4508-8ae2-da0bed87558d 0.149877 0.990607 \n", - "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.990607 0.990607 " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "figure1_df.head()" ] @@ -441,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "598d82bc", "metadata": {}, "outputs": [], @@ -467,18 +265,10 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "bc89a42d", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Counter({0: 8, -1: 4})\n" - ] - } - ], + "outputs": [], "source": [ "'''\n", "AlLCEO: eps=0.542\n", @@ -493,47 +283,12 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "05c9a7c4", "metadata": { "scrolled": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4 users in cluster -1\n" - ] - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAiMAAAHFCAYAAAAg3/mzAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABBqklEQVR4nO3deVhUdf//8dcow7AoqBCKiuKSippLWkmlqCTklpXelpVb2je/WX2VrG401/LGyrptUanbLbe0Is2Fu6QS0sSSUlNTW24VM6jUFNMcRji/P7yYn+MMy6B2hPv5uC6uy/OZzzmf95w55/jiLIzFMAxDAAAAJqlidgEAAOC/G2EEAACYijACAABMRRgBAACmIowAAABTEUYAAICpCCMAAMBUhBEAAGAqwggAADBVpQkjixYtksVicf74+PgoPDxc9957r77//vsrPv6wYcMUGRl5xccpr6L1c/DgQbNLKVF6erosFovS09PNLuW/jqd1X57t+ueff9aUKVO0Y8cOr+bzNJbFYtGjjz7q1XJKM2fOHC1atMit/eDBg7JYLB5f+yu89tpratq0qXx9fWWxWHTixAlT6qhoLBaLpkyZ8peOeaWPp1u2bNGUKVOumm3gmWeeUZ8+fVSvXj1ZLBYNGzbsso9RacJIkYULFyozM1Mff/yxHn30Ua1Zs0a33nqrfv/9d7NLAyqciRMnatWqVV7N8/PPP2vq1Kleh5HyjFUexYWR8PBwZWZmqnfv3le8hovt2LFDjz/+uLp166ZPP/1UmZmZql69+l9eR0WUmZmpkSNHml3GZbVlyxZNnTr1qgkj//znP3Xs2DHdcccd8vX1vSJj+FyRpZqodevW6tixoySpa9euKigo0OTJk7V69WoNHz7c5OpQkRiGobNnz8rf39/sUkzTpEmTKz7GmTNnFBAQ8JeMVRKbzaZOnTqZMvaePXskSQ899JBuvPHGy7LMovV6Nbsc+5hZn1lFVN5t4tSpU6pS5fy5iyVLllzusiRVwjMjFysKJr/88otLe1ZWlu644w7VqlVLfn5+at++vd555x2XPkWn4tLS0jR8+HDVqlVLgYGB6tu3r/7zn/+UOvbs2bPVpUsXhYWFKTAwUNddd51eeOEFORwOt74ffvihYmNjFRwcrICAAEVFRSkpKcnrmiVp69atuuWWW+Tn56e6desqMTHR45ieDBs2TNWqVdO+ffsUHx+vwMBAhYeHa8aMGc5l33rrrQoMDFSzZs301ltvuS1j9+7d6tevn2rWrCk/Pz+1a9fOY799+/bp9ttvV0BAgEJDQzVq1CidOnXKY10ff/yxYmNjFRQUpICAAN1yyy365JNPSn0/Z8+e1RNPPKF27dopODhYtWrVUnR0tD744AO3vkWXBJKTkxUVFSWbzease/PmzYqOjpafn5/q1auniRMnat68eW6naiMjI9WnTx+tW7dO7du3l7+/v6KiorRu3TpJ57epqKgoBQYG6sYbb1RWVpZLDVlZWbr33nsVGRkpf39/RUZGatCgQTp06JCzj2EY6tWrl0JCQpSdne1sP3PmjFq1aqWoqCidPn26xPVS1nXv6dLJu+++q5tuusm5rTZu3FgPPvigpPOXem644QZJ0vDhw52XTYtOoxdtX7t27VJcXJyqV6+u2NjYYscq8sYbb6hZs2ay2Wxq2bKlVqxY4fL6lClTZLFY3Oa7+HR6ZGSk9uzZo4yMDGdtRWMWd5lm8+bNio2NVfXq1RUQEKCbb75Z69ev9zjOxo0b9b//+78KDQ1VSEiI7r77bv38888e31ORrl276oEHHpAk3XTTTW6nwRcsWKC2bdvKz89PtWrV0l133aW9e/e6LKOk9epJceva03os6fMukpeXp3HjxqlRo0by9fVVvXr1NGbMGLftsKR9bO7cuWrbtq2qVaum6tWrq0WLFho/fnyJ665omRdeprmUz6LIF198ob59+yokJER+fn5q0qSJxowZU+I8kZGRHi9fdO3aVV2
7dnVOFxYW6rnnnlPz5s3l7++vGjVqqE2bNnrllVcknf8MnnzySUlSo0aNnNvphZdPV65cqejoaAUGBqpatWqKj4/X9u3bXcb1dpsoSVEQuZIq3ZmRix04cECS1KxZM2fbxo0bdfvtt+umm25ScnKygoODtWLFCt1zzz06c+aM2wY1YsQI9ejRQ8uXL9fhw4f1zDPPqGvXrvrmm29Uo0aNYsf+8ccfdd999zl30J07d2r69Onat2+fFixY4Ow3f/58PfTQQ4qJiVFycrLCwsL03Xffaffu3V7X/O233yo2NlaRkZFatGiRAgICNGfOHC1fvrzM68zhcOjuu+/WqFGj9OSTT2r58uVKTExUXl6eUlJS9PTTT6t+/fp67bXXNGzYMLVu3VodOnSQJO3fv18333yzwsLC9OqrryokJERLly7VsGHD9Msvv+ipp56SdD4cxsTEyGq1as6cOapdu7aWLVvm8f6ApUuXasiQIerXr5/eeustWa1WvfHGG4qPj9dHH31U4g5mt9t1/PhxjRs3TvXq1VN+fr4+/vhj3X333Vq4cKGGDBni0n/16tXatGmTJk2apDp16igsLEzffPONevTo4QxfAQEBSk5O1tKlSz2OuXPnTiUmJmrChAkKDg7W1KlTdffddysxMVGffPKJ/vGPf8hisejpp59Wnz59dODAAedvhgcPHlTz5s117733qlatWsrJydHcuXN1ww036Ntvv1VoaKgsFouWLFmidu3aaeDAgdq0aZOsVqseeeQRHThwQF988YUCAwOLXSferPuLZWZm6p577tE999yjKVOmyM/PT4cOHdKnn34qSbr++uu1cOFCDR8+XM8884zzkkf9+vWdy8jPz9cdd9yhhx9+WH//+9917ty5Esdcs2aNNm7cqGnTpikwMFBz5szRoEGD5OPjowEDBpRa84VWrVqlAQMGKDg4WHPmzJF0/oxIcTIyMtSjRw+1adNG8+fPl81m05w5c9S3b1+9/fbbuueee1z6jxw5Ur1793YeK5588kk98MADzvXjyZw5c/T222/rueee08KFC9WiRQtdc801kqSkpCSNHz9egwYNUlJSko4dO6YpU6YoOjpa27Zt07XXXutcjrfrtSxK+7yl8yE4JiZGP/30k8aPH682bdpoz549mjRpknbt2qWPP/7YJeB42sdWrFihRx55RI899phmzpypKlWq6IcfftC3335b7trL81lI0kcffaS+ffsqKipKL7/8sho0aKCDBw9qw4YN5a7lQi+88IKmTJmiZ555Rl26dJHD4dC+ffucl2RGjhyp48eP67XXXtP777+v8PBwSVLLli0lSf/4xz/0zDPPOPex/Px8vfjii+rcubO+/PJLZz/pymwTV4xRSSxcuNCQZGzdutVwOBzGqVOnjA8//NCoU6eO0aVLF8PhcDj7tmjRwmjfvr1Lm2EYRp8+fYzw8HCjoKDAZZl33XWXS7/PP//ckGQ899xzzrahQ4caDRs2LLa+goICw+FwGIsXLzaqVq1qHD9+3DAMwzh16pQRFBRk3HrrrUZhYWGx85e15nvuucfw9/c3cnNznX3OnTtntGjRwpBkHDhwoNgxit6HJCMlJcXZ5nA4jGuuucaQZHz99dfO9mPHjhlVq1Y1EhISnG333nuvYbPZjOzsbJfl9uzZ0wgICDBOnDhhGIZhPP3004bFYjF27Njh0q9Hjx6GJGPjxo2GYRjG6dOnjVq1ahl9+/Z16VdQUGC0bdvWuPHGG0t8Pxc7d+6c4XA4jBEjRhjt27d3eU2SERwc7Pxsivztb38zAgMDjd9++81l/JYtW7qt04YNGxr+/v7GTz/95GzbsWOHIckIDw83Tp8+7WxfvXq1IclYs2ZNifX+8ccfRmBgoPHKK6+4vLZ582bDx8fHGDNmjLFgwQJDkjFv3rxS10FZ171huG/XM2fONCQ5P0dPtm3bZkgyFi5c6PZa0fa1YMECj69dvA9JKnZ7btq0qbNt8uTJhqf
DWdE+fOFn1KpVKyMmJsat74EDB9zq7tSpkxEWFmacOnXKZfzWrVsb9evXd+6zReM88sgjLst84YUXDElGTk6O23ie6ty2bZuz7ffffzf8/f2NXr16ufTNzs42bDabcd999znbSlqvnhR3vLp4PZbl805KSjKqVKniUrthGMZ7771nSDJSU1OdbcXtY48++qhRo0aNMtV+MUnG5MmTndOX+lk0adLEaNKkifHnn38W28fTdtWwYUNj6NChbn1jYmJctrc+ffoY7dq1K7GGF1980ePxOjs72/Dx8TEee+wxl/ZTp04ZderUMQYOHOhs83abKKvAwECP7/NSVbrLNJ06dZLValX16tV1++23q2bNmvrggw/k43P+JNAPP/ygffv26f7775cknTt3zvnTq1cv5eTkaP/+/S7LLOpb5Oabb1bDhg21cePGEmvZvn277rjjDoWEhKhq1aqyWq0aMmSICgoK9N1330k6f6NSXl6eHnnkEY+nmb2teePGjYqNjVXt2rWd81etWtXtN7iSWCwW9erVyznt4+Ojpk2bKjw8XO3bt3e216pVS2FhYS6XED799FPFxsYqIiLCZZnDhg3TmTNnlJmZ6ayzVatWatu2rUu/++67z2V6y5YtOn78uIYOHeryvgsLC3X77bdr27ZtpV6SePfdd3XLLbeoWrVq8vHxkdVq1fz5891OdUtS9+7dVbNmTZe2jIwMde/eXaGhoc62KlWqaODAgR7Ha9eunerVq+ecjoqKknT+dO2F12uL2i9cf3/88YeefvppNW3aVD4+PvLx8VG1atV0+vRpt3pvueUWTZ8+XbNmzdL//u//6oEHHtCIESNKXBdS2de9J0WXYAYOHKh33nlHR44cKXUeT/r371/mvsVtzz/88IN++umnco1fFqdPn9YXX3yhAQMGqFq1ai7jDx48WD/99JPbseKOO+5wmW7Tpo0k18+4rDIzM/Xnn3+6namNiIhQ9+7dPV6m9Ga9lkVZPu9169apdevWateuncs+Gh8f7/HJOE/72I033qgTJ05o0KBB+uCDD3T06NFLrr08n8V3332nH3/8USNGjJCfn98l1+DJjTfeqJ07d+qRRx7RRx99pLy8vDLP+9FHH+ncuXMaMmSIy7r28/NTTEyMx6cQy7pNXLi8c+fOyTCMMtd1OVS6MLJ48WJt27ZNn376qR5++GHt3btXgwYNcr5edO/IuHHjZLVaXX4eeeQRSXLbEerUqeM2Tp06dXTs2LFi68jOzlbnzp115MgRvfLKK9q0aZO2bdum2bNnS5L+/PNPSdJvv/0myfU09sW8qfnYsWPF1ltWAQEBbjuir6+vatWq5dbX19dXZ8+edU4fO3bMeVrxQnXr1nW+7k2dRe99wIABbu/9+eefl2EYOn78eLHv5f3339fAgQNVr149LV26VJmZmdq2bZsefPBBl7qLeKr92LFjLv8ZFvHUJsltPRXdfV5c+4V13HfffXr99dc1cuRIffTRR/ryyy+1bds2XXPNNc5t5kL333+/fH19ZbfbndeZS3Mp20iXLl20evVq5wGxfv36at26td5+++0yjS2d376CgoLK3L+kWkvaBy/V77//LsMwyrQ9FwkJCXGZLroE5OmzK03Rsosb/+KxvV2vZVGWz/uXX37RN99847Z/Vq9eXYZhuB1PPb2fwYMHa8GCBTp06JD69++vsLAw3XTTTUpLSyt37eX5LMpyPL5UiYmJmjlzprZu3aqePXsqJCREsbGxbvePeVJ0PLzhhhvc1vfKlSvd1nVZt4mDBw+6LS8jI6N8b7CcKt09I1FRUc6bVrt166aCggLNmzdP7733ngYMGOD87TYxMVF33323x2U0b97cZTo3N9etT25urpo2bVpsHatXr9bp06f1/vvvq2HDhs72ix93LLo2XNJveN7UHBISUmy9f4WQkBDl5OS4tRfdOFb0XspaZ1H/1157rdi75osLBdL5+00aNWqklStXupx5stvtHvt7Ojs
VEhLidgO0p1ov1cmTJ7Vu3TpNnjxZf//7353tRfe9XKygoED333+/atasKZvNphEjRujzzz8v9dG7S91G+vXrp379+slut2vr1q1KSkrSfffdp8jISEVHR5c6f3FnAItTUq1F/+EUhWe73e5yD8il/IZds2ZNValSpUzb85VQ9N6KG//isb1Zr35+fh73AU/rq7TPOzQ0VP7+/i73wV2orHUOHz5cw4cP1+nTp/XZZ59p8uTJ6tOnj7777juXY+iVVJbjcXFKWqcXrgMfHx8lJCQoISFBJ06c0Mcff6zx48crPj5ehw8fLvFpl6LlvPfee2VaJ2XdJurWratt27a5tF38/+CVVunOjFzshRdeUM2aNTVp0iQVFhaqefPmuvbaa7Vz50517NjR48/Fz/cvW7bMZXrLli06dOiQyx3SFyvaCC48MBqGoX/9618u/W6++WYFBwcrOTm52NNi3tTcrVs3ffLJJy7/eRYUFGjlypWlr6zLIDY2Vp9++qnbXeuLFy9WQECAM1B069ZNe/bs0c6dO136XXyj7S233KIaNWro22+/Lfa9l/Sfr8Vicf4RqSK5ubken6YpTkxMjD799FOXA3VhYaHefffdMi+jLCwWiwzDcLuhct68eSooKHDrP3nyZG3atEnLli3TypUrtXPnzjKdHSnrui+NzWZTTEyMnn/+eUly3s1/KWcDPClue27SpInzN9iiJ0O++eYbl3nXrl3rse6y1BYYGKibbrpJ77//vkv/wsJCLV26VPXr13e5Mf5yi46Olr+/v9uN0j/99JPzcmh5RUZG6tdff3VZr/n5+froo4+Knae4z7tPnz768ccfFRIS4nH/9PaP5gUGBqpnz56aMGGC8vPznY89/xWaNWumJk2aaMGCBcX+wlKcyMhIt+3vu+++c7uUd6EaNWpowIABGj16tI4fP+586qu4fSg+Pl4+Pj768ccfiz0eloevr2+p/w9eaZXuzMjFatasqcTERD311FNavny5HnjgAb3xxhvq2bOn4uPjNWzYMNWrV0/Hjx/X3r179fXXX7v9J5OVlaWRI0fqb3/7mw4fPqwJEyaoXr16zksknvTo0UO+vr4aNGiQnnrqKZ09e1Zz5851++Nr1apV00svvaSRI0fqtttu00MPPaTatWvrhx9+0M6dO/X6669LUplrfuaZZ7RmzRp1795dkyZNUkBAgGbPnl3qfRWXy+TJk7Vu3Tp169ZNkyZNUq1atbRs2TKtX79eL7zwgoKDgyVJY8aM0YIFC9S7d28999xzzic69u3b57Z+XnvtNQ0dOlTHjx/XgAEDFBYWpt9++007d+7Ub7/9prlz5xZbT58+ffT+++/rkUce0YABA3T48GE9++yzCg8PL/Nf5p0wYYLWrl2r2NhYTZgwQf7+/kpOTnau08v12FtQUJC6dOmiF198UaGhoYqMjFRGRobmz5/v9tRWWlqakpKSNHHiROd/SklJSRo3bpy6du2qu+66q9hxyrruPZk0aZJ++uknxcbGqn79+jpx4oReeeUVWa1WxcTESDr/t0n8/f21bNkyRUVFqVq1aqpbt67z0oa3QkND1b17d02cONH5NM2+fftcHu/t1auXatWqpREjRmjatGny8fHRokWLdPjwYbflXXfddVqxYoVWrlypxo0by8/PT9ddd53HsZOSktSjRw9169ZN48aNk6+vr+bMmaPdu3fr7bff9vosjzdq1KihiRMnavz48RoyZIgGDRqkY8eOaerUqfLz89PkyZPLvex77rlHkyZN0r333qsnn3xSZ8+e1auvvuoWesvyeY8ZM0YpKSnq0qWLxo4dqzZt2qiwsFDZ2dnasGGDnnjiCd10000l1vPQQw/J399ft9xyi8LDw5Wbm6ukpCQFBwc771v5q8yePVt9+/ZVp06dNHbsWDVo0EDZ2dn66KOP3H4xvdDgwYP1wAMP6JFHHlH//v116NAhvfDCC86zLUX69u3r/HtY11xzjQ4dOqRZs2a
pYcOGzqejirbHV155RUOHDpXValXz5s0VGRmpadOmacKECfrPf/7jvC/yl19+0ZdffqnAwEBNnTr1sq+TjIwM5yWsgoICHTp0SO+9956k87+sXfwey+Wy3xJrEk93oxf5888/jQYNGhjXXnutce7cOcMwDGPnzp3GwIEDjbCwMMNqtRp16tQxunfvbiQnJ7stc8OGDcbgwYONGjVqOO9u//77713G8HR3+tq1a422bdsafn5+Rr169Ywnn3zS+Pe//+32xIJhGEZqaqoRExNjBAYGGgEBAUbLli2N559/3qVPWWo2jPNP+3Tq1Mmw2WxGnTp1jCeffNJ48803y/w0TWBgoFt7TEyM0apVK7f2hg0bGr1793Zp27Vrl9G3b18jODjY8PX1Ndq2bevxyYpvv/3W6NGjh+Hn52fUqlXLGDFihPHBBx94XD8ZGRlG7969jVq1ahlWq9WoV6+e0bt3b+Pdd98t8f0YhmHMmDHDiIyMNGw2mxEVFWX861//8vj0hSRj9OjRHpexadMm46abbnJZp88//7zbkwae1kdxyy56euPFF190tv30009G//79jZo1axrVq1c3br/9dmP37t0ud+r//PPPRlhYmNG9e3fnU1SGYRiFhYVG3759jRo1apT6OZd13V+8Xa9bt87o2bOnUa9ePcPX19cICwszevXqZWzatMll+W+//bbRokULw2q1ujztUNz25WmsC9fbnDlzjCZNmhhWq9Vo0aKFsWzZMrf5v/zyS+Pmm282AgMDjXr16hmTJ0825s2b57bdHzx40IiLizOqV69uSHKO6elpGsM4/9l3797dCAwMNPz9/Y1OnToZa9eudelT3PFn48aNHrfni5V0/Jo3b57Rpk0bw9fX1wgODjb69etn7Nmzx23dFbdei5Oammq0a9fO8Pf3Nxo3bmy8/vrrbvtFWT/vP/74w3jmmWeM5s2bO+u87rrrjLFjx7o8CVXcPvbWW28Z3bp1M2rXrm34+voadevWNQYOHGh88803pb4PFfM0TXk/C8MwjMzMTKNnz55GcHCwYbPZjCZNmhhjx451G+PC7aqwsNB44YUXjMaNGxt+fn5Gx44djU8//dTtaZqXXnrJuPnmm43Q0FDD19fXaNCggTFixAjj4MGDLjUkJiYadevWNapUqeJW9+rVq41u3boZQUFBhs1mMxo2bGgMGDDA+Pjjj519yrNNFCcmJsaQ5PGnLOuzLCyG8RffMluBLFq0SMOHD9e2bdvKffoLlVdcXJwOHjzofDIKAFA+lf4yDXA5JCQkqH379oqIiNDx48e1bNkypaWlaf78+WaXBgAVHmEEKIOCggJNmjRJubm5slgsatmypZYsWeL8M94AgPLjMg0AADBVpX+0FwAAXN0IIwAAwFSEEQAAYKoKcQNrYWGhfv75Z1WvXv2K/pEhAABw+RiGoVOnTqlu3bol/oHIChFGfv75Z7dvgQUAABXD4cOHS/wCwgoRRor+Rv7hw4cv+7dSAjCXw+HQhg0bFBcXJ6vVanY5AC6jvLw8RURElPpdNxUijBRdmgkKCiKMAJWMw+FwftU5YQSonEq7xYIbWAEAgKkIIwAAwFSEEQAAYCrCCAAAMBVhBAAAmIowAgAATEUYAQAApiKMAAAAUxFGAACAqQgjAADAVJcURpKSkmSxWDRmzJgS+2VkZKhDhw7y8/NT48aNlZycfCnDAgCASqTcYWTbtm1688031aZNmxL7HThwQL169VLnzp21fft2jR8/Xo8//rhSUlLKOzQAAKhEyhVG/vjjD91///3617/+pZo1a5bYNzk5WQ0aNNCsWbMUFRWlkSNH6sEHH9TMmTPLVTAAAKhcyhVGRo8erd69e+u2224rtW9mZqbi4uJc2uLj45WVlSWHw1Ge4QEAQCXi4+0MK1as0Ndff61t27aVqX9ubq5q167t0la7dm2dO3dOR48eVXh4uNs8drtddrvdOZ2Xlyfp/FeNE2CAyqVon2bfBiqfsu7
XXoWRw4cP6//+7/+0YcMG+fn5lXk+i8XiMm0Yhsf2IklJSZo6dapb+4YNGxQQEOBFxQAqirS0NLNLAHCZnTlzpkz9LEZRMiiD1atX66677lLVqlWdbQUFBbJYLKpSpYrsdrvLa5LUpUsXtW/fXq+88oqzbdWqVRo4cKDOnDkjq9XqNo6nMyMRERE6evSogoKCylougArA4XAoLS1NE7OqyF7o+ReUq9HuKfFmlwBc9fLy8hQaGqqTJ0+W+P+3V2dGYmNjtWvXLpe24cOHq0WLFnr66afdgogkRUdHa+3atS5tGzZsUMeOHT0GEUmy2Wyy2Wxu7Vartdh5AFRs9kKL7AUVJ4xwLAJKV9b9xKswUr16dbVu3dqlLTAwUCEhIc72xMREHTlyRIsXL5YkjRo1Sq+//roSEhL00EMPKTMzU/Pnz9fbb7/tzdAAAKCSuux/gTUnJ0fZ2dnO6UaNGik1NVXp6elq166dnn32Wb366qvq37//5R4aAABUQF4/TXOx9PR0l+lFixa59YmJidHXX399qUMBAIBKiO+mAQAApiKMAAAAUxFGAACAqQgjAADAVIQRAABgKsIIAAAwFWEEAACYijACAABMRRgBAACmIowAAABTEUYAAICpCCMAAMBUhBEAAGAqwggAADAVYQQAAJiKMAIAAExFGAEAAKYijAAAAFMRRgAAgKkIIwAAwFSEEQAAYCrCCAAAMBVhBAAAmIowAgAATEUYAQAApiKMAAAAUxFGAACAqQgjAADAVIQRAABgKsIIAAAwFWEEAACYijACAABMRRgBAACm8iqMzJ07V23atFFQUJCCgoIUHR2tf//738X2T09Pl8VicfvZt2/fJRcOAAAqBx9vOtevX18zZsxQ06ZNJUlvvfWW+vXrp+3bt6tVq1bFzrd//34FBQU5p6+55ppylgsAACobr8JI3759XaanT5+uuXPnauvWrSWGkbCwMNWoUaNcBQIAgMqt3PeMFBQUaMWKFTp9+rSio6NL7Nu+fXuFh4crNjZWGzduLO+QAACgEvLqzIgk7dq1S9HR0Tp79qyqVaumVatWqWXLlh77hoeH680331SHDh1kt9u1ZMkSxcbGKj09XV26dCl2DLvdLrvd7pzOy8uTJDkcDjkcDm9LBnAVK9qnbVUMkyvxDscioHRl3U8shmF4dQTIz89Xdna2Tpw4oZSUFM2bN08ZGRnFBpKL9e3bVxaLRWvWrCm2z5QpUzR16lS39uXLlysgIMCbcgEAgEnOnDmj++67TydPnnS5d/RiXoeRi912221q0qSJ3njjjTL1nz59upYuXaq9e/cW28fTmZGIiAgdPXq0xDcDoOJxOBxKS0vTxKwqshdazC6nzHZPiTe7BOCql5eXp9DQ0FLDiNeXaS5mGIZLcCjN9u3bFR4eXmIfm80mm83m1m61WmW1Wr2uEcDVz15okb2g4oQRjkVA6cq6n3gVRsaPH6+ePXsqIiJCp06d0ooVK5Senq4PP/xQkpSYmKgjR45o8eLFkqRZs2YpMjJSrVq1Un5+vpYuXaqUlBSlpKR4+XYAAEBl5VUY+eWXXzR48GDl5OQoODhYbdq00YcffqgePXpIknJycpSdne3sn5+fr3HjxunIkSPy9/dXq1attH79evXq1evyvgsAAFBhXfI9I3+FvLw8BQcHl3rNCUDF43A4lJqaqqe+rFqhLtMcnNHb7BKAq15Z///mu2kAAICpCCMAAMBUhBEAAGAqwggAADAVYQQAAJiKMAIAAExFGAEAAKYijAAAAFMRRgAAgKkIIwAAwFSEEQAAYCrCCAAAMBVhBAAAmIowAgAATEUYAQAApiKMAAAAUxFGAACAqQgjAADAVIQRAABgKsIIAAAwFWEEAACYijACAABMRRgBAACmIowAAABTEUYAAICpCCMAAMBUhBEAAGAqwggAADAVYQQAAJiKMAIAAExFGAEAAKYijAAAAFMRRgAAgKm8CiNz585VmzZtFBQUpKCgIEV
HR+vf//53ifNkZGSoQ4cO8vPzU+PGjZWcnHxJBQMAgMrFqzBSv359zZgxQ1lZWcrKylL37t3Vr18/7dmzx2P/AwcOqFevXurcubO2b9+u8ePH6/HHH1dKSsplKR4AAFR8Pt507tu3r8v09OnTNXfuXG3dulWtWrVy65+cnKwGDRpo1qxZkqSoqChlZWVp5syZ6t+/f/mrBgAAlUa57xkpKCjQihUrdPr0aUVHR3vsk5mZqbi4OJe2+Ph4ZWVlyeFwlHdoAABQiXh1ZkSSdu3apejoaJ09e1bVqlXTqlWr1LJlS499c3NzVbt2bZe22rVr69y5czp69KjCw8M9zme322W3253TeXl5kiSHw0GIASqZon3aVsUwuRLvcCwCSlfW/cTrMNK8eXPt2LFDJ06cUEpKioYOHaqMjIxiA4nFYnGZNgzDY/uFkpKSNHXqVLf2DRs2KCAgwNuSAVQAz3YsNLsEr6SmpppdAnDVO3PmTJn6WYyidFBOt912m5o0aaI33njD7bUuXbqoffv2euWVV5xtq1at0sCBA3XmzBlZrVaPy/R0ZiQiIkJHjx5VUFDQpZQL4CrjcDiUlpamiVlVZC8s/peUq83uKfFmlwBc9fLy8hQaGqqTJ0+W+P+312dGLmYYhktwuFB0dLTWrl3r0rZhwwZ17Nix2CAiSTabTTabza3darWWOB+AisteaJG9oOKEEY5FQOnKup94dQPr+PHjtWnTJh08eFC7du3ShAkTlJ6ervvvv1+SlJiYqCFDhjj7jxo1SocOHVJCQoL27t2rBQsWaP78+Ro3bpw3wwIAgErMqzMjv/zyiwYPHqycnBwFBwerTZs2+vDDD9WjRw9JUk5OjrKzs539GzVqpNTUVI0dO1azZ89W3bp19eqrr/JYLwAAcPIqjMyfP7/E1xctWuTWFhMTo6+//tqrogAAwH8PvpsGAACYijACAABMRRgBAACmIowAAABTEUYAAICpCCMAAMBUhBEAAGAqwggAADAVYQQAAJiKMAIAAExFGAEAAKYijAAAAFMRRgAAgKkIIwAAwFSEEQAAYCrCCAAAMBVhBAAAmIowAgAATEUYAQAApiKMAAAAUxFGAACAqQgjAADAVIQRAABgKsIIAAAwFWEEAACYijACAABMRRgBAACmIowAAABTEUYAAICpCCMAAMBUhBEAAGAqwggAADAVYQQAAJjKqzCSlJSkG264QdWrV1dYWJjuvPNO7d+/v8R50tPTZbFY3H727dt3SYUDAIDKwaswkpGRodGjR2vr1q1KS0vTuXPnFBcXp9OnT5c67/79+5WTk+P8ufbaa8tdNAAAqDx8vOn84YcfukwvXLhQYWFh+uqrr9SlS5cS5w0LC1ONGjW8LhAAAFRul3TPyMmTJyVJtWrVKrVv+/btFR4ertjYWG3cuPFShgUAAJWIV2dGLmQYhhISEnTrrbeqdevWxfYLDw/Xm2++qQ4dOshut2vJkiWKjY1Venp6sWdT7Ha77Ha7czovL0+S5HA45HA4ylsygKtQ0T5tq2KYXIl3OBYBpSvrfmIxDKNcR4DRo0dr/fr12rx5s+rXr+/VvH379pXFYtGaNWs8vj5lyhRNnTrVrX358uUKCAgoT7kAAOAvdubMGd133306efKkgoKCiu1XrjDy2GOPafXq1frss8/UqFEjr4ubPn26li5dqr1793p83dOZkYiICB09erTENwOg4nE4HEpLS9PErCqyF1rMLqfMdk+JN7sE4KqXl5en0NDQUsOIV5dpDMPQY489plWrVik9Pb1cQUSStm/frvDw8GJft9lsstlsbu1Wq1VWq7VcYwK4utkLLbIXVJwwwrEIKF1Z9xOvwsjo0aO1fPlyffDBB6pevbpyc3MlScHBwfL395ckJSYm6siRI1q8eLEkadasWYqMjFSrVq2Un5+vpUuXKiUlRSkpKd4MDQAAKimvwsjcuXMlSV27dnVpX7hwoYYNGyZJysnJUXZ2tvO1/Px8jRs
3TkeOHJG/v79atWql9evXq1evXpdWOQAAqBS8vkxTmkWLFrlMP/XUU3rqqae8KgoAAPz34LtpAACAqQgjAADAVIQRAABgKsIIAAAwFWEEAACYijACAABMRRgBAACmIowAAABTEUYAAICpCCMAAMBUhBEAAGAqwggAADAVYQQAAJiKMAIAAExFGAEAAKYijAAAAFMRRgAAgKkIIwAAwFSEEQAAYCrCCAAAMBVhBAAAmIowAgAATEUYAQAApiKMAAAAUxFGAACAqQgjAADAVIQRAABgKsIIAAAwFWEEAACYijACAABMRRgBAACmIowAAABTEUYAAICpvAojSUlJuuGGG1S9enWFhYXpzjvv1P79+0udLyMjQx06dJCfn58aN26s5OTkchcMAAAqF6/CSEZGhkaPHq2tW7cqLS1N586dU1xcnE6fPl3sPAcOHFCvXr3UuXNnbd++XePHj9fjjz+ulJSUSy4eAABUfD7edP7www9dphcuXKiwsDB99dVX6tKli8d5kpOT1aBBA82aNUuSFBUVpaysLM2cOVP9+/cvX9UAAKDSuKR7Rk6ePClJqlWrVrF9MjMzFRcX59IWHx+vrKwsORyOSxkeAABUAl6dGbmQYRhKSEjQrbfeqtatWxfbLzc3V7Vr13Zpq127ts6dO6ejR48qPDzcbR673S673e6czsvLkyQ5HA4CDFDJFO3TtiqGyZV4h2MRULqy7iflDiOPPvqovvnmG23evLnUvhaLxWXaMAyP7UWSkpI0depUt/YNGzYoICCgHNUCuNo927HQ7BK8kpqaanYJwFXvzJkzZepXrjDy2GOPac2aNfrss89Uv379EvvWqVNHubm5Lm2//vqrfHx8FBIS4nGexMREJSQkOKfz8vIUERGhuLg4BQUFladkAFcph8OhtLQ0TcyqInuh519Qrka7p8SbXQJw1Su6slEar8KIYRh67LHHtGrVKqWnp6tRo0alzhMdHa21a9e6tG3YsEEdO3aU1Wr1OI/NZpPNZnNrt1qtxc4DoGKzF1pkL6g4YYRjEVC6su4nXt3AOnr0aC1dulTLly9X9erVlZubq9zcXP3555/OPomJiRoyZIhzetSoUTp06JASEhK0d+9eLViwQPPnz9e4ceO8GRoAAFRSXoWRuXPn6uTJk+ratavCw8OdPytXrnT2ycnJUXZ2tnO6UaNGSk1NVXp6utq1a6dnn31Wr776Ko/1AgAASeW4TFOaRYsWubXFxMTo66+/9mYoAADwX4LvpgEAAKYijAAAAFMRRgAAgKkIIwAAwFSEEQAAYCrCCAAAMBVhBAAAmIowAgAATEUYAQAApiKMAAAAUxFGAACAqQgjAADAVIQRAABgKsIIAAAwFWEEAACYijACAABMRRgBAACmIowAAABTEUYAAICpCCMAAMBUhBEAAGAqwggAADAVYQQAAJiKMAIAAExFGAEAAKYijAAAAFMRRgAAgKkIIwAAwFSEEQAAYCrCCAAAMBVhBAAAmIowAgAATEUYAQAApvI6jHz22Wfq27ev6tatK4vFotWrV5fYPz09XRaLxe1n37595a0ZAABUIj7eznD69Gm1bdtWw4cPV//+/cs83/79+xUUFOScvuaaa7wdGgAAVEJeh5GePXuqZ8+eXg8UFhamGjVqeD0fAACo3P6ye0bat2+v8PBwxcbGauPGjX/VsAAA4Crn9ZkRb4WHh+vNN99Uhw4dZLfbtWTJEsXGxio9PV1dunTxOI/dbpfdbndO5+XlSZIcDoccDseVLhnAX6hon7ZVMUyuxDsci4DSlXU/sRiGUe4jgMVi0apVq3TnnXd6NV/fvn1lsVi0Zs0aj69PmTJFU6dOdWtfvny5AgICylMqAAD4i505c0b33XefTp486XLf6MWu+JkRTzp16qSlS5cW+3piYqISEhKc03l5eYqIiFBcXFyJbwZAxeNwOJSWlqaJWVVkL7SYXU6Z7Z4Sb3YJwFWv6MpGaUwJI9u3b1d4eHixr9t
sNtlsNrd2q9Uqq9V6JUsDYBJ7oUX2gooTRjgWAaUr637idRj5448/9MMPPzinDxw4oB07dqhWrVpq0KCBEhMTdeTIES1evFiSNGvWLEVGRqpVq1bKz8/X0qVLlZKSopSUFG+HBgAAlZDXYSQrK0vdunVzThddThk6dKgWLVqknJwcZWdnO1/Pz8/XuHHjdOTIEfn7+6tVq1Zav369evXqdRnKBwAAFd0l3cD6V8nLy1NwcHCpN8AAqHgcDodSU1P11JdVK9RlmoMzeptdAnDVK+v/33w3DQAAMBVhBAAAmIowAgAATEUYAQAApiKMAAAAUxFGAACAqQgjAADAVIQRAABgKsIIAAAwFWEEAACYijACAABMRRgBAACmIowAAABTEUYAAICpCCMAAMBUhBEAAGAqwggAADAVYQQAAJiKMAIAAExFGAEAAKYijAAAAFMRRgAAgKkIIwAAwFSEEQAAYCrCCAAAMBVhBAAAmIowAgAATEUYAQAApiKMAAAAUxFGAACAqQgjAADAVIQRAABgKsIIAAAwlddh5LPPPlPfvn1Vt25dWSwWrV69utR5MjIy1KFDB/n5+alx48ZKTk4uT60AAKAS8jqMnD59Wm3bttXrr79epv4HDhxQr1691LlzZ23fvl3jx4/X448/rpSUFK+LBQAAlY+PtzP07NlTPXv2LHP/5ORkNWjQQLNmzZIkRUVFKSsrSzNnzlT//v29HR4AAFQyV/yekczMTMXFxbm0xcfHKysrSw6H40oPDwAArnJenxnxVm5urmrXru3SVrt2bZ07d05Hjx5VeHi42zx2u112u905nZeXJ0lyOBwEGKCSKdqnbVUMkyvxDscioHRl3U+ueBiRJIvF4jJtGIbH9iJJSUmaOnWqW/uGDRsUEBBw+QsEYLpnOxaaXYJXUlNTzS4BuOqdOXOmTP2ueBipU6eOcnNzXdp+/fVX+fj4KCQkxOM8iYmJSkhIcE7n5eUpIiJCcXFxCgoKuqL1AvhrORwOpaWlaWJWFdkLPf+CcjXaPSXe7BKAq17RlY3SXPEwEh0drbVr17q0bdiwQR07dpTVavU4j81mk81mc2u3Wq3FzgOgYrMXWmQvqDhhhGMRULqy7ide38D6xx9/aMeOHdqxY4ek84/u7tixQ9nZ2ZLOn9UYMmSIs/+oUaN06NAhJSQkaO/evVqwYIHmz5+vcePGeTs0AACohLw+M5KVlaVu3bo5p4supwwdOlSLFi1STk6OM5hIUqNGjZSamqqxY8dq9uzZqlu3rl599VUe6wUAAJLKEUa6du3qvAHVk0WLFrm1xcTE6Ouvv/Z2KAAA8F+A76YBAACmIowAAABTEUYAAICpCCMAAMBUhBEAAGAqwggAADAVYQQAAJiKMAIAAExFGAEAAKYijAAAAFMRRgAAgKkIIwAAwFSEEQAAYCrCCAAAMBVhBAAAmIowAgAATEUYAQAApiKMAAAAUxFGAACAqQgjAADAVIQRAABgKsIIAAAwFWEEAACYijACAABMRRgBAACmIowAAABTEUYAAICpCCMAAMBUhBEAAGAqwggAADAVYQQAAJiKMAIAAExFGAEAAKYqVxiZM2eOGjVqJD8/P3Xo0EGbNm0qtm96erosFovbz759+8pdNAAAqDy8DiMrV67UmDFjNGHCBG3fvl2dO3dWz549lZ2dXeJ8+/fvV05OjvPn2muvLXfRAACg8vA6jLz88ssaMWKERo4cqaioKM2aNUsRERGaO3duifOFhYWpTp06zp+qVauWu2gAAFB5eBVG8vPz9dVXXykuLs6lPS4uTlu2bClx3vbt2ys8PFyxsbHauHGj95UCAIBKycebzkePHlVBQYFq167t0l67dm3l5uZ6nCc8PFxvvvmmOnToILvdriVLlig2Nlbp6enq0qWLx3nsdrvsdrtzOi8vT5LkcDjkcDi8KRnAVa5on7ZVMUyuxDsci4DSlXU/8SqMFLFYLC7ThmG4tRVp3ry5mjdv7pyOjo7
W4cOHNXPmzGLDSFJSkqZOnerWvmHDBgUEBJSnZABXuWc7FppdgldSU1PNLgG46p05c6ZM/bwKI6GhoapatarbWZBff/3V7WxJSTp16qSlS5cW+3piYqISEhKc03l5eYqIiFBcXJyCgoK8KRnAVc7hcCgtLU0Ts6rIXuj5l5qr0e4p8WaXAFz1iq5slMarMOLr66sOHTooLS1Nd911l7M9LS1N/fr1K/Nytm/frvDw8GJft9lsstlsbu1Wq1VWq9WbkgFUEPZCi+wFFSeMcCwCSlfW/cTryzQJCQkaPHiwOnbsqOjoaL355pvKzs7WqFGjJJ0/q3HkyBEtXrxYkjRr1ixFRkaqVatWys/P19KlS5WSkqKUlBRvhwYAAJWQ12Hknnvu0bFjxzRt2jTl5OSodevWSk1NVcOGDSVJOTk5Ln9zJD8/X+PGjdORI0fk7++vVq1aaf369erVq9flexcAAKDCshiGcdXfwp6Xl6fg4GCdPHmSe0aASsbhcCg1NVVPfVm1Ql2mOTijt9klAFe9sv7/zXfTAAAAUxFGAACAqQgjAADAVIQRAABgKsIIAAAwFWEEAACYijACAABMRRgBAACmIowAAABTEUYAAICpCCMAAMBUhBEAAGAqwggAADAVYQQAAJiKMAIAAExFGAEAAKYijAAAAFMRRgAAgKkIIwAAwFSEEQAAYCrCCAAAMBVhBAAAmIowAgAATEUYAQAApiKMAAAAUxFGAACAqQgjAADAVIQRAABgKsIIAAAwFWEEAACYijACAABMRRgBAACmIowAAABTlSuMzJkzR40aNZKfn586dOigTZs2ldg/IyNDHTp0kJ+fnxo3bqzk5ORyFQsAACofr8PIypUrNWbMGE2YMEHbt29X586d1bNnT2VnZ3vsf+DAAfXq1UudO3fW9u3bNX78eD3++ONKSUm55OIBAEDF53UYefnllzVixAiNHDlSUVFRmjVrliIiIjR37lyP/ZOTk9WgQQPNmjVLUVFRGjlypB588EHNnDnzkosHAAAVn1dhJD8/X1999ZXi4uJc2uPi4rRlyxaP82RmZrr1j4+PV1ZWlhwOh5flAgCAysbHm85Hjx5VQUGBateu7dJeu3Zt5ebmepwnNzfXY/9z587p6NGjCg8Pd5vHbrfLbrc7p0+ePClJOn78OAEGqGQcDofOnDkjH0cVFRRazC6nzI4dO2Z2CcBV79SpU5IkwzBK7OdVGClisbgeMAzDcGsrrb+n9iJJSUmaOnWqW3ujRo28LRUArojQl8yuAKg4Tp06peDg4GJf9yqMhIaGqmrVqm5nQX799Ve3sx9F6tSp47G/j4+PQkJCPM6TmJiohIQE53RhYaGOHz+ukJCQEkMPgIonLy9PEREROnz4sIKCgswuB8BlZBiGTp06pbp165bYz6sw4uvrqw4dOigtLU133XWXsz0tLU39+vXzOE90dLTWrl3r0rZhwwZ17NhRVqvV4zw2m002m82lrUaNGt6UCqCCCQoKIowAlVBJZ0SKeP00TUJCgubNm6cFCxZo7969Gjt2rLKzszVq1ChJ589qDBkyxNl/1KhROnTokBISErR3714tWLBA8+fP17hx47wdGgAAVEJe3zNyzz336NixY5o2bZpycnLUunVrpaamqmHDhpKknJwcl7850qhRI6Wmpmrs2LGaPXu26tatq1dffVX9+/e/fO8CAABUWBajtFtcAeAKstvtSkpKUmJiotvlWQD/HQgjAADAVHxRHgAAMBVhBAAAmIowAgAATEUYAXBZpKeny2Kx6MSJE2aXAqCCIYwAAABTEUYAAICpCCMAJEmRkZGaNWuWS1u7du00ZcoUSee/2HLevHm66667FBAQoGuvvVZr1qwpdnl//vmnevfurU6dOun48eM6ePCgLBaL3n//fXXr1k0BAQFq27atMjMzXeZLSUlRq1atZLPZFBkZqZde+v/fSPfaa6/puuuuc06vXr1aFotFs2fPdrbFx8crMTFRkjR
lyhS1a9dOS5YsUWRkpIKDg3Xvvfc6v0kUwNWBMAKgzKZOnaqBAwfqm2++Ua9evXT//ffr+PHjbv1OnjypuLg45efn65NPPlGtWrWcr02YMEHjxo3Tjh071KxZMw0aNEjnzp2TJH311VcaOHCg7r33Xu3atUtTpkzRxIkTtWjRIklS165dtWfPHh09elSSlJGRodDQUGVkZEiSzp07py1btigmJsY53o8//qjVq1dr3bp1WrdunTIyMjRjxowrtYoAlANhBECZDRs2TIMGDVLTpk31j3/8Q6dPn9aXX37p0ueXX35RTEyMwsLCtH79egUGBrq8Pm7cOPXu3VvNmjXT1KlTdejQIf3www+SpJdfflmxsbGaOHGimjVrpmHDhunRRx/Viy++KElq3bq1QkJCnOEjPT1dTzzxhHN627ZtOnv2rG699VbneIWFhVq0aJFat26tzp07a/Dgwfrkk0+u2DoC4D3CCIAya9OmjfPfgYGBql69un799VeXPrfddpsaN26sd955R76+viUuIzw8XJKcy9i7d69uueUWl/633HKLvv/+exUUFMhisahLly5KT0/XiRMntGfPHo0aNUoFBQXau3ev0tPTdf3116tatWrO+SMjI1W9enWXMS+uGYC5CCMAJElVqlTRxd8O4XA4XKatVqvLtMViUWFhoUtb7969tWnTJn377bcex7lwGRaLRZKcyzAMw9lW5OKaunbtqvT0dG3atElt27ZVjRo11KVLF2VkZCg9PV1du3b1umYA5iKMAJAkXXPNNcrJyXFO5+Xl6cCBA14vZ8aMGRo6dKhiY2OLDSTFadmypTZv3uzStmXLFjVr1kxVq1aV9P/vG3nvvfecwSMmJkYff/yx2/0iACoGwggASVL37t21ZMkSbdq0Sbt379bQoUOdAcBbM2fO1P3336/u3btr3759ZZ7viSee0CeffKJnn31W3333nd566y29/vrrGjdunLNP0X0jy5Ytc4aRrl27avXq1frzzz9d7hcBUDH4mF0AgKtDYmKi/vOf/6hPnz4KDg7Ws88+W64zI0X++c9/qqCgQN27d1d6errH+0cudv311+udd97RpEmT9Oyzzyo8PFzTpk3TsGHDnH0sFotiYmK0evVqde7cWdL5+1CCg4PVuHFjBQUFlbtmAOawGBdfkAUAAPgLcZkGAACYijACAABMRRgBAACmIowAAABTEUYAAICpCCMAAMBUhBEAAGAqwggAADAVYQRAmVkslhJ/LvxLqX+1yMhIzZo1y7TxAZQffw4eQJld+EV6K1eu1KRJk7R//35nm7+/v1fLy8/PL9OfiQdQuXFmBECZ1alTx/kTHBwsi8XinLZarRo1apTq16+vgIAAXXfddXr77bdd5u/ataseffRRJSQkKDQ0VD169JAkrVmzRtdee638/f3VrVs3vfXWW7JYLDpx4oRz3i1btqhLly7y9/dXRESEHn/8cZ0+fdq53EOHDmns2LHOszQAKg7CCIDL4uzZs+rQoYPWrVun3bt363/+5380ePBgffHFFy793nrrLfn4+Ojzzz/XG2+8oYMHD2rAgAG68847tWPHDj388MOaMGGCyzy7du1SfHy87r77bn3zzTdauXKlNm/erEcffVSS9P7776t+/fqaNm2acnJyXM7gALj68UV5AMpl0aJFGjNmjMvZi4v17t1bUVFRmjlzpqTzZzBOnjyp7du3O/v8/e9/1/r167Vr1y5n2zPPPKPp06fr999/V40aNTRkyBD5+/vrjTfecPbZvHmzYmJidPr0afn5+SkyMlJjxozRmDFjLvt7BXBlcc8IgMuioKBAM2bM0MqVK3XkyBHZ7XbZ7XYFBga69OvYsaPL9P79+3XDDTe4tN14440u01999ZV++OEHLVu2zNlmGIYKCwt14MABRUVFXeZ3A+CvRBgBcFm89NJL+uc//6lZs2bpuuuuU2BgoMaMGaP8/HyXfheHE8Mw3O7xuPiEbWFhoR5++GE9/vj
jbuM2aNDgMr0DAGYhjAC4LDZt2qR+/frpgQcekHQ+QHz//felnrVo0aKFUlNTXdqysrJcpq+//nrt2bNHTZs2LXY5vr6+KigoKGf1AMzEDawALoumTZsqLS1NW7Zs0d69e/Xwww8rNze31Pkefvhh7du3T08//bS+++47vfPOO1q0aJEkOc+YPP3008rMzNTo0aO1Y8cOff/991qzZo0ee+wx53IiIyP12Wef6ciRIzp69OgVeY8ArgzCCIDLYuLEibr++usVHx+vrl27qk6dOrrzzjtLna9Ro0Z677339P7776tNmzaaO3eu82kam80mSWrTpo0yMjL0/fffq3Pnzmrfvr0mTpyo8PBw53KmTZumgwcPqkmTJrrmmmuuyHsEcGXwNA2Aq8706dOVnJysw4cPm10KgL8A94wAMN2cOXN0ww03KCQkRJ9//rlefPFF598QAVD5EUYAmO7777/Xc889p+PHj6tBgwZ64oknlJiYaHZZAP4iXKYBAACm4gZWAABgKsIIAAAwFWEEAACYijACAABMRRgBAACmIowAAABTEUYAAICpCCMAAMBUhBEAAGCq/wd9UcKr1s6aGAAAAABJRU5ErkJggg==", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "8 users in cluster 0\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# After clustering, we would like to see what the replaced mode argmax distribution in each cluster is.\n", "\n", @@ -555,14 +310,14 @@ " ax.set_title(f\"Replaced mode argmax distribution for users in cluster {cix}\")\n", " ax.set_xlabel(\"Target\")\n", " \n", - " plt.savefig(f'./outputs/{CURRENT_DB}__FIG1_cluster_{cix}_target_dist.png', dpi=300)\n", + " plt.savefig(OUTPUT_DIR / f'{CURRENT_DB}__FIG1_cluster_{cix}_target_dist.png', dpi=300)\n", " \n", " plt.show()" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "f2e8e117", "metadata": {}, "outputs": [], @@ -579,7 +334,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "99369dba", "metadata": {}, "outputs": [], @@ -589,7 +344,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "id": "6cca3671", "metadata": {}, "outputs": [], @@ -610,7 +365,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "id": "18093734", "metadata": {}, "outputs": [], @@ -623,200 +378,10 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "8001a140", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pct_trips_unknownpct_trips_cardistance_unknowndistance_carduration_carduration_unknown
0600d3df-c1aa-4ca2-83f2-1f6b8931280d1.00.00.1669780.00.00.031289
44eda4da-9223-4bb0-afd4-e7dd19fc6b271.00.00.3763080.00.00.362037
4c5436e9-4840-4872-9e8f-5d46ba81fe521.00.00.0000000.00.00.000000
7479810c-c602-4508-8ae2-da0bed87558d1.00.00.8021320.00.00.447344
7f7c9d3b-84ed-4c14-be8a-aa256daaed011.00.00.2093200.00.00.172709
892088f9-4a27-4f39-91fb-0f5e48d189821.00.00.9825190.00.00.705049
993af3be-5011-44ad-b9cd-d4df7f0e67ad1.00.00.6593890.00.01.000000
c8158323-957d-43c7-bde6-193b99ee72b51.00.00.1004480.00.00.030035
cbed6b7b-555d-43a0-aadc-4a42540a024e1.00.00.3736100.00.00.228214
de83c290-7708-4f8b-8ca3-656072164ef60.01.00.5359491.01.00.700681
f3b93934-09ca-4b90-9089-51b5777bb9e71.00.01.0000000.00.00.740508
f8260067-8ba9-44ea-9c39-cd3e1bd003dd1.00.00.2326130.00.00.250613
\n", - "
" - ], - "text/plain": [ - " pct_trips_unknown pct_trips_car \\\n", - "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 1.0 0.0 \n", - "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 1.0 0.0 \n", - "4c5436e9-4840-4872-9e8f-5d46ba81fe52 1.0 0.0 \n", - "7479810c-c602-4508-8ae2-da0bed87558d 1.0 0.0 \n", - "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 1.0 0.0 \n", - "892088f9-4a27-4f39-91fb-0f5e48d18982 1.0 0.0 \n", - "993af3be-5011-44ad-b9cd-d4df7f0e67ad 1.0 0.0 \n", - "c8158323-957d-43c7-bde6-193b99ee72b5 1.0 0.0 \n", - "cbed6b7b-555d-43a0-aadc-4a42540a024e 1.0 0.0 \n", - "de83c290-7708-4f8b-8ca3-656072164ef6 0.0 1.0 \n", - "f3b93934-09ca-4b90-9089-51b5777bb9e7 1.0 0.0 \n", - "f8260067-8ba9-44ea-9c39-cd3e1bd003dd 1.0 0.0 \n", - "\n", - " distance_unknown distance_car \\\n", - "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.166978 0.0 \n", - "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.376308 0.0 \n", - "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.000000 0.0 \n", - "7479810c-c602-4508-8ae2-da0bed87558d 0.802132 0.0 \n", - "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.209320 0.0 \n", - "892088f9-4a27-4f39-91fb-0f5e48d18982 0.982519 0.0 \n", - "993af3be-5011-44ad-b9cd-d4df7f0e67ad 0.659389 0.0 \n", - "c8158323-957d-43c7-bde6-193b99ee72b5 0.100448 0.0 \n", - "cbed6b7b-555d-43a0-aadc-4a42540a024e 0.373610 0.0 \n", - "de83c290-7708-4f8b-8ca3-656072164ef6 0.535949 1.0 \n", - "f3b93934-09ca-4b90-9089-51b5777bb9e7 1.000000 0.0 \n", - "f8260067-8ba9-44ea-9c39-cd3e1bd003dd 0.232613 0.0 \n", - "\n", - " duration_car duration_unknown \n", - "0600d3df-c1aa-4ca2-83f2-1f6b8931280d 0.0 0.031289 \n", - "44eda4da-9223-4bb0-afd4-e7dd19fc6b27 0.0 0.362037 \n", - "4c5436e9-4840-4872-9e8f-5d46ba81fe52 0.0 0.000000 \n", - "7479810c-c602-4508-8ae2-da0bed87558d 0.0 0.447344 \n", - "7f7c9d3b-84ed-4c14-be8a-aa256daaed01 0.0 0.172709 \n", - "892088f9-4a27-4f39-91fb-0f5e48d18982 0.0 0.705049 \n", - "993af3be-5011-44ad-b9cd-d4df7f0e67ad 0.0 1.000000 \n", - "c8158323-957d-43c7-bde6-193b99ee72b5 0.0 0.030035 \n", - "cbed6b7b-555d-43a0-aadc-4a42540a024e 
0.0 0.228214 \n", - "de83c290-7708-4f8b-8ca3-656072164ef6 1.0 0.700681 \n", - "f3b93934-09ca-4b90-9089-51b5777bb9e7 0.0 0.740508 \n", - "f8260067-8ba9-44ea-9c39-cd3e1bd003dd 0.0 0.250613 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "target_df = user_target_pct.merge(right=target_distance, left_index=True, right_index=True).merge(\n", " right=target_duration, left_index=True, right_index=True\n", @@ -843,7 +408,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "id": "31fecc00", "metadata": {}, "outputs": [], @@ -878,21 +443,10 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "id": "e39b41ba", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Counter({0: 11, -1: 1})" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# 0.35 is a good value\n", "\n", @@ -910,21 +464,10 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "1dbf8763", "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from sklearn.decomposition import PCA\n", "\n", @@ -933,31 +476,23 @@ "fig, ax = plt.subplots()\n", "sns.scatterplot(x=tsfm[:,0], y=tsfm[:,1], c=cl2.labels_)\n", "ax.set(xlabel='Latent Dim 0', ylabel='Latent Dim 1')\n", - "plt.savefig(f'./outputs/{CURRENT_DB}__Fig2__PCA_w_colors.png', dpi=300)\n", + "plt.savefig(OUTPUT_DIR / f'{CURRENT_DB}__Fig2__PCA_w_colors.png', dpi=300)\n", "plt.show()" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "1e444316", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['duration', 'distance', 'start:hour', 'end:hour', 'user_id', 'target', 'section_mode_argmax', 'section_distance_argmax', 'section_duration_argmax', 'is_student', 'is_paid', 'has_drivers_license', 'n_residents_u18', 'n_residence_members', 'income_category', 'available_modes', 'n_residents_with_license', 'n_working_residents', 'n_motor_vehicles', 'has_medical_condition', 'ft_job', 'multiple_jobs', 'highest_education_bachelor_s_degree', 'highest_education_high_school_graduate_or_ged', 'highest_education_prefer_not_to_say', 'highest_education_some_college_or_associates_degree', 'primary_job_description_Clerical or administrative support', 'primary_job_description_Other', 'gender_man', 'gender_woman', 'age_16___20_years_old', 'age_21___25_years_old', 'age_26___30_years_old', 'av_ridehail', 'av_p_micro', 'av_walk', 'av_transit', 'av_car', 'av_s_micro', 'av_s_car', 'av_unknown', 'av_no_trip', 'cost_ridehail', 'cost_p_micro', 'cost_walk', 'cost_transit', 'cost_car', 'cost_s_micro', 'cost_s_car', 'cost_unknown', 'cost_no_trip', 'mph']\n" - ] - } - ], + "outputs": [], "source": [ "print(df.columns.tolist())" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "f0bc09b9", "metadata": {}, "outputs": [], @@ -972,119 +507,87 @@ "\n", "\n", "demographic_cols = {\n", - " 
'allceo': [\n", - " 'has_drivers_license', 'is_student', 'is_paid', 'income_category',\n", - " 'n_residence_members', 'n_residents_u18', 'n_residents_with_license',\n", - " 'n_motor_vehicles', 'has_medical_condition',\n", - " 'ft_job', 'multiple_jobs', 'n_working_residents',\n", - " \"highest_education_Bachelor's degree\",\n", - " 'highest_education_Graduate degree or professional degree',\n", - " 'highest_education_High school graduate or GED',\n", - " 'highest_education_Less than a high school graduate',\n", - " 'highest_education_Prefer not to say',\n", - " 'highest_education_Some college or associates degree',\n", - " 'primary_job_description_Clerical or administrative support',\n", - " 'primary_job_description_Custodial',\n", - " 'primary_job_description_Education',\n", - " 'primary_job_description_Food service',\n", - " 'primary_job_description_Linecook',\n", - " 'primary_job_description_Manufacturing, construction, maintenance, or farming',\n", - " 'primary_job_description_Medical/healthcare',\n", - " 'primary_job_description_Non-profit program manager',\n", - " 'primary_job_description_Other',\n", - " 'primary_job_description_Professional, managerial, or technical',\n", - " 'primary_job_description_Sales or service',\n", - " 'primary_job_description_Self employed',\n", - " 'primary_job_description_food service', 'gender_Man',\n", - " 'gender_Nonbinary/genderqueer/genderfluid', 'gender_Prefer not to say',\n", - " 'gender_Woman', 'gender_Woman;Nonbinary/genderqueer/genderfluid',\n", - " 'age_16___20_years_old', 'age_21___25_years_old',\n", - " 'age_26___30_years_old', 'age_31___35_years_old',\n", - " 'age_36___40_years_old', 'age_41___45_years_old',\n", - " 'age_46___50_years_old', 'age_51___55_years_old',\n", - " 'age_56___60_years_old', 'age_61___65_years_old', 'age___65_years_old',\n", - " 'av_transit', 'av_no_trip', 'av_p_micro', 'av_s_micro', 'av_ridehail',\n", - " 'av_unknown', 'av_walk', 'av_car', 'av_s_car'\n", - " ],\n", - " 'durham': [\n", - " 
'is_student', 'is_paid', 'has_drivers_license', 'n_residents_u18',\n", - " 'n_residence_members', 'income_category',\n", - " 'n_residents_with_license', 'n_working_residents', 'n_motor_vehicles',\n", - " 'has_medical_condition', 'ft_job', 'multiple_jobs',\n", - " 'highest_education_bachelor_s_degree',\n", - " 'highest_education_graduate_degree_or_professional_degree',\n", - " 'highest_education_high_school_graduate_or_ged',\n", - " 'highest_education_less_than_a_high_school_graduate',\n", - " 'highest_education_some_college_or_associates_degree',\n", - " 'primary_job_description_Clerical or administrative support',\n", - " 'primary_job_description_Manufacturing, construction, maintenance, or farming',\n", - " 'primary_job_description_Other',\n", - " 'primary_job_description_Professional, Manegerial, or Technical',\n", - " 'primary_job_description_Sales or service', 'gender_man',\n", - " 'gender_non_binary_genderqueer_gender_non_confor', 'gender_woman',\n", - " 'age_16___20_years_old', 'age_21___25_years_old',\n", - " 'age_26___30_years_old', 'age_31___35_years_old',\n", - " 'age_36___40_years_old', 'age_41___45_years_old',\n", - " 'age_51___55_years_old', 'age_56___60_years_old', 'av_walk',\n", - " 'av_unknown', 'av_no_trip', 'av_p_micro', 'av_transit', 'av_car',\n", - " 'av_ridehail', 'av_s_micro', 'av_s_car'\n", - " ],\n", - " 'nicr': [\n", - " 'is_student', 'is_paid',\n", - " 'has_drivers_license', 'n_residents_u18', 'n_residence_members',\n", - " 'income_category', 'n_residents_with_license',\n", - " 'n_working_residents', 'n_motor_vehicles', 'has_medical_condition',\n", - " 'ft_job', 'multiple_jobs',\n", - " 'highest_education_high_school_graduate_or_ged',\n", - " 'highest_education_prefer_not_to_say', 'primary_job_description_Other',\n", - " 'gender_man', 'gender_woman', 'age_16___20_years_old', 'av_p_micro',\n", - " 'av_car', 'av_transit', 'av_ridehail', 'av_no_trip', 'av_s_car',\n", - " 'av_s_micro', 'av_unknown', 'av_walk'\n", - " ],\n", - " 'masscec': 
[\n", - " 'is_student', 'is_paid',\n", - " 'has_drivers_license', 'n_residents_u18', 'n_residence_members',\n", - " 'income_category', 'n_residents_with_license',\n", - " 'n_working_residents', 'n_motor_vehicles', 'has_medical_condition',\n", - " 'ft_job', 'multiple_jobs', 'highest_education_bachelor_s_degree',\n", - " 'highest_education_graduate_degree_or_professional_degree',\n", - " 'highest_education_high_school_graduate_or_ged',\n", - " 'highest_education_less_than_a_high_school_graduate',\n", - " 'highest_education_prefer_not_to_say',\n", - " 'highest_education_some_college_or_associates_degree',\n", - " 'primary_job_description_Clerical or administrative support',\n", - " 'primary_job_description_Manufacturing, construction, maintenance, or farming',\n", - " 'primary_job_description_Other',\n", - " 'primary_job_description_Prefer not to say',\n", - " 'primary_job_description_Professional, Manegerial, or Technical',\n", - " 'primary_job_description_Sales or service', 'gender_man',\n", - " 'gender_prefer_not_to_say', 'gender_woman', 'age_16___20_years_old',\n", - " 'age_21___25_years_old', 'age_26___30_years_old',\n", - " 'age_31___35_years_old', 'age_36___40_years_old',\n", - " 'age_41___45_years_old', 'age_46___50_years_old',\n", - " 'age_51___55_years_old', 'age_56___60_years_old',\n", - " 'age_61___65_years_old', 'age___65_years_old', 'av_p_micro', 'av_s_car',\n", - " 'av_s_micro', 'av_transit', 'av_car', 'av_no_trip', 'av_unknown',\n", - " 'av_ridehail', 'av_walk'\n", - " ],\n", - " 'ride2own': [\n", - " 'has_drivers_license', 'is_student',\n", - " 'is_paid', 'income_category', 'n_residence_members',\n", - " 'n_working_residents', 'n_residents_u18', 'n_residents_with_license',\n", - " 'n_motor_vehicles', 'has_medical_condition',\n", - " 'ft_job', 'multiple_jobs',\n", - " 'highest_education_bachelor_s_degree',\n", - " 'highest_education_high_school_graduate_or_ged',\n", - " 'highest_education_less_than_a_high_school_graduate',\n", - " 
'highest_education_some_college_or_associates_degree',\n", - " 'primary_job_description_Other',\n", - " 'primary_job_description_Professional, Manegerial, or Technical',\n", - " 'gender_man', 'gender_woman', 'age_31___35_years_old',\n", - " 'age_36___40_years_old', 'age_41___45_years_old',\n", - " 'age_51___55_years_old', 'av_no_trip', 'av_s_micro', 'av_transit',\n", - " 'av_car', 'av_ridehail', 'av_p_micro', 'av_s_car', 'av_walk',\n", - " 'av_unknown'\n", - " ]\n", + " 'allceo': [ \n", + " 'has_drivers_license', 'is_student', 'is_paid', 'income_category', 'n_residence_members', \n", + " 'n_residents_u18', 'n_residents_with_license', 'n_motor_vehicles',\n", + " 'has_medical_condition', 'ft_job', 'multiple_jobs', 'n_working_residents', \n", + " \"highest_education_Bachelor's degree\", 'highest_education_Graduate degree or professional degree', \n", + " 'highest_education_High school graduate or GED', 'highest_education_Less than a high school graduate', \n", + " 'highest_education_Prefer not to say', 'highest_education_Some college or associates degree', \n", + " 'primary_job_description_Clerical or administrative support', 'primary_job_description_Custodial', \n", + " 'primary_job_description_Education', \n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming', \n", + " 'primary_job_description_Medical/healthcare', 'primary_job_description_Other', 'gender_Man', \n", + " 'gender_Man;Nonbinary/genderqueer/genderfluid', 'gender_Nonbinary/genderqueer/genderfluid', \n", + " 'gender_Prefer not to say', 'gender_Test', 'gender_Woman', 'gender_Woman;Nonbinary/genderqueer/genderfluid', \n", + " 'age_16___20_years_old', 'age_1___5_years_old', 'age_21___25_years_old', 'age_26___30_years_old', \n", + " 'age_31___35_years_old', 'age_36___40_years_old', 'age_41___45_years_old', 'age_46___50_years_old', \n", + " 'age_51___55_years_old', 'age_56___60_years_old', 'age_61___65_years_old', 'age___65_years_old', \n", + " 'av_s_car', 'av_walk', 
'av_ridehail', 'av_s_micro', 'av_transit', 'av_no_trip', 'av_car', 'av_unknown', \n", + " 'av_p_micro'\n", + " ],\n", + " 'durham': [\n", + " 'is_student', 'is_paid', 'has_drivers_license', 'n_residents_u18', 'n_residence_members', 'income_category',\n", + " 'n_residents_with_license', 'n_working_residents', 'n_motor_vehicles', 'has_medical_condition', 'ft_job',\n", + " 'multiple_jobs', 'highest_education_bachelor_s_degree', 'highest_education_graduate_degree_or_professional_degree',\n", + " 'highest_education_high_school_graduate_or_ged', 'highest_education_less_than_a_high_school_graduate',\n", + " 'highest_education_some_college_or_associates_degree', 'primary_job_description_Clerical or administrative support',\n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming',\n", + " 'primary_job_description_Other', 'primary_job_description_Prefer not to say', 'primary_job_description_Professional, Manegerial, or Technical',\n", + " 'primary_job_description_Sales or service', 'gender_man', 'gender_non_binary_genderqueer_gender_non_confor',\n", + " 'gender_prefer_not_to_say', 'gender_woman', 'age_16___20_years_old', 'age_21___25_years_old', 'age_26___30_years_old',\n", + " 'age_31___35_years_old', 'age_36___40_years_old', 'age_41___45_years_old', 'age_46___50_years_old',\n", + " 'age_51___55_years_old', 'age_56___60_years_old', 'av_unknown', 'av_no_trip', 'av_s_micro', 'av_s_car', 'av_car',\n", + " 'av_p_micro', 'av_walk', 'av_transit', 'av_ridehail'\n", + " ],\n", + " 'nicr': [\n", + "\n", + " 'is_student', 'is_paid', 'has_drivers_license', 'n_residents_u18', 'n_residence_members', \n", + " 'income_category', 'n_residents_with_license', 'n_working_residents', 'n_motor_vehicles', \n", + " 'has_medical_condition', 'ft_job', 'multiple_jobs', 'highest_education_bachelor_s_degree', \n", + " 'highest_education_high_school_graduate_or_ged', 'highest_education_prefer_not_to_say', \n", + " 'highest_education_some_college_or_associates_degree', \n", 
+ " 'primary_job_description_Clerical or administrative support', 'primary_job_description_Other', \n", + " 'gender_man', 'gender_woman', 'age_16___20_years_old', 'age_21___25_years_old', 'age_26___30_years_old', \n", + " 'av_s_car', 'av_no_trip', 'av_s_micro', 'av_walk', 'av_unknown', 'av_p_micro', 'av_transit', 'av_car', \n", + " 'av_ridehail'\n", + " ],\n", + " 'masscec': [\n", + " 'is_student', 'is_paid', 'has_drivers_license', 'n_residents_u18', 'n_residence_members', \n", + " 'income_category', 'n_residents_with_license', 'n_working_residents', \n", + " 'n_motor_vehicles', 'has_medical_condition', 'ft_job', 'multiple_jobs', \n", + " 'highest_education_bachelor_s_degree', 'highest_education_graduate_degree_or_professional_degree',\n", + " 'highest_education_high_school_graduate_or_ged', \n", + " 'highest_education_less_than_a_high_school_graduate', 'highest_education_prefer_not_to_say', \n", + " 'highest_education_some_college_or_associates_degree', \n", + " 'primary_job_description_Clerical or administrative support', \n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming', \n", + " 'primary_job_description_Other', 'primary_job_description_Prefer not to say', \n", + " 'primary_job_description_Professional, Manegerial, or Technical', \n", + " 'primary_job_description_Sales or service', 'gender_man', \n", + " 'gender_non_binary_genderqueer_gender_non_confor', 'gender_prefer_not_to_say', 'gender_woman', \n", + " 'age_16___20_years_old', 'age_21___25_years_old', 'age_26___30_years_old', \n", + " 'age_31___35_years_old', 'age_36___40_years_old', 'age_41___45_years_old', \n", + " 'age_46___50_years_old', 'age_51___55_years_old', 'age_56___60_years_old', \n", + " 'age_61___65_years_old', 'age___65_years_old', 'av_no_trip', 'av_transit', \n", + " 'av_ridehail', 'av_walk', 'av_car', 'av_p_micro', 'av_unknown', 'av_s_micro', 'av_s_car'\n", + " ],\n", + " 'ride2own': [\n", + " 'has_drivers_license', 'is_student', 'is_paid', 
'income_category', 'n_residence_members', \n", + " 'n_working_residents', 'n_residents_u18', 'n_residents_with_license', 'n_motor_vehicles', \n", + " 'has_medical_condition', 'ft_job', 'multiple_jobs', 'highest_education_bachelor_s_degree', \n", + " 'highest_education_graduate_degree_or_professional_degree', \n", + " 'highest_education_high_school_graduate_or_ged', \n", + " 'highest_education_less_than_a_high_school_graduate', \n", + " 'highest_education_some_college_or_associates_degree', 'primary_job_description_Other', \n", + " 'primary_job_description_Professional, Manegerial, or Technical', \n", + " 'primary_job_description_Sales or service', 'gender_man', \n", + " 'gender_non_binary_genderqueer_gender_non_confor', 'gender_woman', 'age_16___20_years_old', \n", + " 'age_21___25_years_old', 'age_26___30_years_old', 'age_31___35_years_old', \n", + " 'age_36___40_years_old', 'age_41___45_years_old', 'age_51___55_years_old', \n", + " 'age_56___60_years_old', 'age___65_years_old', 'av_p_micro', 'av_s_car', 'av_car', \n", + " 'av_ridehail', 'av_walk', 'av_transit', 'av_no_trip', 'av_s_micro', 'av_unknown'\n", + " ]\n", "}\n", "\n", "\n", @@ -1095,191 +598,12 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "id": "5a3c6355", "metadata": { "scrolled": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "For cluster -1:\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
featurech
0is_student-0.0
24av_s_micro-0.0
23av_s_car-0.0
22av_no_trip-0.0
\n", - "
" - ], - "text/plain": [ - " feature ch\n", - "0 is_student -0.0\n", - "24 av_s_micro -0.0\n", - "23 av_s_car -0.0\n", - "22 av_no_trip -0.0" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "For cluster 0:\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
featurech
0is_student-0.0
23av_s_car-0.0
22av_no_trip-0.0
21av_ridehail-0.0
\n", - "
" - ], - "text/plain": [ - " feature ch\n", - "0 is_student -0.0\n", - "23 av_s_car -0.0\n", - "22 av_no_trip -0.0\n", - "21 av_ridehail -0.0" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], "source": [ "### DEMOGRAPHICS\n", "\n", @@ -1354,7 +678,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "id": "580bbd86", "metadata": {}, "outputs": [], @@ -1388,208 +712,12 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "id": "92ad2485", "metadata": { "scrolled": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "For cluster -1:\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/4x/l9lw50rn7qvf79m01f21x70mlpd6gh/T/ipykernel_35596/1105737326.py:49: RuntimeWarning: Mean of empty slice\n", - " out_cluster_homogeneity[cix][feature] = np.nanmean([in_cluster_homogeneity[x].get(feature, np.nan) for x in oix])\n" - ] - }, - { - "data": { - "text/plain": [ - "unknown 0.986577\n", - "car 0.013423\n", - "Name: target, dtype: float64" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
featurech
0section_duration_argmax_mean_bicycling0.0
25duration_median0.0
24duration_mean0.0
23mph_median_walking0.0
\n", - "
" - ], - "text/plain": [ - " feature ch\n", - "0 section_duration_argmax_mean_bicycling 0.0\n", - "25 duration_median 0.0\n", - "24 duration_mean 0.0\n", - "23 mph_median_walking 0.0" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "==================================================\n", - "For cluster 0:\n" - ] - }, - { - "data": { - "text/plain": [ - "unknown 1.0\n", - "Name: target, dtype: float64" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
featurech
31duration_median250175.825182
1section_duration_argmax_mean_car262552.009065
11section_distance_argmax_mean_car263103.437553
24mph_mean_walking264091.869648
\n", - "
" - ], - "text/plain": [ - " feature ch\n", - "31 duration_median 250175.825182\n", - "1 section_duration_argmax_mean_car 262552.009065\n", - "11 section_distance_argmax_mean_car 263103.437553\n", - "24 mph_mean_walking 264091.869648" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "==================================================\n" - ] - } - ], + "outputs": [], "source": [ "## TRIP SUMMARIES\n", "\n", @@ -1679,63 +807,12 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "id": "a8723e3d", "metadata": { "scrolled": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "For cluster -1:\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/4x/l9lw50rn7qvf79m01f21x70mlpd6gh/T/ipykernel_35596/2042025115.py:34: RuntimeWarning: Mean of empty slice\n", - " oc[cix][feature] = np.nanmean([ic[x].get(feature, np.nan) for x in oix])\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "==================================================\n", - "For cluster 0:\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "==================================================\n" - ] - } - ], + "outputs": [], "source": [ "ic, oc = dict(), dict()\n", "\n", @@ -1835,7 +912,7 @@ " offset_col_ix += 1\n", " \n", " plt.tight_layout()\n", - " plt.savefig(f\"./outputs/{CURRENT_DB}_cluster{cix}_combined_features.png\", dpi=300)\n", + " plt.savefig(OUTPUT_DIR / f\"{CURRENT_DB}_cluster{cix}_combined_features.png\", dpi=300)\n", " plt.show()\n", " print(50 * '=')" ] @@ -1850,7 +927,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "id": "d0288db8", "metadata": {}, "outputs": [], @@ -1863,7 +940,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "1b14ad0c", "metadata": {}, "outputs": [], @@ -1876,7 +953,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "id": "2562bbb6-66eb-4283-8c08-6e20a0b2ade5", "metadata": {}, "outputs": [], @@ -1887,7 +964,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "id": "c7aad38a", "metadata": {}, "outputs": [], @@ -1902,7 +979,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "39ce0238-b3f2-4f46-a52f-13e3160cc52f", "metadata": {}, "outputs": [], @@ -1934,7 +1011,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": "ec27cf29", "metadata": { "scrolled": false diff --git a/replacement_mode_modeling/05_biogeme_modeling.ipynb b/replacement_mode_modeling/05_biogeme_modeling.ipynb new file mode 100644 index 0000000..29e89b7 --- /dev/null +++ b/replacement_mode_modeling/05_biogeme_modeling.ipynb @@ -0,0 +1,929 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install biogeme: `pip3 install biogeme==3.2.12`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from enum import Enum\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "import pandas as pd\n", + "import biogeme.biogeme as bio\n", + "import biogeme.database as db\n", + "from biogeme import models\n", + "from biogeme.expressions import Beta, DefineVariable\n", + "from biogeme.expressions import Variable\n", + "import numpy as np\n", + "import seaborn as sns\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n", + "from sklearn.metrics import f1_score, r2_score, ConfusionMatrixDisplay\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Global experiment flags and variables.\n", + "SEED = 19348\n", + "TARGETS = ['p_micro', 'no_trip', 's_car', 'transit', 'car', 's_micro', 'ridehail', 'walk', 'unknown']\n", + "\n", + "# Set the Numpy seed too.\n", + "np.random.seed(SEED)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class SPLIT_TYPE(Enum):\n", + " INTRA_USER = 0\n", + " TARGET = 1\n", + " MODE = 2\n", + " \n", + "\n", + "class SPLIT(Enum):\n", + " TRAIN = 0\n", + " TEST = 1\n", + "\n", + "\n", + "def get_train_test_splits(data: pd.DataFrame, how=SPLIT_TYPE, test_ratio=0.2, shuffle=True):\n", + "\n", + " if how == SPLIT_TYPE.INTRA_USER:\n", + " \n", + " # There are certain users with only one observation. 
What do we do with those?\n", + " # As per the mobilitynet modeling pipeline, we randomly assign them to either the\n", + " # training or test set.\n", + " \n", + " value_counts = data.user_id.value_counts()\n", + " single_count_ids = value_counts[value_counts == 1].index\n", + " \n", + " data_filtered = data.loc[~data.user_id.isin(single_count_ids), :].reset_index(drop=True)\n", + " data_single_counts = data.loc[data.user_id.isin(single_count_ids), :].reset_index(drop=True)\n", + " \n", + " X_tr, X_te = train_test_split(\n", + " data_filtered, test_size=test_ratio, shuffle=shuffle, stratify=data_filtered.user_id,\n", + " random_state=SEED\n", + " )\n", + " \n", + " data_single_counts['assigned'] = np.random.choice(['train', 'test'], len(data_single_counts))\n", + " X_tr_merged = pd.concat(\n", + " [X_tr, data_single_counts.loc[data_single_counts.assigned == 'train', :].drop(\n", + " columns=['assigned'], inplace=False\n", + " )],\n", + " ignore_index=True, axis=0\n", + " )\n", + " \n", + " X_te_merged = pd.concat(\n", + " [X_te, data_single_counts.loc[data_single_counts.assigned == 'test', :].drop(\n", + " columns=['assigned'], inplace=False\n", + " )],\n", + " ignore_index=True, axis=0\n", + " )\n", + " \n", + " return X_tr_merged, X_te_merged\n", + " \n", + " elif how == SPLIT_TYPE.TARGET:\n", + " \n", + " X_tr, X_te = train_test_split(\n", + " data, test_size=test_ratio, shuffle=shuffle, stratify=data.target,\n", + " random_state=SEED\n", + " )\n", + " \n", + " return X_tr, X_te\n", + " \n", + " elif how == SPLIT_TYPE.MODE:\n", + " \n", + " X_tr, X_te = train_test_split(\n", + " data, test_size=test_ratio, shuffle=shuffle, stratify=data.section_mode_argmax,\n", + " random_state=SEED\n", + " )\n", + " \n", + " return X_tr, X_te\n", + " \n", + " raise NotImplementedError(\"Unknown split type\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modeling" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The 
following are common features across all datasets:\n", + "\n", + "```\n", + "{'age_21___25_years_old', 'cost_unknown', 'start_local_dt_hour', 'av_walk', 'distance', 'duration', 'av_unknown', 'ft_job', 'end_local_dt_hour', 'cost_no_trip', 'cost_s_micro', 'mph', 'n_residents_u18', 'is_paid', 'n_motor_vehicles', 'target', 'n_working_residents', 'section_distance_argmax', 'n_residence_members', 'has_medical_condition', 'primary_job_description_Other', 'cost_walk', 'cost_p_micro', 'av_transit', 'age_16___20_years_old', 'income_category', 'av_s_car', 'av_no_trip', 'cost_s_car', 'multiple_jobs', 'n_residents_with_license', 'section_duration_argmax', 'age_26___30_years_old', 'cost_car', 'av_p_micro', 'av_ridehail', 'av_car', 'cost_transit', 'available_modes', 'av_s_micro', 'has_drivers_license', 'cost_ridehail', 'user_id', 'section_mode_argmax', 'is_student'}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Read the data.\n", + "\n", + "DATA_SOURCES = [\n", + " ('../data/filtered_data/preprocessed_data_Stage_database.csv', 'allceo'),\n", + " ('../data/filtered_data/preprocessed_data_openpath_prod_uprm_nicr.csv', 'nicr'),\n", + " ('../data/filtered_data/preprocessed_data_openpath_prod_durham.csv', 'durham')\n", + "]\n", + "\n", + "DB_IX = 2\n", + "\n", + "PATH = DATA_SOURCES[DB_IX][0]\n", + "CURRENT_DB = DATA_SOURCES[DB_IX][1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv(PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.drop_duplicates(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(data.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def norm_data(df: pd.DataFrame, 
split: SPLIT, scaler=None):\n", + " \n", + " columns = df.columns.tolist()\n", + " \n", + " # Ignore dummy features (1/0).\n", + " ignore_cols = [\n", + " c for c in columns if 'age_' in c or 'av_' in c or 'gender_' in c \n", + " or 'primary_job_description' in c or 'is_' in c or 'highest_education' in c\n", + " or '_job' in c or 'has_' in c\n", + " ] + ['user_id', 'target', 'section_mode_argmax']\n", + " \n", + " data = df.loc[:, [c for c in df.columns if c not in ignore_cols]]\n", + " ignored = df.loc[:, ignore_cols]\n", + " \n", + " if split == SPLIT.TRAIN:\n", + " \n", + " scaler = StandardScaler()\n", + " \n", + " scaled = pd.DataFrame(\n", + " scaler.fit_transform(data), \n", + " columns=data.columns, \n", + " index=data.index\n", + " )\n", + " \n", + " elif split == SPLIT.TEST:\n", + " scaled = pd.DataFrame(\n", + " scaler.transform(data), \n", + " columns=data.columns, \n", + " index=data.index\n", + " )\n", + " \n", + " else:\n", + " raise NotImplementedError(\"Unknown split\")\n", + " \n", + " return pd.concat([scaled, ignored], axis=1), scaler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def drop_columns(df: pd.DataFrame):\n", + " \n", + " to_drop = [\n", + " 'available_modes'\n", + " ]\n", + " \n", + " for col in to_drop:\n", + " if col in df.columns:\n", + " df.drop(columns=[col], inplace=True)\n", + " \n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_duration_estimate(df: pd.DataFrame, dset: SPLIT, model_dict: dict):\n", + " \n", + " X_features = ['section_distance_argmax', 'mph']\n", + " \n", + " if dset == SPLIT.TRAIN and model_dict is None:\n", + " model_dict = dict()\n", + " \n", + " if dset == SPLIT.TEST and model_dict is None:\n", + " raise AttributeError(\"Expected model dict for testing.\")\n", + " \n", + " if dset == SPLIT.TRAIN:\n", + " for section_mode in 
df.section_mode_argmax.unique():\n", + " section_data = df.loc[df.section_mode_argmax == section_mode, :]\n", + " if section_mode not in model_dict:\n", + " model_dict[section_mode] = dict()\n", + "\n", + " model = LinearRegression(fit_intercept=True)\n", + "\n", + " X = section_data[\n", + " X_features\n", + " ]\n", + " Y = section_data[['section_duration_argmax']]\n", + "\n", + " model.fit(X, Y.values.ravel())\n", + "\n", + " r2 = r2_score(y_pred=model.predict(X), y_true=Y.values.ravel())\n", + " print(f\"Train R2 for {section_mode}: {r2}\")\n", + "\n", + " model_dict[section_mode]['model'] = model\n", + " \n", + " elif dset == SPLIT.TEST:\n", + " for section_mode in df.section_mode_argmax.unique():\n", + " \n", + " section_data = df.loc[df.section_mode_argmax == section_mode, :]\n", + " \n", + " X = section_data[\n", + " X_features\n", + " ]\n", + " Y = section_data[['section_duration_argmax']]\n", + " \n", + " if section_mode not in model_dict:\n", + " y_pred = [np.nan for _ in range(len(X))]\n", + " else:\n", + " y_pred = model_dict[section_mode]['model'].predict(X)\n", + " \n", + " r2 = r2_score(y_pred=y_pred, y_true=Y.values.ravel())\n", + " print(f\"Test R2 for {section_mode}: {r2}\")\n", + " \n", + " # Create the new columns for the duration.\n", + " new_columns = ['p_micro','no_trip','s_car','transit','car','s_micro','ridehail','walk','unknown']\n", + " df[new_columns] = 0\n", + " df['temp'] = 0\n", + " \n", + " for section in df.section_mode_argmax.unique():\n", + " X_section = df.loc[df.section_mode_argmax == section, X_features]\n", + " \n", + " # broadcast to all columns.\n", + " df.loc[df.section_mode_argmax == section, 'temp'] = model_dict[section]['model'].predict(X_section)\n", + " \n", + " for c in new_columns:\n", + " df[c] = df['av_' + c] * df['temp']\n", + " \n", + " df.drop(columns=['temp'], inplace=True)\n", + " \n", + " df.rename(columns=dict([(x, 'tt_'+x) for x in new_columns]), inplace=True)\n", + " \n", + " # return model_dict, 
result_df\n", + " return model_dict, df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Now, we drop columns, split the data, and normalize\n", + "\n", + "data = drop_columns(data)\n", + "\n", + "train_data, test_data = get_train_test_splits(data=data, how=SPLIT_TYPE.INTRA_USER, shuffle=True)\n", + "\n", + "train_data, scaler = norm_data(train_data, split=SPLIT.TRAIN)\n", + "test_data, _ = norm_data(test_data, SPLIT.TEST, scaler)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "USERS = list(data.user_id.unique())\n", + "\n", + "USER_MAP = {\n", + " u: i+1 for (i, u) in enumerate(USERS)\n", + "}\n", + "\n", + "train_data['user_id'] = train_data['user_id'].apply(lambda x: USER_MAP[x])\n", + "test_data['user_id'] = test_data['user_id'].apply(lambda x: USER_MAP[x])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(7, 7))\n", + "train_data.target.hist(ax=ax[0])\n", + "test_data.target.hist(ax=ax[1])\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "params, train_data = get_duration_estimate(train_data, SPLIT.TRAIN, None)\n", + "print(10 * \"-\")\n", + "_, test_data = get_duration_estimate(test_data, SPLIT.TEST, params)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop section_mode\n", + "\n", + "train_data.drop(columns=['section_mode_argmax'], inplace=True)\n", + "# test_data.drop(columns=['section_mode_argmax'], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data.shape, test_data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "print(train_data.columns.tolist())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Some helper functions that will help ease redundancy in the code.\n", + "\n", + "def get_database(df: pd.DataFrame, split: SPLIT):\n", + " return db.Database(split.name + '_db', df)\n", + "\n", + "\n", + "def get_variables(database: db.Database):\n", + " \n", + " columns = database.data\n", + " \n", + " # User-level features.\n", + " START_HOUR = Variable('start_local_dt_hour')\n", + " END_HOUR = Variable('end_local_dt_hour')\n", + " TRIP_DISTANCE = Variable('distance')\n", + " INCOME = Variable('income_category')\n", + " N_MEMBERS = Variable('n_residence_members')\n", + " N_U18 = Variable('n_residents_u18')\n", + " N_LICENSE = Variable('n_residents_with_license')\n", + " N_VEHICLES = Variable('n_motor_vehicles')\n", + " LICENSE = Variable('has_drivers_license')\n", + " CONDITION = Variable('has_medical_condition')\n", + " FT_JOB = Variable('ft_job')\n", + " MULTIPLE_JOBS = Variable('multiple_jobs')\n", + " \n", + " # Sections\n", + " DISTANCE_ARGMAX = Variable('section_distance_argmax')\n", + " TT_ARGMAX = Variable('section_duration_argmax')\n", + " MPH = Variable('mph')\n", + " \n", + " # Costs\n", + " COST_P_MICRO = Variable('cost_p_micro')\n", + " COST_NO_TRIP = Variable('cost_no_trip')\n", + " COST_S_CAR = Variable('cost_s_car')\n", + " COST_CAR = Variable('cost_car')\n", + " COST_S_MICRO = Variable('cost_s_micro')\n", + " COST_RIDEHAIL = Variable('cost_ridehail')\n", + " COST_WALK = Variable('cost_walk')\n", + " COST_UNKNOWN = Variable('cost_unknown')\n", + " COST_TRANSIT = Variable('cost_transit')\n", + "\n", + " # Availability.\n", + " AV_P_MICRO = Variable('av_p_micro')\n", + " AV_NO_TRIP = Variable('av_no_trip')\n", + " AV_S_CAR = Variable('av_s_car')\n", + " AV_TRANSIT = Variable('av_transit')\n", + " AV_CAR = Variable('av_car')\n", + " AV_S_MICRO = 
Variable('av_s_micro')\n", + " AV_RIDEHAIL = Variable('av_ridehail')\n", + " AV_WALK = Variable('av_walk')\n", + " AV_UNKNOWN = Variable('av_unknown')\n", + " \n", + " # OHE\n", + " G = [Variable(x) for x in columns if 'gender_' in x]\n", + " E = [Variable(x) for x in columns if 'highest_education' in x]\n", + " PJ = [Variable(x) for x in columns if 'primary_job_description' in x]\n", + " \n", + " # Times.\n", + " TT_P_MICRO = Variable('tt_p_micro')\n", + " TT_NO_TRIP = Variable('tt_no_trip')\n", + " TT_S_CAR = Variable('tt_s_car')\n", + " TT_TRANSIT = Variable('tt_transit')\n", + " TT_CAR = Variable('tt_car')\n", + " TT_S_MICRO = Variable('tt_s_micro')\n", + " TT_RIDEHAIL = Variable('tt_ridehail')\n", + " TT_WALK = Variable('tt_walk')\n", + " TT_UNKNOWN = Variable('tt_unknown')\n", + " \n", + " # Choice.\n", + " CHOICE = Variable('target')\n", + " \n", + " return_dict = locals().copy()\n", + " \n", + " # Remove the gender list and place them in the locals dict.\n", + " for i, val in enumerate(G):\n", + " return_dict.update({'G_' + str(i): val})\n", + " \n", + " del return_dict['G']\n", + " \n", + " \n", + " ## Education\n", + " for i, val in enumerate(E):\n", + " return_dict.update({'E_' + str(i): val})\n", + " \n", + " del return_dict['E']\n", + " \n", + " ## Job\n", + " for i, val in enumerate(PJ):\n", + " return_dict.update({'PJ_' + str(i): val})\n", + " \n", + " del return_dict['PJ']\n", + " \n", + " # return the filtered locals() dictionary.\n", + " return {k:v for k,v in return_dict.items() if not k.startswith('_') and k not in ['database', 'columns']}\n", + "\n", + "\n", + "# def exclude_from_db(v_dict: dict, db: db.Database):\n", + "# EXCLUDE = (v_dict['CHOICE'] == 2) + (v_dict['CHOICE'] == 9) > 0\n", + "# db.remove(EXCLUDE)\n", + "\n", + "def get_params(variables):\n", + " \n", + " param_dict = {'B_' + k: Beta('B_' + k, 0, None, None, 0) for k in variables.keys()}\n", + " \n", + " param_dict['ASC_P_MICRO'] = Beta('ASC_P_MICRO', 0, None, None, 0)\n", + " 
param_dict['ASC_NO_TRIP'] = Beta('ASC_P_MICRO', 0, None, None, 0)\n", + " param_dict['ASC_S_CAR'] = Beta('ASC_P_MICRO', 0, None, None, 0)\n", + " param_dict['ASC_TRANSIT'] = Beta('ASC_P_MICRO', 0, None, None, 0)\n", + " param_dict['ASC_CAR'] = Beta('ASC_P_MICRO', 0, None, None, 0)\n", + " param_dict['ASC_S_MICRO'] = Beta('ASC_P_MICRO', 0, None, None, 0)\n", + " param_dict['ASC_RIDEHAIL'] = Beta('ASC_P_MICRO', 0, None, None, 0)\n", + " param_dict['ASC_WALK'] = Beta('ASC_P_MICRO', 0, None, None, 0)\n", + " param_dict['ASC_UNKNOWN'] = Beta('ASC_P_MICRO', 0, None, None, 0)\n", + " \n", + " # Return the parameter dict. NOTE(review): every ASC_* entry is created as Beta('ASC_P_MICRO', ...), so they all share one parameter name; confirm whether each alternative should have its own name.\n", + " return param_dict\n", + "\n", + "\n", + "def get_utility_functions(v: dict):\n", + " \n", + " ## User-level utility.\n", + " user = 1.\n", + " for var in [\n", + " 'INCOME', 'N_MEMBERS', \n", + " 'N_U18', 'N_LICENSE', 'N_VEHICLES', 'LICENSE', 'CONDITION', 'FT_JOB', 'MULTIPLE_JOBS'\n", + " ]:\n", + " user += v[var] * v['B_'+var]\n", + " \n", + " # OHE (One-hot encoded utility.)\n", + " ohe = 1.\n", + " ohe_vars = [var for var in v if ('G_' in var or 'E_' in var or 'PJ_' in var) and 'B_' not in var]\n", + " for var in ohe_vars:\n", + " ohe += v[var] * v['B_'+var]\n", + " \n", + " ## Trip utility.\n", + " trip = 1.\n", + " for var in ['MPH', 'DISTANCE_ARGMAX', 'TT_ARGMAX', 'START_HOUR', 'END_HOUR', 'TRIP_DISTANCE']:\n", + " trip += v[var] * v['B_' + var]\n", + " \n", + " \n", + " V_P_MICRO = v['ASC_P_MICRO'] + \\\n", + " ohe + user + trip + \\\n", + " v['TT_P_MICRO'] * v['B_TT_P_MICRO'] + \\\n", + " v['COST_P_MICRO'] * v['B_COST_P_MICRO']\n", + " \n", + " V_S_MICRO = v['ASC_S_MICRO'] + \\\n", + " ohe + user + trip + \\\n", + " v['TT_S_MICRO'] * v['B_TT_S_MICRO'] + \\\n", + " v['COST_S_MICRO'] * v['B_COST_S_MICRO']\n", + " \n", + " V_S_CAR = v['ASC_S_CAR'] + \\\n", + " ohe + user + trip + \\\n", + " v['TT_S_CAR'] * v['B_TT_S_CAR'] + \\\n", + " v['COST_S_CAR'] * v['B_COST_S_CAR']\n", + " \n", + " V_CAR = v['ASC_CAR'] + \\\n", + " ohe + user + trip + 
\\\n", + " v['TT_CAR'] * v['B_TT_CAR'] + \\\n", + " v['COST_CAR'] * v['B_COST_CAR']\n", + " \n", + " V_TRANSIT = v['ASC_TRANSIT'] + \\\n", + " ohe + user + trip + \\\n", + " v['TT_TRANSIT'] * v['B_TT_TRANSIT'] + \\\n", + " v['COST_TRANSIT'] * v['B_COST_TRANSIT']\n", + " \n", + " V_WALK = v['ASC_WALK'] + \\\n", + " ohe + user + trip + \\\n", + " v['TT_WALK'] * v['B_TT_WALK'] + \\\n", + " v['COST_WALK'] * v['B_COST_WALK']\n", + " \n", + " V_RIDEHAIL = v['ASC_RIDEHAIL'] + \\\n", + " ohe + user + trip + \\\n", + " v['TT_RIDEHAIL'] * v['B_TT_RIDEHAIL'] + \\\n", + " v['COST_RIDEHAIL'] * v['B_COST_RIDEHAIL']\n", + " \n", + " V_NO_TRIP = -100\n", + " V_UNKNOWN = -100\n", + " \n", + " # Remember to exclude the input argument.\n", + " return {k:v for k,v in locals().items() if not k.startswith('_') and k != 'v'}\n", + "\n", + "\n", + "def get_utility_mapping(var: dict):\n", + " # Map each alternative to its utility function.\n", + " return {\n", + " 1: var['V_P_MICRO'], \n", + " 2: var['V_NO_TRIP'],\n", + " 3: var['V_S_CAR'], \n", + " 4: var['V_TRANSIT'],\n", + " 5: var['V_CAR'], \n", + " 6: var['V_S_MICRO'],\n", + " 7: var['V_RIDEHAIL'], \n", + " 8: var['V_WALK'], \n", + " 9: var['V_UNKNOWN']\n", + " }\n", + "\n", + "\n", + "def get_availability_mapping(var: dict):\n", + " return {\n", + " 1: var['AV_P_MICRO'],\n", + " 2: var['AV_NO_TRIP'],\n", + " 3: var['AV_S_CAR'],\n", + " 4: var['AV_TRANSIT'],\n", + " 5: var['AV_CAR'],\n", + " 6: var['AV_S_MICRO'],\n", + " 7: var['AV_RIDEHAIL'],\n", + " 8: var['AV_WALK'],\n", + " 9: var['AV_UNKNOWN']\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # First, drop columns.\n", + "\n", + "# train_data = drop_columns(train_data)\n", + "\n", + "# train_data, scaler = norm_data(train_data, split=SPLIT.TRAIN)\n", + "\n", + "# get dbs.\n", + "train_db = get_database(train_data, SPLIT.TRAIN)\n", + "\n", + "# get vars.\n", + "train_vars = get_variables(train_db)" + ] + }, + { +
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_vars" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_params = get_params(train_vars)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_params" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_vars.update(train_params)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_V = get_utility_functions(train_vars)\n", + "train_vars.update(train_V)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "V = get_utility_mapping(train_vars)\n", + "av = get_availability_mapping(train_vars)\n", + "logprob = models.loglogit(V, av, train_vars['CHOICE'])\n", + "\n", + "# logit1 = models.logit(V, av, 1)\n", + "# logit2 = models.logit(V, av, 2)\n", + "# logit3 = models.logit(V, av, 3)\n", + "# logit4 = models.logit(V, av, 4)\n", + "# logit5 = models.logit(V, av, 5)\n", + "# logit6 = models.logit(V, av, 6)\n", + "# logit7 = models.logit(V, av, 7)\n", + "# logit8 = models.logit(V, av, 8)\n", + "# logit9 = models.logit(V, av, 9)\n", + "\n", + "# models = {f'logit_{ix}': logit for ix, logit in enumerate(\n", + "# [logit1, logit2, logit3, logit4, logit5, logit6, logit7, logit8, logit9]\n", + "# )}\n", + "\n", + "model = bio.BIOGEME(train_db, logprob)\n", + "model.modelName = 'customUtility-new'\n", + "model.generate_html = False\n", + "model.generate_pickle = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_results = model.estimate()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
def simulate_results(V, av, db, beta_dict):
    """Simulate the choice probability of every target alternative on ``db``.

    Args:
        V: Mapping from integer alternative ID to utility expression; it is
            indexed with IDs 3-7 here and handed to ``models.logit``.
        av: Availability mapping handed to ``models.logit`` together with ``V``.
        db: Database object passed to ``bio.BIOGEME``.
        beta_dict: Parameter values forwarded to ``simulate`` as
            ``theBetaValues``.

    Returns:
        The result of ``BIOGEME.simulate`` for the ``'Prob. <target>'``
        expressions (presumably one column per label - confirm against the
        Biogeme version in use).
    """
    # (alternative ID, output label, travel-time variable, cost variable)
    wtp_spec = (
        (3, 'WTP s_car', 'tt_s_car', 'scaled_cost_s_car'),
        (4, 'WTP transit', 'tt_transit', 'scaled_cost_transit'),
        (5, 'WTP car', 'tt_car', 'scaled_cost_car'),
        (6, 'WTP s_micro', 'tt_s_micro', 'scaled_cost_s_micro'),
        (7, 'WTP ridehail', 'tt_ridehail', 'scaled_cost_ridehail'),
    )

    # Constructed but currently not simulated: the update below is disabled.
    wtp = {
        label: Derive(V[alt_id], tt_name) / Derive(V[alt_id], cost_name)
        for alt_id, label, tt_name, cost_name in wtp_spec
    }

    # Alternative IDs are 1-based and follow the iteration order of TARGETS.
    labels = ['Prob. ' + target for target in TARGETS]
    simulate = {
        label: models.logit(V, av, alt_id)
        for alt_id, label in enumerate(labels, start=1)
    }

    # simulate.update(wtp)

    biosim = bio.BIOGEME(db, simulate)
    biosim.modelName = 'test-3'

    return biosim.simulate(theBetaValues=beta_dict)
"metadata": {}, + "outputs": [], + "source": [ + "display(test_probs.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# argmax starts from 0. Offset all predicted indices by 1.\n", + "choices = np.argmax(test_probs.values, axis=1) + 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_true = test_data.chosen\n", + "score = f1_score(y_true, choices, average='weighted')\n", + "\n", + "print(score)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots()\n", + "counts = pd.Series(choices).value_counts()\n", + "ix = counts.index.tolist()\n", + "_x = [i+1 for i in range(len(TARGETS))]\n", + "height = [0 if i not in ix else counts[i] for i in _x]\n", + "ax.bar(x=_x, height=height)\n", + "ax.set_xticks(range(1, 10, 1))\n", + "ax.set_xticklabels(TARGETS, rotation=45)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import ConfusionMatrixDisplay\n", + "\n", + "fig, ax = plt.subplots()\n", + "cm = ConfusionMatrixDisplay.from_predictions(y_true=y_true, y_pred=choices, ax=ax)\n", + "\n", + "y_unique = np.unique(y_true)\n", + "labelset = [t for i, t in enumerate(TARGETS) if (i+1) in y_unique]\n", + "\n", + "ax.set_xticklabels(labelset, rotation=45)\n", + "ax.set_yticklabels(labelset)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# np.diag(cm.confusion_matrix)/np.sum(cm.confusion_matrix, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# u_np = test_utilities.values\n", + "# choice_df = np.exp(u_np)/np.sum(np.exp(u_np), axis=1, keepdims=True)\n", + "\n", + "# 
choice_df = pd.DataFrame(choice_df, columns=test_utilities.columns)\n", + "# display(choice_df.head())" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "ab0c6e94c9422d07d42069ec9e3bb23090f5e156fc0e23cc25ca45a62375bf53" + }, + "kernelspec": { + "display_name": "emission", + "language": "python", + "name": "emission" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 9ee6fd40f45ec0b33ba96bccbb65b82a483b3ee2 Mon Sep 17 00:00:00 2001 From: Rahul Kulhalli Date: Wed, 1 May 2024 16:08:10 -0400 Subject: [PATCH 6/6] Adding all other experimental notebooks --- .../experimental_notebooks/LSTM.ipynb | 1398 +++++++++++++++++ .../experimental_notebooks/README.md | 3 + .../baseline_modeling0.ipynb | 1011 ++++++++++++ ...biogeme_modeling train_test_w_splits.ipynb | 1107 +++++++++++++ .../optimal_interuser_splits.ipynb | 617 ++++++++ .../rf_bayesian_optim.py | 280 ++++ 6 files changed, 4416 insertions(+) create mode 100644 replacement_mode_modeling/experimental_notebooks/LSTM.ipynb create mode 100644 replacement_mode_modeling/experimental_notebooks/README.md create mode 100644 replacement_mode_modeling/experimental_notebooks/baseline_modeling0.ipynb create mode 100644 replacement_mode_modeling/experimental_notebooks/biogeme_modeling train_test_w_splits.ipynb create mode 100644 replacement_mode_modeling/experimental_notebooks/optimal_interuser_splits.ipynb create mode 100644 replacement_mode_modeling/experimental_notebooks/rf_bayesian_optim.py diff --git a/replacement_mode_modeling/experimental_notebooks/LSTM.ipynb b/replacement_mode_modeling/experimental_notebooks/LSTM.ipynb new file mode 100644 index 0000000..80260d7 --- /dev/null +++ b/replacement_mode_modeling/experimental_notebooks/LSTM.ipynb @@ -0,0 +1,1398 @@ 
# Collapse the raw replaced-mode survey labels into the nine modeling classes
# enumerated in TARGETS.
TARGET_MAPPING = {
    'No Travel': 'no_trip',
    'Free Shuttle': 'transit',
    'Other': 'unknown',
    'Gas Car, drove alone': 'car',
    'Regular Bike': 'p_micro',
    'Walk': 'walk',
    # A car shared with others is the shared-car class. Mapping it to
    # 's_micro' left the 's_car' class without any label feeding it.
    'Gas Car, with others': 's_car',
    'Bus': 'transit',
    'E-bike': 'p_micro',
    'Scooter share': 's_micro',
    'Taxi/Uber/Lyft': 'ridehail',
    'Train': 'transit',
    'Bikeshare': 's_micro',
    'Skate board': 'p_micro',
    'Not a Trip': 'no_trip'
}
class SectionScaler:
    """Standardise section distances and durations separately for each mode.

    ``compute_stats`` learns one ``[mean, std]`` pair per section mode from a
    frame (typically the training split); ``apply`` then z-scores every
    section of a frame with the statistics of that section's mode.
    """

    def __init__(self):
        # mode -> [mean, std] of the section durations / distances observed
        # by compute_stats.
        self.dur = dict()
        self.dist = dict()

    @staticmethod
    def _mean_and_std(values):
        """Return ``[mean, std]`` of ``values``, ignoring NaNs.

        A zero or non-finite spread (single observation, constant values,
        all-NaN input) is replaced by 1.0 so that scaling never divides by
        zero; such values are then only centred.
        """
        mean = np.nanmean(values)
        std = np.nanstd(values)
        if not np.isfinite(std) or std == 0:
            std = 1.0
        return [mean, std]

    def compute_stats(self, df):
        """Compute the per-mode statistics from ``df``.

        Replaces any statistics computed by an earlier call.

        Args:
            df: Frame with the columns ``section_modes``,
                ``section_distances`` and ``section_durations``; each cell
                holds one sequence per trip, aligned by position.
        """
        durations = dict()
        distances = dict()

        trips = zip(df['section_modes'], df['section_distances'], df['section_durations'])
        for modes, trip_distances, trip_durations in trips:
            for mode, distance, duration in zip(modes, trip_distances, trip_durations):
                durations.setdefault(mode, []).append(duration)
                distances.setdefault(mode, []).append(distance)

        self.dur = {mode: self._mean_and_std(vals) for mode, vals in durations.items()}
        self.dist = {mode: self._mean_and_std(vals) for mode, vals in distances.items()}

    def apply(self, df):
        """Return a new frame whose section distances/durations are z-scored.

        Args:
            df: Frame with the same three section columns used by
                ``compute_stats``; all other columns are carried over.

        Returns:
            A new ``pd.DataFrame`` built from the transformed rows (the
            original index is not preserved).

        Raises:
            KeyError: If a row contains a mode that ``compute_stats`` never saw.
        """
        rows = list()

        for _, x in df.iterrows():
            row = x.to_dict()
            modes = row['section_modes']
            distances = row['section_distances']
            durations = row['section_durations']

            row['section_distances'] = [
                (distances[i] - self.dist[mode][0]) / self.dist[mode][1]
                for i, mode in enumerate(modes)
            ]
            row['section_durations'] = [
                (durations[i] - self.dur[mode][0]) / self.dur[mode][1]
                for i, mode in enumerate(modes)
            ]

            rows.append(row)

        return pd.DataFrame(data=rows)
def get_train_test_splits(data: pd.DataFrame, how=SPLIT_TYPE, test_ratio=0.2, shuffle=True):
    """Split ``data`` into a training frame and a test frame.

    Args:
        data: Trip-level frame. Must contain ``user_id``; the ``TARGET``
            strategy also needs ``target`` and the ``MODE`` strategy needs
            ``section_mode_argmax``.
        how: Member of ``SPLIT_TYPE`` selecting the strategy:

            * ``INTER_USER`` - the users picked by ``get_splits`` form the
              test set and every other user goes to training, so no user
              appears in both frames.
            * ``INTRA_USER`` - split stratified by ``user_id``; users with a
              single row are assigned to either side at random.
            * ``TARGET`` - split stratified by ``target``.
            * ``MODE`` - split stratified by ``section_mode_argmax``.
            * ``INTER_USER_STATIC`` - a fixed list of user ids forms the
              training set; all remaining users form the test set.

            The default is the enum class itself, which matches no member,
            so a call that omits ``how`` raises ``NotImplementedError``.
        test_ratio: Target fraction of rows in the test frame. Not used by
            ``INTER_USER_STATIC``.
        shuffle: Shuffle the rows (seeded with ``SEED``) before splitting;
            also forwarded to ``train_test_split``.

    Returns:
        Tuple ``(train, test)`` of data frames.

    Raises:
        NotImplementedError: If ``how`` is not a known ``SPLIT_TYPE`` member.
    """
    if shuffle:
        data = data.sample(data.shape[0], random_state=SEED).reset_index(drop=True, inplace=False)

    if how == SPLIT_TYPE.INTER_USER:
        # Make the split, ensuring that a user in one fold is not leaked into the other fold.
        # get_splits reads the columns 'index' (user id) and 'count'; naming
        # the axis explicitly yields those columns on every pandas version.
        counts = data.user_id.value_counts().rename_axis('index').reset_index(name='count')

        test_ids = get_splits(counts, data.shape[0], test_size=test_ratio)

        # Partition on the user id itself. The previous code filtered the
        # training rows with user_id.isin(<row index labels>), which compares
        # user ids against row labels and does not select the complement.
        is_test = data.user_id.isin(test_ids)
        train_data = data.loc[~is_test, :]
        test_data = data.loc[is_test, :]

        return train_data, test_data

    elif how == SPLIT_TYPE.INTRA_USER:

        # There are certain users with only one observation. What do we do with those?
        # As per the mobilitynet modeling pipeline, we randomly assign them to either the
        # training or test set.

        value_counts = data.user_id.value_counts()
        single_count_ids = value_counts[value_counts == 1].index

        data_filtered = data.loc[~data.user_id.isin(single_count_ids), :].reset_index(drop=True)
        data_single_counts = data.loc[data.user_id.isin(single_count_ids), :].reset_index(drop=True)

        X_tr, X_te = train_test_split(
            data_filtered, test_size=test_ratio, shuffle=shuffle, stratify=data_filtered.user_id,
            random_state=SEED
        )

        data_single_counts['assigned'] = np.random.choice(['train', 'test'], len(data_single_counts))
        X_tr_merged = pd.concat(
            [X_tr, data_single_counts.loc[data_single_counts.assigned == 'train', :].drop(
                columns=['assigned'], inplace=False
            )],
            ignore_index=True, axis=0
        )

        X_te_merged = pd.concat(
            [X_te, data_single_counts.loc[data_single_counts.assigned == 'test', :].drop(
                columns=['assigned'], inplace=False
            )],
            ignore_index=True, axis=0
        )

        return X_tr_merged, X_te_merged

    elif how == SPLIT_TYPE.TARGET:

        X_tr, X_te = train_test_split(
            data, test_size=test_ratio, shuffle=shuffle, stratify=data.target,
            random_state=SEED
        )

        return X_tr, X_te

    elif how == SPLIT_TYPE.MODE:
        X_tr, X_te = train_test_split(
            data, test_size=test_ratio, shuffle=shuffle, stratify=data.section_mode_argmax,
            random_state=SEED
        )

        return X_tr, X_te

    elif how == SPLIT_TYPE.INTER_USER_STATIC:

        # Fixed training users; every user not listed here ends up in the test set.
        train_ids = [
            '810be63d084746e3b7da9d943dd88e8c', 'bf774cbe6c3040b0a022278d36a23f19', '8a8332a53a1b4cdd9f3680434e91a6ef',
            '5ad862e79a6341f69f28c0096fe884da', '7f89656bd4a94d12ad8e5ad9f0afecaf', 'fbaa338d7cd7457c8cad4d0e60a44d18',
            '3b25446778824941a4c70ae5774f4c68', '28cb1dde85514bbabfd42145bdaf7e0a', '3aeb5494088542fdaf798532951aebb0',
            '531732fee3c24366a286d76eb534aebc', '950f4287bab5444aa0527cc23fb082b2', '737ef8494f26407b8b2a6b1b1dc631a4',
            'e06cf95717f448ecb81c440b1b2fe1ab', '7347df5e0ac94a109790b31ba2e8a02a', 'bd9cffc8dbf1402da479f9f148ec9e60',
            '2f3b66a5f98546d4b7691fba57fa640f', 'f289f7001bd94db0b33a7d2e1cd28b19', '19a043d1f2414dbcafcca44ea2bd1f19',
            '68788082836e4762b26ad0877643fdcf', '4e8b1b7f026c4384827f157225da13fa', '703a9cee8315441faff7eb63f2bfa93f',
            'add706b73839413da13344c355dde0bb', '47b5d57bd4354276bb6d2dcd1438901d', 'e4cfb2a8f600426897569985e234636e',
            '0154d71439284c34b865e5a417cd48af', '234f4f2366244fe682dccded2fa7cc4e', '0d0ae3a556414d138c52a6040a203d24',
            '44c10f66dec244d6b8644231d4a8fecb', '30e9b141d7894fbfaacecd2fa18929f9', '0eb313ab00e6469da78cc2d2e94660fb',
            'fc51d1258e4649ecbfb0e6ecdaeca454', 'a1954793b1454b2f8cf95917d7547169', '6656c04c6cba4c189fed805eaa529741',
            '6a0f3653b80a4c949e127d6504debb55', 'dfe5ca1bb0854b67a6ffccad9565d669', '8b1f3ba43de945bea79de6a81716ad04',
            'cde34edb8e3a4278a18e0adb062999e5', '6d96909e5ca442ccb5679d9cdf3c8f5b', 'a60a64d82d1c439a901b683b73a74d73',
            '60e6a6f6ed2e4e838f2bbed6a427028d', '88041eddad7542ea8c92b30e5c64e198', '1635c003b1f94a399ebebe21640ffced',
            '1581993b404a4b9c9ca6b0e0b8212316', 'b1aed24c863949bfbfa3a844ecf60593', '4b89612d7f1f4b368635c2bc48bd7993',
            'eb2e2a5211564a9290fcb06032f9b4af', '26767f9f3da54e93b692f8be6acdac43', '8a98e383a2d143e798fc23869694934a',
            'b346b83b9f7c4536b809d5f92074fdae', 'd929e7f8b7624d76bdb0ec9ada6cc650', '863e9c6c8ec048c4b7653f73d839c85b',
            'f50537eb104e4213908f1862c8160a3e', '4a9db5a9bac046a59403b44b883cc0ba', 'cded005d5fd14c64a5bba3f5c4fe8385',
            'c7ce889c796f4e2a8859fa2d7d5068fe', '405b221abe9e43bc86a57ca7fccf2227', '0b3e78fa91d84aa6a3203440143c8c16',
            'fbff5e08b7f24a94ab4b2d7371999ef7', 'e35e65107a34496db49fa5a0b41a1e9e', 'd5137ebd4f034dc193d216128bb7fc9a',
            '3f7f2e536ba9481e92f8379b796ad1d0', 'dc75e0b776214e1b9888f6abd042fd95', 'b41dd7d7c6d94fe6afe2fd26fa4ac0bd',
            'eec6936e1ac347ef9365881845ec74df', '8c7d261fe8284a42a777ffa6f380ba3b', '4baf8c8af7b7445e9067854065e3e612',
            'c6e4db31c18b4355b02a7dd97deca70b', 'f0db3b1999c2410ba5933103eca9212f', '487e20ab774742378198f94f5b5b0b43',
            'dc1ed4d71e3645d0993885398d5628ca', '8c3c63abb3ec4fc3a61e7bf316ee4efd', '15eb78dd6e104966ba6112589c29dc41',
            'c23768ccb817416eaf08be487b2e3643', 'ecd2ae17d5184807abd87a287115c299', '71f21d53b655463784f3a3c63c56707b',
            '2931e0a34319495bbb5898201a54feb5', '92bde0d0662f45ac864629f486cffe77', '42b3ee0bc02a481ab1a94644a8cd7a0d',
            '15aa4ba144a34b8b8079ed7e049d84df', '509b909390934e988eb120b58ed9bd8c', '14103cda12c94642974129989d39e50d',
            '8b0876430c2641bcaea954ea00520e64', 'baa4ff1573ae411183e10aeb17c71c53', '14fe8002bbdc4f97acbd1a00de241bf6',
            '1b7d6dfea8464bcab9321018b10ec9c9', '487ad897ba93404a8cbe5de7d1922691', '5182d93d69754d7ba06200cd1ac5980a',
            '91f3ca1c278247f79a806e49e9cc236f', 'e66e63b206784a559d977d4cb5f1ec34', '840297ae39484e26bfebe83ee30c5b3e',
            'c6807997194c4c528a8fa8c1f6ee1595', '802667b6371f45b29c7abb051244836a', 'b2bbe715b6a14fd19f751cae8adf6b4e',
            'feb1d940cd3647d1a101580c2a3b3f8c', '1b9883393ab344a69bc1a0fab192a94c', 'ac604b44fdca482fb753034cb55d1351',
            'f446bf3102ff4bd99ea1c98f7d2f7af0', 'c2c5d4b9a607487ea405a99c721079d4', '85ddd3c34c58407392953c47a32f5428',
            'd51de709f95045f8bacf473574b96ba5', '6373dfb8cb9b47e88e8f76adcfadde20', '313d003df34b4bd9823b3474fc93f9f9',
            '53e78583db87421f8decb529ba859ca4', '8fdc9b926a674a9ea07d91df2c5e06f2', '90480ac60a3d475a88fbdab0a003dd5d',
            '7559c3f880f341e898a402eba96a855d', '19a4c2cf718d40588eb96ac25a566353', 'f4427cccaa9442b48b42bedab5ab648e',
            'e192b8a00b6c422296851c93785deaf7', '355e25bdfc244c5e85d358e39432bd44', 'a0c3a7b410b24e18995f63369a31d123',
            '03a395b4d8614757bb8432b4984559b0', 'a2d48b05d5454d428c0841432c7467b6', '3d981e617b304afab0f21ce8aa6c9786',
            '2cd5668ac9054e2eb2c88bb4ed94bc6d', 'd7a732f4a8644bcbb8dedfc8be242fb2', '367eb90b929d4f6e9470d15c700d2e3f',
            'e049a7b2a6cb44259f907abbb44c5abc', 'a231added8674bef95092b32bc254ac8', 'e88a8f520dde445484c0a9395e1a0599',
            'cba570ae38f341faa6257342727377b7', '97953af1b97d4e268c52e1e54dcf421a', 'd200a61757d84b1dab8fbac35ff52c28',
            'fc68a5bb0a7b4b6386b3f08a69ead36f', '4a8210aec25e443391efb924cc0e5f23', '903742c353ce42c3ad9ab039fc418816',
            '2114e2a75304475fad06ad201948fbad', 'ac917eae407c4deb96625dd0dc2f2ba9', '3dddfb70e7cd40f18a63478654182e9a',
            'd3735ba212dd4c768e1675dca7bdcb6f', '7abe572148864412a33979592fa985fb', 'd3dff742d07942ca805c2f72e49e12c5'
        ]

        X_tr = data.loc[data.user_id.isin(train_ids), :]
        X_te = data.loc[~data.user_id.isin(train_ids), :]

        return X_tr, X_te

    raise NotImplementedError("Unknown split type")
'start:month',\n", + " 'start:day',\n", + " 'start:hour',\n", + " 'start_local_dt_minute',\n", + " 'start_local_dt_second',\n", + " 'start_local_dt_weekday',\n", + " 'start_local_dt_timezone',\n", + " 'end:year',\n", + " 'end:month',\n", + " 'end:day',\n", + " 'end:hour',\n", + " 'end_local_dt_minute',\n", + " 'end_local_dt_second',\n", + " 'end_local_dt_weekday',\n", + " 'end_local_dt_timezone',\n", + " '_id',\n", + " 'metadata_write_ts',\n", + " 'additions',\n", + " 'mode_confirm',\n", + " 'purpose_confirm',\n", + " 'distance_miles',\n", + " 'Mode_confirm',\n", + " 'Trip_purpose',\n", + " 'original_user_id',\n", + " 'program',\n", + " 'opcode',\n", + " 'Timestamp',\n", + " 'birth_year',\n", + " 'gender_Man',\n", + " 'gender_Man;Nonbinary/genderqueer/genderfluid',\n", + " 'gender_Nonbinary/genderqueer/genderfluid',\n", + " 'gender_Prefer not to say',\n", + " 'gender_Woman',\n", + " 'gender_Woman;Nonbinary/genderqueer/genderfluid',\n", + " 'has_multiple_jobs_No',\n", + " 'has_multiple_jobs_Prefer not to say',\n", + " 'has_multiple_jobs_Yes',\n", + " \"highest_education_Bachelor's degree\",\n", + " 'highest_education_Graduate degree or professional degree',\n", + " 'highest_education_High school graduate or GED',\n", + " 'highest_education_Less than a high school graduate',\n", + " 'highest_education_Prefer not to say',\n", + " 'highest_education_Some college or associates degree',\n", + " 'primary_job_type_Full-time',\n", + " 'primary_job_type_Part-time',\n", + " 'primary_job_type_Prefer not to say',\n", + " 'is_overnight_trip',\n", + " 'n_working_residents',\n", + " 'start_lat',\n", + " 'start_lng',\n", + " 'end_lat',\n", + " 'end_lng',\n", + " 'source', 'end_ts', 'end_fmt_time', 'end_loc',\n", + " ]\n", + "\n", + " # Drop section_mode_argmax and available_modes.\n", + " return df.drop(\n", + " columns=to_drop, \n", + " inplace=False\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "904fa4dc", + "metadata": {}, + "outputs": [], + 
class ReplacedModeDataset(Dataset):
    """Dataset whose items are the rows of a frame sharing one ``ix`` value.

    The wrapped frame must contain an ``ix`` grouping column, a ``target``
    column and every column named in the feature lists below.
    """

    # Trip-section level columns.
    _SEQ_FEATURES = ['section_distances', 'section_durations', 'section_modes', 'mph']

    # NOTE(review): the original comment calls these "static features that do
    # not vary with time"; the list also carries 'duration' and 'distance'.
    _DEMOGRAPHIC_FEATURES = [
        'n_residence_members',
        'primary_job_commute_time', 'income_category',
        'n_residents_u18', 'n_residents_with_license', 'n_motor_vehicles', 'age',
        'p_micro', 'walk', 's_micro', 'ridehail', 'car', 'transit', 's_car', 'no_trip', 'unknown',
        'has_drivers_license_No', 'has_drivers_license_Prefer not to say', 'has_drivers_license_Yes',
        'primary_job_description_Clerical or administrative support', 'primary_job_description_Custodial',
        'primary_job_description_Education', 'primary_job_description_Food service',
        'primary_job_description_Manufacturing, construction, maintenance, or farming',
        'primary_job_description_Medical/healthcare', 'primary_job_description_Other',
        'primary_job_description_Professional, managerial, or technical',
        'primary_job_description_Sales or service', 'primary_job_commute_mode_Active transport',
        'primary_job_commute_mode_Car transport', 'primary_job_commute_mode_Hybrid',
        'primary_job_commute_mode_Public transport', 'primary_job_commute_mode_Unknown',
        'primary_job_commute_mode_WFH', 'duration', 'distance'
    ]

    _WEATHER_FEATURES = [
        'temperature_2m (°F)',
        'relative_humidity_2m (%)', 'dew_point_2m (°F)', 'rain (inch)', 'snowfall (inch)',
        'wind_speed_10m (mp/h)', 'wind_gusts_10m (mp/h)'
    ]

    def __init__(self, df: pd.DataFrame):
        self.data = df

    def __len__(self):
        """Return the number of distinct ``ix`` values in the frame."""
        # Bracket access: attribute-style ``.ix`` is also the name of the
        # legacy DataFrame indexer, so it does not reliably mean the column.
        return len(self.data['ix'].unique())

    def __getitem__(self, ix):
        """Return the feature groups and targets of the rows whose ``ix`` matches.

        Args:
            ix: Value looked up in the ``ix`` column.

        Returns:
            Tuple ``(sequence features, demographic features, weather
            features, target)``; the first three are data frames and the last
            is a series, each restricted to the matching rows.
        """
        # NOTE(review): the original comment says a group "could be between
        # 1 - 15" rows.
        sequence = self.data.loc[self.data['ix'] == ix, :]

        return (
            sequence[self._SEQ_FEATURES], sequence[self._DEMOGRAPHIC_FEATURES],
            sequence[self._WEATHER_FEATURES], sequence['target']
        )
ReplacedModeDataset(train_df)\n", + "\n", + "print(dset.__getitem__(20))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02b78758", + "metadata": {}, + "outputs": [], + "source": [ + "train_dset = CustomDataset(train_df)\n", + "test_dset = CustomDataset(test_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "627b6fa4", + "metadata": {}, + "outputs": [], + "source": [ + "def collate(batch):\n", + " X, y = zip(*batch)\n", + " \n", + " seq_modes = [x[0] for x in X]\n", + " seq_metrics = [x[1] for x in X]\n", + " features = [x[-1] for x in X]\n", + "\n", + " padded_seq = pad_sequence([s for s in seq_modes], batch_first=True)\n", + " padded_metrics = pad_sequence([m for m in seq_metrics], batch_first=True)\n", + " lengths = [len(seq) for seq in seq_modes]\n", + " stacked_features = torch.stack(features)\n", + "\n", + " return (padded_seq, padded_metrics, stacked_features), torch.stack(y), lengths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ca34681", + "metadata": {}, + "outputs": [], + "source": [ + "train_loader = DataLoader(train_dset, batch_size=16, collate_fn=collate, shuffle=True, drop_last=False)\n", + "test_loader = DataLoader(test_dset, batch_size=16, collate_fn=collate, shuffle=True, drop_last=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31ca5ab1", + "metadata": {}, + "outputs": [], + "source": [ + "(modes, metrics, features), sY1, lX = next(iter(train_loader))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9eb5a93a", + "metadata": {}, + "outputs": [], + "source": [ + "metrics.size(), modes.size()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0abf380", + "metadata": {}, + "outputs": [], + "source": [ + "# Set to 0 for no dropout.\n", + "DROPOUT = 0." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48871ea4", + "metadata": {}, + "outputs": [], + "source": [ + "import math\n", + "\n", + "class GELU_new(nn.Module):\n", + " \"\"\"\n", + " Taken from OpenAI GPT-2 implementation.\n", + " \"\"\"\n", + " \n", + " def __init__(self):\n", + " super(GELU_new, self).__init__()\n", + " \n", + " def forward(self, x):\n", + " return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))\n", + "\n", + "\n", + "class DilatedBlock(nn.Module):\n", + " def __init__(self, n_c):\n", + " super(DilatedBlock, self).__init__()\n", + " \n", + " self.block = nn.Sequential(\n", + " nn.Linear(n_c, 4*n_c, bias=False),\n", + " GELU_new(),\n", + " nn.Linear(4*n_c, n_c, bias=False),\n", + " nn.Dropout(DROPOUT)\n", + " )\n", + " \n", + " def forward(self, x):\n", + " return self.block(x)\n", + "\n", + " \n", + "class SelfAttention(nn.Module):\n", + " def __init__(self, n_features, head_size):\n", + " super(SelfAttention, self).__init__()\n", + " # in: (B, F, 64)\n", + " self.k = nn.Linear(n_features, head_size, bias=False)\n", + " self.q = nn.Linear(n_features, head_size, bias=False)\n", + " self.v = nn.Linear(n_features, head_size, bias=False)\n", + " self.dpt = nn.Dropout(DROPOUT)\n", + " self.sqrt_d = torch.sqrt(torch.tensor(head_size))\n", + " \n", + " def forward(self, x):\n", + " k = self.k(x)\n", + " q = self.q(x)\n", + " v = self.v(x)\n", + " \n", + " # Q.K.t\n", + " dot = torch.bmm(q, k.permute(0, 2, 1))\n", + " \n", + " # normalize dot product.\n", + " dot /= self.sqrt_d\n", + " \n", + " # softmax over -1 dim.\n", + " softmax = self.dpt(torch.softmax(dot, dim=-1))\n", + " \n", + " # dot with values. 
(B, F, F) * (B, F, x) = (B, F, x)\n", + " return torch.bmm(softmax, v)\n", + " \n", + "\n", + "class MultiHeadAttention(nn.Module):\n", + " def __init__(self, n_heads, n_dim):\n", + " super(MultiHeadAttention, self).__init__()\n", + " \n", + " # 64 dims, 4 heads => 16 dims per head.\n", + " head_size = n_dim//n_heads\n", + " self.heads = nn.ModuleList([SelfAttention(n_dim, head_size) for _ in range(n_heads)])\n", + " self.proj = nn.Linear(n_dim, n_dim, bias=False)\n", + " \n", + " def forward(self, x):\n", + " # x is (B, seq, n_dim)\n", + " cat = torch.cat([head(x) for head in self.heads], dim=-1)\n", + " return self.proj(cat)\n", + "\n", + "\n", + "class Block(nn.Module):\n", + " def __init__(self, n_c):\n", + " super(Block, self).__init__()\n", + " \n", + " self.sa = MultiHeadAttention(n_heads=4, n_dim=n_c)\n", + " self.dilated = DilatedBlock(n_c)\n", + " self.ln1 = nn.LayerNorm(n_c)\n", + " self.ln2 = nn.LayerNorm(n_c)\n", + " \n", + " \n", + " def forward(self, x):\n", + " x = x + self.sa(self.ln1(x))\n", + " x = x + self.dilated(self.ln2(x))\n", + " return x\n", + " \n", + "\n", + "class LSTMLayer(nn.Module):\n", + " def __init__(\n", + " self, input_size: int, hidden_size: int, \n", + " output_size: int, n_lstm_layers: int = 1\n", + " ):\n", + " super(LSTMLayer, self).__init__()\n", + " \n", + " n_embed_mode = 16\n", + " \n", + " self.hidden_size = hidden_size\n", + " self.embedding = nn.Embedding(7, n_embed_mode, padding_idx=0)\n", + " self.dpt = nn.Dropout(DROPOUT)\n", + " \n", + " self.lstm = nn.LSTM(\n", + " input_size=input_size + n_embed_mode,\n", + " hidden_size=hidden_size,\n", + " bias=False,\n", + " bidirectional=True,\n", + " batch_first=True,\n", + " num_layers=n_lstm_layers\n", + " )\n", + " \n", + " def forward(self, modes, x, lengths):\n", + " mode_emb = self.embedding(modes)\n", + " x = torch.cat([x, mode_emb], dim=-1)\n", + " \n", + " packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)\n", + " out, _ = 
self.lstm(packed)\n", + " unpacked, _ = pad_packed_sequence(out, batch_first=True)\n", + " \n", + " return self.dpt(unpacked)\n", + "\n", + "\n", + "class Model(nn.Module):\n", + " def __init__(\n", + " self, input_size: int, hidden_size: int, output_size: int, \n", + " n_features: int, n_lstm_layers: int = 1, **kwargs\n", + " ):\n", + " super(Model, self).__init__()\n", + " \n", + " block1_ip_dim = hidden_size*2\n", + " block2_ip_dim = (hidden_size*2) + n_features\n", + " \n", + " self.lstm = LSTMLayer(\n", + " input_size, hidden_size, \n", + " output_size, n_lstm_layers\n", + " )\n", + " \n", + " self.block_l1 = nn.ModuleList([Block(block1_ip_dim) for _ in range(kwargs['l1_blocks'])])\n", + " self.block_l2 = nn.ModuleList([Block(block2_ip_dim) for _ in range(kwargs['l2_blocks'])])\n", + " self.final_proj = nn.Linear(block2_ip_dim, output_size, bias=True)\n", + " \n", + " def forward(self, modes, x, features, lengths):\n", + " \n", + " b = x.size(0)\n", + " \n", + " # Out = (B, seq, hidden*2)\n", + " lstm_out = self.lstm(modes, x, lengths)\n", + " \n", + " # Pass the raw output through the blocks.\n", + " for module in self.block_l1:\n", + " lstm_out = module(lstm_out)\n", + " \n", + " features_rshp = features.unsqueeze(1).expand(b, lstm_out.size(1), -1)\n", + " \n", + " # Out = (B, seq, n+40)\n", + " cat = torch.cat([lstm_out, features_rshp], dim=-1)\n", + " \n", + " for module in self.block_l2:\n", + " cat = module(cat)\n", + " \n", + " # (8, 3, 104) -> (B, 104)\n", + " # flattened = cat.view(b, -1)\n", + " \n", + " # proj = self.runtime_ffw(flattened.size(-1), 64)(flattened)\n", + " proj = cat.mean(dim=1)\n", + " \n", + " return self.final_proj(proj)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70b4d4ea", + "metadata": {}, + "outputs": [], + "source": [ + "import torch.nn.init as init\n", + "\n", + "def init_weights(module):\n", + " if isinstance(module, nn.Embedding):\n", + " module.weight.data.normal_(mean=0.0, std=1.0)\n", + " if 
module.padding_idx is not None:\n", + " module.weight.data[module.padding_idx].zero_()\n", + " elif isinstance(module, nn.LayerNorm):\n", + " module.bias.data.zero_()\n", + " module.weight.data.fill_(1.0)\n", + " elif isinstance(module, nn.BatchNorm1d):\n", + " init.normal_(m.weight.data, mean=1, std=0.02)\n", + " init.constant_(m.bias.data, 0)\n", + " elif isinstance(module, nn.Linear):\n", + " init.xavier_normal_(module.weight.data)\n", + " if module.bias is not None:\n", + " init.normal_(module.bias.data)\n", + " elif isinstance(module, nn.LSTM):\n", + " for param in module.parameters():\n", + " if len(param.shape) >= 2:\n", + " init.orthogonal_(param.data)\n", + " else:\n", + " init.normal_(param.data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "282ecd1a", + "metadata": {}, + "outputs": [], + "source": [ + "model = Model(\n", + " n_lstm_layers=3,\n", + " input_size=3,\n", + " hidden_size=32, \n", + " output_size=num_classes,\n", + " n_features=40,\n", + " l1_blocks=4,\n", + " l2_blocks=4\n", + ")\n", + "\n", + "model = model.apply(init_weights)\n", + "\n", + "print(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20fec22b", + "metadata": {}, + "outputs": [], + "source": [ + "print(sum(p.numel() for p in model.parameters()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ca4b65a", + "metadata": {}, + "outputs": [], + "source": [ + "weights = train_df.shape[0]/(np.bincount(train_df.chosen.values) * len(np.unique(train_df.chosen)))\n", + "\n", + "print(weights)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7a2017b", + "metadata": {}, + "outputs": [], + "source": [ + "INIT_LR = 1e-3\n", + "optimizer = optim.Adam(model.parameters(), lr=INIT_LR)\n", + "criterion = nn.CrossEntropyLoss(weight=torch.Tensor(weights))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5bbda7c", + "metadata": {}, + "outputs": [], + "source": [ + "class 
Trainer:\n", + " def __init__(self, model, tr_loader, te_loader):\n", + " pass\n", + " \n", + " def set_optim_params(self, **kwargs):\n", + " pass\n", + " \n", + " def set_criterion(self, **kwargs):\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e53e4fd1", + "metadata": {}, + "outputs": [], + "source": [ + "def train(epoch, model, loader, opt, criterion, val_ix):\n", + " \n", + " print(\"\\tBeginning training.\")\n", + " \n", + " n_batches = len(loader)\n", + " print_every = n_batches//5\n", + " \n", + " train_losses, val_losses = [], []\n", + " \n", + " for ix, (X, y, lengths) in enumerate(loader):\n", + " \n", + " # Unpack X.\n", + " modes, metrics, features = X\n", + " # Cast y to appropriate type.\n", + " y = y.float()\n", + " \n", + " if ix in val_ix:\n", + " model.eval()\n", + " with torch.no_grad():\n", + " y_pred = model(modes, metrics.float(), features.float(), lengths)\n", + " loss = criterion(y_pred.view(-1, num_classes), y.view(-1, num_classes))\n", + " val_losses.append(loss.item())\n", + " else:\n", + " model.train()\n", + " \n", + " opt.zero_grad()\n", + "\n", + " y_pred = model(modes, metrics.float(), features.float(), lengths)\n", + " loss = criterion(y_pred.view(-1, num_classes), y.view(-1, num_classes))\n", + " train_losses.append(loss.item())\n", + "\n", + " loss.backward()\n", + "\n", + " optimizer.step()\n", + " \n", + " if ix and ix % print_every == 0:\n", + " print(\n", + " f\"\\t-> Train loss: {np.nanmean(train_losses)}\\n\\t-> Val loss: {np.nanmean(val_losses)}\"\n", + " )\n", + " print('\\t'+20*'*')\n", + "\n", + " print(50*'-')\n", + " return train_losses, val_losses" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a33fefa", + "metadata": {}, + "outputs": [], + "source": [ + "def evaluate(model, loader, criterion):\n", + " \n", + " print(\"\\tBeginning evaluation.\")\n", + " \n", + " model.eval()\n", + " \n", + " print_every = len(loader)//5\n", + " \n", + " losses = []\n", 
+ " \n", + " for ix, (X, y, lengths) in enumerate(loader):\n", + " \n", + " modes, metrics, features = X\n", + "\n", + " y_pred = model(modes, metrics.float(), features.float(), lengths)\n", + " y = y.float()\n", + " \n", + " loss = criterion(y_pred.view(-1, num_classes), y.view(-1, num_classes))\n", + "\n", + " losses.append(loss.item())\n", + " \n", + " if ix and ix % print_every == 0:\n", + " print(f\"\\t -> Average loss: {np.nanmean(losses)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "650a5240", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import f1_score\n", + "\n", + "\n", + "def evaluate_f1(model, tr_loader, val_ix, te_loader=None):\n", + " \n", + " tr_preds, val_preds, te_preds = np.array([]), np.array([]), np.array([])\n", + " tr_gt, val_gt, te_gt = np.array([]), np.array([]), np.array([])\n", + " \n", + " model.eval()\n", + " print(\"\\tEvaluating F1...\")\n", + " \n", + " with torch.no_grad():\n", + " for ix, (X, y, lengths) in enumerate(tr_loader):\n", + " \n", + " modes, metrics, features = X\n", + "\n", + " y_pred = model(modes, metrics.float(), features.float(), lengths).view(-1, num_classes)\n", + " y = y.float().view(-1, num_classes)\n", + "\n", + " preds = torch.argmax(F.softmax(y_pred, dim=-1), dim=-1).numpy().ravel()\n", + " true = torch.argmax(y.long(), dim=-1).numpy().ravel()\n", + " \n", + " if ix in val_ix:\n", + " val_preds = np.append(val_preds, preds)\n", + " val_gt = np.append(val_gt, true)\n", + " else:\n", + " tr_preds = np.append(tr_preds, preds)\n", + " tr_gt = np.append(tr_gt, true)\n", + " \n", + " tr_f1 = f1_score(y_true=tr_gt, y_pred=tr_preds, average='weighted')\n", + " val_f1 = f1_score(y_true=val_gt, y_pred=val_preds, average='weighted')\n", + " print(f\"\\t -> Train F1: {tr_f1}, Val F1: {val_f1}\")\n", + " \n", + " if not te_loader:\n", + " return tr_f1, val_f1, None\n", + "\n", + " for ix, (X, y, lengths) in enumerate(te_loader):\n", + " \n", + " modes, metrics, 
features = X\n", + "\n", + " y_pred = model(modes, metrics.float(), features.float(), lengths).view(-1, num_classes)\n", + " y = y.float().view(-1, num_classes)\n", + " \n", + " preds = torch.argmax(F.softmax(y_pred, dim=-1), dim=-1).numpy().ravel()\n", + " true = torch.argmax(y.long(), dim=-1).numpy().ravel()\n", + "\n", + " te_preds = np.append(te_preds, preds)\n", + " te_gt = np.append(te_gt, true)\n", + " \n", + " te_f1 = f1_score(y_true=te_gt, y_pred=te_preds, average='weighted')\n", + " print(f\"\\t -> Test F1: {te_f1}\")\n", + " \n", + " return tr_f1, val_f1, te_f1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7191e78b", + "metadata": {}, + "outputs": [], + "source": [ + "# Other training hyperparameters.\n", + "num_epochs = 18\n", + "num_decays = 6\n", + "decay_at = num_epochs // num_decays\n", + "decay = 0.9\n", + "eval_every = 3\n", + "\n", + "# Static hold-out val set.\n", + "n_batches = len(train_loader)\n", + "val_split = 0.2\n", + "val_batches = np.random.choice(n_batches, size=(int(val_split * n_batches),), replace=False)\n", + "\n", + "# Just checking what LRs should be after decaying.\n", + "for power in range(num_decays):\n", + " print(f\"{decay_at * power} - {decay_at * (power + 1)} :: {INIT_LR * decay**power:.5f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc4b72de", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# We'd like to start at a loss of at most -ln(1/9) ~ 2.19\n", + "\n", + "# Wrapper to contain all losses.\n", + "tr_losses, val_losses = list(), list()\n", + "save_at_best_loss = True\n", + "best_loss = np.inf\n", + "model_name = \"../models/LSTM_{epoch}_{loss}.pt\"\n", + "patience, delta = 2, 0\n", + "\n", + "for epoch_ix in range(1, num_epochs+1):\n", + " print(f\"Epoch {epoch_ix}:\")\n", + " tr_loss, val_loss = train(epoch_ix, model, train_loader, optimizer, criterion, val_batches)\n", + " \n", + " tr_losses.extend(tr_loss)\n", + " 
val_losses.extend(val_loss)\n", + " \n", + " mean_val_loss = np.nanmean(val_loss)\n", + " \n", + " if epoch_ix and epoch_ix % eval_every == 0:\n", + " # evaluate(epoch_ix, model, test_loader, criterion)\n", + " tr_f1, val_f1, _ = evaluate_f1(model, train_loader, val_batches)\n", + " \n", + " if mean_val_loss < best_loss and save_at_best_loss:\n", + " best_loss = mean_val_loss\n", + " \n", + " # Reset delta.\n", + " delta = 0\n", + " \n", + " loss_str = str(best_loss).replace(\".\", \"_\")\n", + " torch.save(model.state_dict(), model_name.format(epoch=str(epoch_ix), loss=loss_str))\n", + " print(\"\\tSaved model checkpoint.\")\n", + " else:\n", + " # Increase delta by 1.\n", + " delta += 1\n", + " print(f\"\\tLoss did not decrease. Status is now {delta}/{patience}\")\n", + " \n", + " # Tolerate for `patience` epochs.\n", + " if delta == patience + 1:\n", + " # Stop training.\n", + " break\n", + "\n", + " if epoch_ix % decay_at == 0:\n", + " optimizer.param_groups[0]['lr'] *= decay\n", + " print(f\"\\tLearning rate is now: {optimizer.param_groups[0]['lr']:.5f}\")\n", + " \n", + " print(50*'-')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bd1ffc7", + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluate once on the test set.\n", + "evaluate(model, test_loader, criterion)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "396b615f", + "metadata": {}, + "outputs": [], + "source": [ + "final_tr_f1, final_val_f1, te_f1 = evaluate_f1(model, train_loader, val_batches, test_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bcc396c", + "metadata": {}, + "outputs": [], + "source": [ + "# fig, ax = plt.subplots(figsize=(10, 6))\n", + "# ax.plot(tr_losses, 'r-')\n", + "# ax.plot(val_losses, 'b-')\n", + "# ax.set_title('Training and Validation losses')\n", + "# plt.legend(['Training loss', 'Validation loss'], loc='best')\n", + "# plt.tight_layout()\n", + "# plt.show()" + ] + }, + { + 
"cell_type": "markdown", + "id": "a7d53498", + "metadata": {}, + "source": [ + "## Benchmarking\n", + "\n", + "\n", + "\n", + "epochs = 30\n", + "\n", + "```\n", + "LR scheme:\n", + "0 - 5 :: 0.00070\n", + "5 - 10 :: 0.00067\n", + "10 - 15 :: 0.00063\n", + "15 - 20 :: 0.00060\n", + "20 - 25 :: 0.00057\n", + "25 - 30 :: 0.00054\n", + "```\n", + "\n", + "```language=python\n", + "model = Model(\n", + " n_lstm_layers=1,\n", + " input_size=3,\n", + " hidden_size=16, \n", + " output_size=9,\n", + " n_features=40,\n", + " l1_blocks=6,\n", + " l2_blocks=6\n", + ")\n", + "```\n", + "\n", + "\\# params: ~450k\n", + "\n", + "mode_embedding = 4\n", + "\n", + "Best stats:\n", + "\t -> Train F1: 0.7047532574096045\n", + "\t -> Test F1: 0.6560129685481192\n", + "\n", + "
\n", + "\n", + "epochs = 40\n", + "\n", + "Same LR scheme as above.\n", + "\n", + "```language=python\n", + "model = Model(\n", + " n_lstm_layers=3,\n", + " input_size=3,\n", + " hidden_size=32, \n", + " output_size=9,\n", + " n_features=40,\n", + " l1_blocks=4,\n", + " l2_blocks=4\n", + ")\n", + "```\n", + "\n", + "\\# params: 770k\n", + "\n", + "mode_embedding = 4\n", + "\n", + "Best stats:\n", + "\t -> Train F1: 0.7365035440256072\n", + "\t -> Test F1: 0.6610215030981759\n", + " \n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46a8dc7d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pytorch", + "language": "python", + "name": "pytorch" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/replacement_mode_modeling/experimental_notebooks/README.md b/replacement_mode_modeling/experimental_notebooks/README.md new file mode 100644 index 0000000..13bb9e8 --- /dev/null +++ b/replacement_mode_modeling/experimental_notebooks/README.md @@ -0,0 +1,3 @@ +# All these scripts and notebooks are not verified to run, + +I am simply pushing these in this directory so that it might be of help for reference or to know what has already been tried. Please do not expect these notebooks to run seamlessly since a lot of them rely on intermediate pre-processed data. 
\ No newline at end of file diff --git a/replacement_mode_modeling/experimental_notebooks/baseline_modeling0.ipynb b/replacement_mode_modeling/experimental_notebooks/baseline_modeling0.ipynb new file mode 100644 index 0000000..5447229 --- /dev/null +++ b/replacement_mode_modeling/experimental_notebooks/baseline_modeling0.ipynb @@ -0,0 +1,1011 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook used for extensive experimentation on trip-level models with AllCEO data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### All experiments are logged in Notion [here](https://www.notion.so/Replacement-mode-modeling-257c2f460377498d921e6b167f465945)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from enum import Enum\n", + "import random\n", + "\n", + "# Math and graphing.\n", + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# sklearn imports.\n", + "from sklearn.model_selection import train_test_split, StratifiedGroupKFold, GroupKFold\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import f1_score, r2_score, ConfusionMatrixDisplay\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Global experiment flags and variables.\n", + "SEED = 19348\n", + "TARGETS = ['p_micro', 'no_trip', 's_car', 'transit', 'car', 's_micro', 'ridehail', 'walk', 'unknown']\n", + "\n", + "DROP_S_MICRO = True\n", + "\n", + "# Set the Numpy seed too.\n", + "random.seed(SEED)\n", + "np.random.seed(SEED)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class SPLIT_TYPE(Enum):\n", + " INTRA_USER = 0\n", + " INTER_USER = 1\n", + " TARGET = 2\n", + " MODE = 3\n", + " INTER_USER_STATIC = 4\n", + " \n", + "\n", + "class SPLIT(Enum):\n", + " TRAIN = 0\n", + " TEST = 1\n", + "\n", + "\n", + "def get_train_test_splits(data: pd.DataFrame, how=SPLIT_TYPE, test_ratio=0.2, shuffle=True):\n", + " \n", + " if how == SPLIT_TYPE.INTER_USER:\n", + "\n", + " X = data.drop(columns=['target'])\n", + " y = data['target'].values\n", + " groups = data.user_id.values\n", + " \n", + " splitter = StratifiedGroupKFold(n_splits=5, shuffle=shuffle, random_state=SEED)\n", + " # splitter = GroupKFold(n_splits=5)\n", + " \n", + " for train_index, test_index in splitter.split(X, y, groups):\n", + " X_tr = data.iloc[train_index, :]\n", + " X_te = data.iloc[test_index, :]\n", + " \n", + " # Iterate only once and break.\n", + " break\n", + "\n", + " return X_tr, X_te\n", + " \n", + " elif how == SPLIT_TYPE.INTRA_USER:\n", + " \n", + " # There are certain users with only one observation. 
What do we do with those?\n", + " # As per the mobilitynet modeling pipeline, we randomly assign them to either the\n", + " # training or test set.\n", + " \n", + " value_counts = data.user_id.value_counts()\n", + " single_count_ids = value_counts[value_counts == 1].index\n", + " \n", + " data_filtered = data.loc[~data.user_id.isin(single_count_ids), :].reset_index(drop=True)\n", + " data_single_counts = data.loc[data.user_id.isin(single_count_ids), :].reset_index(drop=True)\n", + " \n", + " X_tr, X_te = train_test_split(\n", + " data_filtered, test_size=test_ratio, shuffle=shuffle, stratify=data_filtered.user_id,\n", + " random_state=SEED\n", + " )\n", + " \n", + " data_single_counts['assigned'] = np.random.choice(['train', 'test'], len(data_single_counts))\n", + " X_tr_merged = pd.concat(\n", + " [X_tr, data_single_counts.loc[data_single_counts.assigned == 'train', :].drop(\n", + " columns=['assigned'], inplace=False\n", + " )],\n", + " ignore_index=True, axis=0\n", + " )\n", + " \n", + " X_te_merged = pd.concat(\n", + " [X_te, data_single_counts.loc[data_single_counts.assigned == 'test', :].drop(\n", + " columns=['assigned'], inplace=False\n", + " )],\n", + " ignore_index=True, axis=0\n", + " )\n", + " \n", + " return X_tr_merged, X_te_merged\n", + " \n", + " elif how == SPLIT_TYPE.TARGET:\n", + " \n", + " X_tr, X_te = train_test_split(\n", + " data, test_size=test_ratio, shuffle=shuffle, stratify=data.target,\n", + " random_state=SEED\n", + " )\n", + " \n", + " return X_tr, X_te\n", + " \n", + " elif how == SPLIT_TYPE.MODE:\n", + " X_tr, X_te = train_test_split(\n", + " data, test_size=test_ratio, shuffle=shuffle, stratify=data.section_mode_argmax,\n", + " random_state=SEED\n", + " )\n", + " \n", + " return X_tr, X_te\n", + " \n", + " elif how == SPLIT_TYPE.INTER_USER_STATIC:\n", + " \n", + " train_ids = ['810be63d084746e3b7da9d943dd88e8c', 'bf774cbe6c3040b0a022278d36a23f19', '8a8332a53a1b4cdd9f3680434e91a6ef', \n", + " '5ad862e79a6341f69f28c0096fe884da', 
'7f89656bd4a94d12ad8e5ad9f0afecaf', 'fbaa338d7cd7457c8cad4d0e60a44d18', \n", + " '3b25446778824941a4c70ae5774f4c68', '28cb1dde85514bbabfd42145bdaf7e0a', '3aeb5494088542fdaf798532951aebb0', \n", + " '531732fee3c24366a286d76eb534aebc', '950f4287bab5444aa0527cc23fb082b2', '737ef8494f26407b8b2a6b1b1dc631a4', \n", + " 'e06cf95717f448ecb81c440b1b2fe1ab', '7347df5e0ac94a109790b31ba2e8a02a', 'bd9cffc8dbf1402da479f9f148ec9e60', \n", + " '2f3b66a5f98546d4b7691fba57fa640f', 'f289f7001bd94db0b33a7d2e1cd28b19', '19a043d1f2414dbcafcca44ea2bd1f19', \n", + " '68788082836e4762b26ad0877643fdcf', '4e8b1b7f026c4384827f157225da13fa', '703a9cee8315441faff7eb63f2bfa93f', \n", + " 'add706b73839413da13344c355dde0bb', '47b5d57bd4354276bb6d2dcd1438901d', 'e4cfb2a8f600426897569985e234636e', \n", + " '0154d71439284c34b865e5a417cd48af', '234f4f2366244fe682dccded2fa7cc4e', '0d0ae3a556414d138c52a6040a203d24', \n", + " '44c10f66dec244d6b8644231d4a8fecb', '30e9b141d7894fbfaacecd2fa18929f9', '0eb313ab00e6469da78cc2d2e94660fb', \n", + " 'fc51d1258e4649ecbfb0e6ecdaeca454', 'a1954793b1454b2f8cf95917d7547169', '6656c04c6cba4c189fed805eaa529741', \n", + " '6a0f3653b80a4c949e127d6504debb55', 'dfe5ca1bb0854b67a6ffccad9565d669', '8b1f3ba43de945bea79de6a81716ad04', \n", + " 'cde34edb8e3a4278a18e0adb062999e5', '6d96909e5ca442ccb5679d9cdf3c8f5b', 'a60a64d82d1c439a901b683b73a74d73', \n", + " '60e6a6f6ed2e4e838f2bbed6a427028d', '88041eddad7542ea8c92b30e5c64e198', '1635c003b1f94a399ebebe21640ffced', \n", + " '1581993b404a4b9c9ca6b0e0b8212316', 'b1aed24c863949bfbfa3a844ecf60593', '4b89612d7f1f4b368635c2bc48bd7993', \n", + " 'eb2e2a5211564a9290fcb06032f9b4af', '26767f9f3da54e93b692f8be6acdac43', '8a98e383a2d143e798fc23869694934a', \n", + " 'b346b83b9f7c4536b809d5f92074fdae', 'd929e7f8b7624d76bdb0ec9ada6cc650', '863e9c6c8ec048c4b7653f73d839c85b', \n", + " 'f50537eb104e4213908f1862c8160a3e', '4a9db5a9bac046a59403b44b883cc0ba', 'cded005d5fd14c64a5bba3f5c4fe8385', \n", + " 'c7ce889c796f4e2a8859fa2d7d5068fe', 
'405b221abe9e43bc86a57ca7fccf2227', '0b3e78fa91d84aa6a3203440143c8c16', \n", + " 'fbff5e08b7f24a94ab4b2d7371999ef7', 'e35e65107a34496db49fa5a0b41a1e9e', 'd5137ebd4f034dc193d216128bb7fc9a', \n", + " '3f7f2e536ba9481e92f8379b796ad1d0', 'dc75e0b776214e1b9888f6abd042fd95', 'b41dd7d7c6d94fe6afe2fd26fa4ac0bd', \n", + " 'eec6936e1ac347ef9365881845ec74df', '8c7d261fe8284a42a777ffa6f380ba3b', '4baf8c8af7b7445e9067854065e3e612', \n", + " 'c6e4db31c18b4355b02a7dd97deca70b', 'f0db3b1999c2410ba5933103eca9212f', '487e20ab774742378198f94f5b5b0b43', \n", + " 'dc1ed4d71e3645d0993885398d5628ca', '8c3c63abb3ec4fc3a61e7bf316ee4efd', '15eb78dd6e104966ba6112589c29dc41', \n", + " 'c23768ccb817416eaf08be487b2e3643', 'ecd2ae17d5184807abd87a287115c299', '71f21d53b655463784f3a3c63c56707b', \n", + " '2931e0a34319495bbb5898201a54feb5', '92bde0d0662f45ac864629f486cffe77', '42b3ee0bc02a481ab1a94644a8cd7a0d', \n", + " '15aa4ba144a34b8b8079ed7e049d84df', '509b909390934e988eb120b58ed9bd8c', '14103cda12c94642974129989d39e50d', \n", + " '8b0876430c2641bcaea954ea00520e64', 'baa4ff1573ae411183e10aeb17c71c53', '14fe8002bbdc4f97acbd1a00de241bf6', \n", + " '1b7d6dfea8464bcab9321018b10ec9c9', '487ad897ba93404a8cbe5de7d1922691', '5182d93d69754d7ba06200cd1ac5980a', \n", + " '91f3ca1c278247f79a806e49e9cc236f', 'e66e63b206784a559d977d4cb5f1ec34', '840297ae39484e26bfebe83ee30c5b3e', \n", + " 'c6807997194c4c528a8fa8c1f6ee1595', '802667b6371f45b29c7abb051244836a', 'b2bbe715b6a14fd19f751cae8adf6b4e', \n", + " 'feb1d940cd3647d1a101580c2a3b3f8c', '1b9883393ab344a69bc1a0fab192a94c', 'ac604b44fdca482fb753034cb55d1351', \n", + " 'f446bf3102ff4bd99ea1c98f7d2f7af0', 'c2c5d4b9a607487ea405a99c721079d4', '85ddd3c34c58407392953c47a32f5428', \n", + " 'd51de709f95045f8bacf473574b96ba5', '6373dfb8cb9b47e88e8f76adcfadde20', '313d003df34b4bd9823b3474fc93f9f9', \n", + " '53e78583db87421f8decb529ba859ca4', '8fdc9b926a674a9ea07d91df2c5e06f2', '90480ac60a3d475a88fbdab0a003dd5d', \n", + " '7559c3f880f341e898a402eba96a855d', 
'19a4c2cf718d40588eb96ac25a566353', 'f4427cccaa9442b48b42bedab5ab648e', \n", + " 'e192b8a00b6c422296851c93785deaf7', '355e25bdfc244c5e85d358e39432bd44', 'a0c3a7b410b24e18995f63369a31d123', \n", + " '03a395b4d8614757bb8432b4984559b0', 'a2d48b05d5454d428c0841432c7467b6', '3d981e617b304afab0f21ce8aa6c9786', \n", + " '2cd5668ac9054e2eb2c88bb4ed94bc6d', 'd7a732f4a8644bcbb8dedfc8be242fb2', '367eb90b929d4f6e9470d15c700d2e3f', \n", + " 'e049a7b2a6cb44259f907abbb44c5abc', 'a231added8674bef95092b32bc254ac8', 'e88a8f520dde445484c0a9395e1a0599',\n", + " 'cba570ae38f341faa6257342727377b7', '97953af1b97d4e268c52e1e54dcf421a', 'd200a61757d84b1dab8fbac35ff52c28', \n", + " 'fc68a5bb0a7b4b6386b3f08a69ead36f', '4a8210aec25e443391efb924cc0e5f23', '903742c353ce42c3ad9ab039fc418816', \n", + " '2114e2a75304475fad06ad201948fbad', 'ac917eae407c4deb96625dd0dc2f2ba9', '3dddfb70e7cd40f18a63478654182e9a', \n", + " 'd3735ba212dd4c768e1675dca7bdcb6f', '7abe572148864412a33979592fa985fb', 'd3dff742d07942ca805c2f72e49e12c5' \n", + " ]\n", + " \n", + " X_tr = data.loc[data.user_id.isin(train_ids), :]\n", + " X_te = data.loc[~data.user_id.isin(train_ids), :]\n", + " \n", + " return X_tr, X_te\n", + " \n", + " raise NotImplementedError(\"Unknown split type\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modeling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Read the data.\n", + "data = pd.read_csv('../data/ReplacedMode_Fix_02142024.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if DROP_S_MICRO:\n", + " data.drop(\n", + " index=data.loc[data.target == 6, :].index,\n", + " inplace=True\n", + " )\n", + " \n", + " # Shift all values after 6 by -1\n", + " data.loc[data.target > 5, 'target'] -= 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"data.drop_duplicates(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_hist(df, features=None):\n", + " if not features:\n", + " # All features.\n", + " features = df.columns.tolist()\n", + " \n", + " n_features = len(features)\n", + " \n", + " ncols = 6\n", + " nrows = n_features//ncols if n_features%ncols == 0 else (n_features//ncols) + 1\n", + " \n", + " fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(10, 10))\n", + " for ix, ax in enumerate(axes.flatten()):\n", + " \n", + " if ix > n_features:\n", + " break\n", + " \n", + " df[features[ix]].hist(ax=ax)\n", + " ax.set(title=features[ix])\n", + " \n", + " plt.tight_layout()\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First, we map the user IDs to ints.\n", + "\n", + "# USERS = list(data.user_id.unique())\n", + "\n", + "# USER_MAP = {\n", + "# u: i+1 for (i, u) in enumerate(USERS)\n", + "# }\n", + "\n", + "# data['user_id'] = data['user_id'].apply(lambda x: USER_MAP[x])\n", + "\n", + "# data.rename(\n", + "# columns={'start_local_dt_weekday': 'start:DOW', 'end_local_dt_weekday': 'end:DOW'},\n", + "# inplace=True\n", + "# )\n", + "\n", + "# Drop the samples with chosen == no trip or chosen == unknown\n", + "# data.drop(index=data.loc[data.chosen.isin([2, 9])].index, inplace=True)\n", + "\n", + "# data.n_working_residents = data.n_working_residents.apply(lambda x: 0 if x < 0 else x)\n", + "\n", + "# Fix some age preprocessing issues.\n", + "# data.age = data.age.apply(lambda x: x if x < 100 else 2024-x)\n", + "\n", + "# Collapse 'train' and 'bus' into 'transit'\n", + "# data.loc[\n", + "# data.section_mode_argmax.isin(['train', 'bus']), 'section_mode_argmax'\n", + "# ] = 'transit'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 
display(data.section_mode_argmax.value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# transit = data.loc[data.section_mode_argmax == 'transit', :].copy()\n", + "# transit['section_duration_argmax'] /= 60.\n", + "\n", + "# transit['mph'] = transit['section_distance_argmax']/transit['section_duration_argmax']\n", + "\n", + "# display(transit[['section_duration_argmax', 'section_distance_argmax', 'mph']].describe())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import plotly.express as px\n", + "\n", + "# sp = data.loc[data.section_mode_argmax.isin(['car', 'transit', 'walking']), :]\n", + "# fig = px.line(sp, y='section_distance_argmax', color='section_mode_argmax')\n", + "# fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Close the figure above.\n", + "# plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_duration_estimate(df: pd.DataFrame, dset: SPLIT, model_dict: dict):\n", + " \n", + " X_features = ['section_distance_argmax', 'age']\n", + " \n", + " if 'mph' in df.columns:\n", + " X_features += ['mph']\n", + " \n", + " if dset == SPLIT.TRAIN and model_dict is None:\n", + " model_dict = dict()\n", + " \n", + " if dset == SPLIT.TEST and model_dict is None:\n", + " raise AttributeError(\"Expected model dict for testing.\")\n", + " \n", + " if dset == SPLIT.TRAIN:\n", + " for section_mode in df.section_mode_argmax.unique():\n", + " section_data = df.loc[df.section_mode_argmax == section_mode, :]\n", + " if section_mode not in model_dict:\n", + " model_dict[section_mode] = dict()\n", + "\n", + " model = LinearRegression(fit_intercept=True)\n", + "\n", + " X = section_data[\n", + " X_features\n", + " ]\n", + " Y = section_data[['section_duration_argmax']]\n", 
+ "\n", + " model.fit(X, Y.values.ravel())\n", + "\n", + " r2 = r2_score(y_pred=model.predict(X), y_true=Y.values.ravel())\n", + " print(f\"Train R2 for {section_mode}: {r2}\")\n", + "\n", + " model_dict[section_mode]['model'] = model\n", + " \n", + " elif dset == SPLIT.TEST:\n", + " for section_mode in df.section_mode_argmax.unique():\n", + " section_data = df.loc[df.section_mode_argmax == section_mode, :]\n", + " X = section_data[\n", + " X_features\n", + " ]\n", + " Y = section_data[['section_duration_argmax']]\n", + " \n", + " y_pred = model_dict[section_mode]['model'].predict(X)\n", + " r2 = r2_score(y_pred=y_pred, y_true=Y.values.ravel())\n", + " print(f\"Test R2 for {section_mode}: {r2}\")\n", + " \n", + " # Create the new columns for the duration.\n", + " new_columns = ['p_micro','no_trip','s_car','transit','car','s_micro','ridehail','walk','unknown']\n", + " df[new_columns] = 0\n", + " df['temp'] = 0\n", + " \n", + " for section in df.section_mode_argmax.unique():\n", + " X_section = df.loc[df.section_mode_argmax == section, X_features]\n", + " \n", + " # broadcast to all columns.\n", + " df.loc[df.section_mode_argmax == section, 'temp'] = model_dict[section]['model'].predict(X_section)\n", + " \n", + " for c in new_columns:\n", + " df[c] = df['av_' + c] * df['temp']\n", + " \n", + " df.drop(columns=['temp'], inplace=True)\n", + " \n", + " df.rename(columns=dict([(x, 'tt_'+x) for x in new_columns]), inplace=True)\n", + " \n", + " # return model_dict, result_df\n", + " return model_dict, df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Now, we split the data.\n", + "train_data, test_data = get_train_test_splits(data=data, how=SPLIT_TYPE.INTER_USER)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If split is inter-user, we should verify test size.\n", + "\n", + "n_tr, n_te = len(train_data.user_id.unique()), 
len(test_data.user_id.unique())\n", + "n_ex_tr, n_ex_te = train_data.shape[0], test_data.shape[0]\n", + "\n", + "print(n_tr/(n_tr+n_te))\n", + "print(n_ex_tr/(n_ex_tr+n_ex_te))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(train_data.columns.tolist())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "params, train_data = get_duration_estimate(train_data, SPLIT.TRAIN, None)\n", + "print(10 * \"-\")\n", + "_, test_data = get_duration_estimate(test_data, SPLIT.TEST, params)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data.shape, test_data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Some helper functions that will help ease redundancy in the code.\n", + "\n", + "def drop_columns(df: pd.DataFrame):\n", + " to_drop = [\n", + " 'source', 'end_ts', 'end_fmt_time', 'end_loc', 'raw_trip', 'start_ts', \n", + " 'start_fmt_time', 'start_loc', 'duration', 'distance', 'start_place', \n", + " 'end_place', 'cleaned_trip', 'inferred_labels', 'inferred_trip', 'expectation',\n", + " 'confidence_threshold', 'expected_trip', 'user_input', 'start:year', 'start:month', \n", + " 'start:day', 'start_local_dt_minute', 'start_local_dt_second', \n", + " 'start_local_dt_weekday', 'start_local_dt_timezone', 'end:year', 'end:month', 'end:day', \n", + " 'end_local_dt_minute', 'end_local_dt_second', 'end_local_dt_weekday', \n", + " 'end_local_dt_timezone', '_id', 'user_id', 'metadata_write_ts', 'additions', \n", + " 'mode_confirm', 'purpose_confirm', 'Mode_confirm', 'Trip_purpose', \n", + " 'original_user_id', 'program', 'opcode', 'Timestamp', 'birth_year', \n", + " 'available_modes', 'section_coordinates_argmax', 'section_mode_argmax'\n", + " ]\n", + " \n", + " # Drop section_mode_argmax and 
available_modes.\n", + " return df.drop(\n", + " columns=to_drop, \n", + " inplace=False\n", + " )\n", + "\n", + "\n", + "def scale_values(df: pd.DataFrame, split: SPLIT, scalers=None):\n", + " # Scale costs using StandardScaler.\n", + " costs = df[[c for c in df.columns if 'cost_' in c]].copy()\n", + " times = df[[c for c in df.columns if 'tt_' in c or 'duration' in c]].copy()\n", + " distances = df[[c for c in df.columns if 'distance' in c]]\n", + " \n", + " print(\n", + " \"Cost columns to be scaled: \", costs.columns,\"\\nTime columns to be scaled: \", times.columns, \\\n", + " \"\\nDistance columns to be scaled: \", distances.columns\n", + " )\n", + " \n", + " if split == SPLIT.TRAIN and scalers is None:\n", + " cost_scaler = StandardScaler()\n", + " tt_scaler = StandardScaler()\n", + " dist_scaler = StandardScaler()\n", + " \n", + " cost_scaled = pd.DataFrame(\n", + " cost_scaler.fit_transform(costs), \n", + " columns=costs.columns, \n", + " index=costs.index\n", + " )\n", + " \n", + " tt_scaled = pd.DataFrame(\n", + " tt_scaler.fit_transform(times),\n", + " columns=times.columns,\n", + " index=times.index\n", + " )\n", + " \n", + " dist_scaled = pd.DataFrame(\n", + " dist_scaler.fit_transform(distances),\n", + " columns=distances.columns,\n", + " index=distances.index\n", + " )\n", + " \n", + " elif split == SPLIT.TEST and scalers is not None:\n", + " \n", + " cost_scaler, tt_scaler, dist_scaler = scalers\n", + " \n", + " cost_scaled = pd.DataFrame(\n", + " cost_scaler.transform(costs), \n", + " columns=costs.columns, \n", + " index=costs.index\n", + " )\n", + " \n", + " tt_scaled = pd.DataFrame(\n", + " tt_scaler.transform(times), \n", + " columns=times.columns, \n", + " index=times.index\n", + " )\n", + " \n", + " dist_scaled = pd.DataFrame(\n", + " dist_scaler.transform(distances),\n", + " columns=distances.columns,\n", + " index=distances.index\n", + " )\n", + " \n", + " else:\n", + " raise NotImplementedError(\"Unknown split\")\n", + " \n", + " # Drop 
the original columns.\n", + " df.drop(\n", + " columns=costs.columns.tolist() + times.columns.tolist() + distances.columns.tolist(), \n", + " inplace=True\n", + " )\n", + " \n", + " df = df.merge(right=cost_scaled, left_index=True, right_index=True)\n", + " df = df.merge(right=tt_scaled, left_index=True, right_index=True)\n", + " df = df.merge(right=dist_scaled, left_index=True, right_index=True)\n", + " \n", + " return df, (cost_scaler, tt_scaler, dist_scaler)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First, drop columns.\n", + "\n", + "train_data = drop_columns(train_data)\n", + "\n", + "# Scale cost.\n", + "# train_data, scalers = scale_values(train_data, SPLIT.TRAIN, None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_data = drop_columns(test_data)\n", + "\n", + "# Scale cost.\n", + "# test_data, _ = scale_values(test_data, SPLIT.TEST, scalers)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(train_data.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(train_data.target.unique())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# train_data.to_csv('../data/train.csv', index=False)\n", + "# test_data.to_csv('../data/test.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import classification_report\n", + "from sklearn.model_selection import GridSearchCV, StratifiedKFold\n", + "from pprint import pprint\n", + "from sklearn.inspection import permutation_importance\n", + "from time import perf_counter" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Random Forest 
classifier" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CV = False\n", + "SAVE_MODEL = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "# exp question - compute sample weights using user_id.\n", + "\n", + "rf_train = train_data.drop(columns=['target', \n", + " 'start_lat', 'start_lng', 'end_lat', 'end_lng'\n", + " ])\n", + "rf_test = test_data.drop(columns=['target', \n", + " 'start_lat', 'start_lng', 'end_lat', 'end_lng'\n", + " ])\n", + "\n", + "if CV:\n", + "\n", + " model = RandomForestClassifier(random_state=SEED)\n", + "\n", + " # We want to build bootstrapped trees that would not always use all the features.\n", + "\n", + " param_set2 = {\n", + " 'n_estimators': [150, 200, 250, 300],\n", + " 'min_samples_split': [2, 3, 4],\n", + " 'min_samples_leaf': [1, 2, 3],\n", + " 'class_weight': ['balanced_subsample'],\n", + " 'max_features': [None, 'sqrt'],\n", + " 'bootstrap': [True]\n", + " }\n", + "\n", + " cv_set2 = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)\n", + "\n", + " clf_set2 = GridSearchCV(model, param_set2, cv=cv_set2, n_jobs=-1, scoring='f1_weighted', verbose=1)\n", + "\n", + " start = perf_counter()\n", + "\n", + " clf_set2.fit(\n", + " rf_train,\n", + " train_data.target.values.ravel()\n", + " )\n", + "\n", + " time_req = (perf_counter() - start)/60.\n", + "\n", + " best_model = clf_set2.best_estimator_\n", + "else:\n", + " best_model = RandomForestClassifier(\n", + " n_estimators=150,\n", + " max_depth=None,\n", + " min_samples_leaf=2,\n", + " bootstrap=True,\n", + " class_weight='balanced_subsample',\n", + " random_state=SEED,\n", + " n_jobs=-1\n", + " ).fit(rf_train, train_data.target.values.ravel())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "best_model" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tr_f1_set2 = f1_score(\n", + " y_true=train_data.target.values,\n", + " y_pred=best_model.predict(rf_train),\n", + " average='weighted'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "te_f1_set2 = f1_score(\n", + " y_true=test_data.target.values,\n", + " y_pred=best_model.predict(rf_test),\n", + " average='weighted'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Without location:\n", + "#. intra-user split:\n", + "# [BOOTSTRAPPED] | Train F1: 0.9983454261487021, Test F1: 0.7192048995905516\n", + "# if stratified by section_mode_argmax:\n", + "# [BOOTSTRAPPED] | Train F1: 0.9987250576328509, Test F1: 0.7242573620109232\n", + "\n", + "# With location:\n", + "# [BOOTSTRAPPED] | Train F1: 0.9992402006853468, Test F1: 0.7654135199070202\n", + "\n", + "print(f\"[BOOTSTRAPPED] | Train F1: {tr_f1_set2}, Test F1: {te_f1_set2}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if SAVE_MODEL:\n", + "\n", + " import pickle\n", + "\n", + " with open('../models/tuned_rf_model.pkl', 'wb') as f:\n", + " f.write(pickle.dumps(best_model))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Feature importances - gini entropy\n", + "\n", + "pprint(\n", + " sorted(\n", + " zip(\n", + " best_model.feature_names_in_, \n", + " best_model.feature_importances_\n", + " ), \n", + " key=lambda x: x[-1], reverse=True\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# importance = permutation_importance(\n", + "# best_model,\n", + "# rf_test,\n", + "# test_data.target.values,\n", + "# n_repeats=5,\n", + "# 
random_state=SEED,\n", + "# n_jobs=-1,\n", + "# scoring='f1_weighted'\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# pd.DataFrame(\n", + "# {\n", + "# 'feature names': test_data.columns.delete(\n", + "# test_data.columns.isin(['target'])\n", + "# ),\n", + "# 'imp_mean': importance.importances_mean, \n", + "# 'imp_std': importance.importances_std\n", + "# }\n", + "# ).sort_values(by=['imp_mean'], axis='rows', ascending=False).head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# fig, ax = plt.subplots(nrows=1, ncols=2)\n", + "y_pred = best_model.predict(rf_test)\n", + "pred_df = pd.DataFrame(\n", + " {\n", + " 'y_pred': y_pred.ravel(),\n", + " 'y_true': test_data.target.values.ravel()\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(figsize=(7, 7))\n", + "cm = ConfusionMatrixDisplay.from_estimator(\n", + " best_model,\n", + " X=rf_test,\n", + " y=test_data[['target']],\n", + " ax=ax\n", + ")\n", + "# ax.set_xticklabels(TARGETS, rotation=45)\n", + "# ax.set_yticklabels(TARGETS)\n", + "fig.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print(classification_report(y_true=pred_df.y_true, y_pred=pred_df.y_pred, target_names=TARGETS))\n", + "print(classification_report(y_true=pred_df.y_true, y_pred=pred_df.y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## XGBoost" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# from sklearn.utils.class_weight import compute_sample_weight\n", + "\n", + "# sample_weights = compute_sample_weight(class_weight='balanced', y=train_data.user_id.values.ravel())" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# from xgboost import XGBClassifier\n", + "\n", + "# y_train = train_data.target.values.ravel() - 1\n", + "# y_test = test_data.target.values.ravel() - 1\n", + "\n", + "# # weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_pred), y_pred)\n", + "\n", + "# xgm = XGBClassifier(\n", + "# n_estimators=300,\n", + "# max_depth=None,\n", + "# tree_method='hist',\n", + "# objective='multi:softmax',\n", + "# num_class=9\n", + "# ).fit(rf_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# preds = xgm.predict(rf_test)\n", + "\n", + "# print(classification_report(y_true=y_test, y_pred=preds, target_names=TARGETS))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import pickle\n", + "\n", + "# # RF_RM.pkl = 0.8625 on test.\n", + "# # RF_RM_1.pkl = 0.77 on test.\n", + "# with open('../models/RF_RM_1.pkl', 'wb') as f:\n", + "# f.write(pickle.dumps(model))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TODO:\n", + "\n", + "\n", + "- Explain why location might not be a good feature to add (plot start and end on map and explain how model might just overfit to the raw coordinates)\n", + "- Merge `unknown` and `no_trip` into one category and validate against models trained on (a) separate labels (b) dropped labels\n", + "- Explore more of the abnormal `walking` trips" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "ab0c6e94c9422d07d42069ec9e3bb23090f5e156fc0e23cc25ca45a62375bf53" + }, + "kernelspec": { + "display_name": "emission", + "language": "python", + "name": "emission" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, 
+ "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/replacement_mode_modeling/experimental_notebooks/biogeme_modeling train_test_w_splits.ipynb b/replacement_mode_modeling/experimental_notebooks/biogeme_modeling train_test_w_splits.ipynb new file mode 100644 index 0000000..5cc4f68 --- /dev/null +++ b/replacement_mode_modeling/experimental_notebooks/biogeme_modeling train_test_w_splits.ipynb @@ -0,0 +1,1107 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Biogeme modeling for inter-user modeling. Contains outputs" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from enum import Enum\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "class SPLIT_TYPE(Enum):\n", + " INTRA_USER = 0\n", + " INTER_USER = 1\n", + " \n", + "\n", + "class SPLIT(Enum):\n", + " TRAIN = 0\n", + " TEST = 1\n", + "\n", + "\n", + "def get_splits(count_df: pd.DataFrame, n:int, test_size=0.2):\n", + " maxsize = int(n * test_size)\n", + "\n", + " max_threshold = int(maxsize * 1.05)\n", + " min_threshold = int(maxsize * 0.95)\n", + "\n", + " print(f\"{min_threshold=}, {max_threshold=}\")\n", + " \n", + " # Allow a 10% tolerance\n", + " def _dp(ix, curr_size, ids, cache):\n", + " \n", + " if ix >= count_df.shape[0]:\n", + " return []\n", + "\n", + " key = ix\n", + "\n", + " if key in cache:\n", + " return cache[key]\n", + "\n", + " if curr_size > max_threshold:\n", + " return []\n", + "\n", + " if min_threshold <= curr_size <= max_threshold:\n", + " return ids\n", + "\n", + " # two options - either pick the current id or skip it.\n", + " branch_a = 
_dp(ix, curr_size+count_df.loc[ix, 'count'], ids+[count_df.loc[ix, 'index']], cache)\n", + " branch_b = _dp(ix+1, curr_size, ids, cache)\n", + " \n", + " curr_max = []\n", + " if branch_a and len(branch_a) > 0:\n", + " curr_max = branch_a\n", + " \n", + " if branch_b and len(branch_b) > len(branch_a):\n", + " curr_max = branch_b\n", + " \n", + " cache[key] = curr_max\n", + " return cache[key]\n", + " \n", + " return _dp(0, 0, ids=list(), cache=dict())\n", + "\n", + "\n", + "def get_train_test_splits(data: pd.DataFrame, how=SPLIT_TYPE, test_ratio=0.2, shuffle=True):\n", + "\n", + " n_users = list(data.user_id.unique())\n", + " n = data.shape[0]\n", + " \n", + " if shuffle:\n", + " data = data.sample(data.shape[0]).reset_index(drop=True, inplace=False)\n", + "\n", + " if how == SPLIT_TYPE.INTER_USER:\n", + " # Make the split, ensuring that a user in one fold is not leaked into the other fold.\n", + " # Basic idea: we want to start with the users with the highest instances and place alternating users in each set.\n", + " counts = data.user_id.value_counts().reset_index(drop=False, inplace=False, name='count')\n", + "\n", + " # Now, start with the user_id at the top, and keep adding to either split.\n", + " # This can be achieved using a simple DP program.\n", + " test_ids = get_splits(counts, data.shape[0])\n", + " test_data = data.loc[data.user_id.isin(test_ids), :]\n", + " train_index = data.index.difference(test_data.index)\n", + " train_data = data.loc[data.user_id.isin(train_index), :]\n", + " \n", + " return train_data, test_data\n", + " \n", + " elif how == SPLIT_TYPE.INTRA_USER:\n", + " \n", + " # There are certain users with only one observation. 
What do we do with those?\n", + " # As per the mobilitynet modeling pipeline, we randomly assign them to either the\n", + " # training or test set.\n", + " \n", + " value_counts = data.user_id.value_counts()\n", + " single_count_ids = value_counts[value_counts == 1].index\n", + " \n", + " data_filtered = data.loc[~data.user_id.isin(single_count_ids), :].reset_index(drop=True)\n", + " data_single_counts = data.loc[data.user_id.isin(single_count_ids), :].reset_index(drop=True)\n", + " \n", + " X_tr, X_te = train_test_split(\n", + " data_filtered, test_size=test_ratio, shuffle=shuffle, stratify=data_filtered.user_id\n", + " )\n", + " \n", + " data_single_counts['assigned'] = np.random.choice(['train', 'test'], len(data_single_counts))\n", + " X_tr_merged = pd.concat(\n", + " [X_tr, data_single_counts.loc[data_single_counts.assigned == 'train', :].drop(\n", + " columns=['assigned'], inplace=False\n", + " )],\n", + " ignore_index=True, axis=0\n", + " )\n", + " \n", + " X_te_merged = pd.concat(\n", + " [X_te, data_single_counts.loc[data_single_counts.assigned == 'test', :].drop(\n", + " columns=['assigned'], inplace=False\n", + " )],\n", + " ignore_index=True, axis=0\n", + " )\n", + " \n", + " return X_tr_merged, X_te_merged\n", + " \n", + " raise NotImplementedError(\"Unknown split type\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modeling" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import biogeme.biogeme as bio\n", + "import biogeme.database as db\n", + "from biogeme import models\n", + "from biogeme.expressions import Beta, DefineVariable\n", + "from biogeme.expressions import Variable\n", + "import numpy as np\n", + "\n", + "from sklearn.preprocessing import MinMaxScaler" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "n_rows: 
164281\n" + ] + } + ], + "source": [ + "# Read the data.\n", + "data = pd.read_csv('../data/preprocessed_data_split_chosen.csv')\n", + "\n", + "print(\"n_rows: \", data.shape[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# First, we map the user IDs to ints.\n", + "\n", + "USER_MAP = {\n", + " u: i+1 for (i, u) in enumerate(data.user_id.unique())\n", + "}\n", + "\n", + "data['user_id'] = data['user_id'].apply(lambda x: USER_MAP[x])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Now, we split the data (either inter-user or intra-user split)\n", + "\n", + "# 0.98???\n", + "# train_data, test_data = get_train_test_splits(data=data, how=SPLIT_TYPE.INTER_USER, shuffle=True)\n", + "\n", + "# 0.975???\n", + "train_data, test_data = get_train_test_splits(data=data, how=SPLIT_TYPE.INTRA_USER, shuffle=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Some helper functions that will help ease redundancy in the code.\n", + "\n", + "def drop_columns(df: pd.DataFrame):\n", + " # Drop section_mode_argmax and available_modes.\n", + " return df.drop(columns=[\n", + " 'section_mode_argmax', 'available_modes', 'section_duration_argmax', 'section_distance_argmax'\n", + " ], inplace=False)\n", + "\n", + "\n", + "def scale_time(df: pd.DataFrame):\n", + " # Convert from min -> hrs\n", + " df[[c for c in df.columns if 'tt_' in c]] /= 60.\n", + " return df\n", + "\n", + "\n", + "def scale_cost(df: pd.DataFrame, split: SPLIT, scaler=None):\n", + " # Scale costs using MinMaxScaler.\n", + " costs = df[[c for c in df.columns if 'cost_' in c]].copy()\n", + " \n", + " if split == SPLIT.TRAIN and scaler is None:\n", + " scaler = MinMaxScaler()\n", + " cost_scaled = pd.DataFrame(\n", + " scaler.fit_transform(costs), \n", + " columns=['scaled_' + c for c in costs.columns], \n", + " 
index=costs.index\n", + " )\n", + " \n", + " elif split == SPLIT.TEST and scaler is not None:\n", + " cost_scaled = pd.DataFrame(\n", + " scaler.transform(costs), \n", + " columns=['scaled_' + c for c in costs.columns], \n", + " index=costs.index\n", + " )\n", + " \n", + " else:\n", + " raise NotImplementedError(\"Unknown split\")\n", + " \n", + " df = df.merge(right=cost_scaled, left_index=True, right_index=True)\n", + " \n", + " return df, scaler\n", + "\n", + "\n", + "def get_database(df: pd.DataFrame, split: SPLIT):\n", + " return db.Database(split.name + '_db', df)\n", + "\n", + "\n", + "def get_variables():\n", + " USER_ID = Variable('user_id')\n", + "\n", + " # Availability.\n", + " AV_P_MICRO = Variable('av_p_micro')\n", + " AV_NO_TRIP = Variable('av_no_trip')\n", + " AV_S_CAR = Variable('av_s_car')\n", + " AV_TRANSIT = Variable('av_transit')\n", + " AV_CAR = Variable('av_car')\n", + " AV_S_MICRO = Variable('av_s_micro')\n", + " AV_RIDEHAIL = Variable('av_ridehail')\n", + " AV_WALK = Variable('av_walk')\n", + " AV_UNKNOWN = Variable('av_unknown')\n", + "\n", + " # Time.\n", + " TT_P_MICRO = Variable('tt_p_micro')\n", + " TT_NO_TRIP = Variable('tt_no_trip')\n", + " TT_S_CAR = Variable('tt_s_car')\n", + " TT_TRANSIT = Variable('tt_transit')\n", + " TT_CAR = Variable('tt_car')\n", + " TT_S_MICRO = Variable('tt_s_micro')\n", + " TT_RIDEHAIL = Variable('tt_ridehail')\n", + " TT_WALK = Variable('tt_walk')\n", + " TT_UNKNOWN = Variable('tt_unknown')\n", + "\n", + " # Cost.\n", + " CO_P_MICRO = Variable('scaled_cost_p_micro')\n", + " CO_NO_TRIP = Variable('scaled_cost_no_trip')\n", + " CO_S_CAR = Variable('scaled_cost_s_car')\n", + " CO_TRANSIT = Variable('scaled_cost_transit')\n", + " CO_CAR = Variable('scaled_cost_car')\n", + " CO_S_MICRO = Variable('scaled_cost_s_micro')\n", + " CO_RIDEHAIL = Variable('scaled_cost_ridehail')\n", + " CO_WALK = Variable('scaled_cost_walk')\n", + " CO_UNKNOWN = Variable('scaled_cost_unknown')\n", + "\n", + " # Choice.\n", + " 
CHOICE = Variable('chosen')\n", + " \n", + " # return the filtered locals() dictionary.\n", + " return {k:v for k,v in locals().items() if not k.startswith('_')}\n", + "\n", + "\n", + "def exclude_from_db(v_dict: dict, db: db.Database):\n", + " EXCLUDE = (v_dict['CHOICE'] == 2) + (v_dict['CHOICE'] == 9) > 0\n", + " db.remove(EXCLUDE)\n", + "\n", + "\n", + "def get_params():\n", + " B_TIME = Beta('B_TIME', 0, None, 0, 0)\n", + " B_COST = Beta('B_COST', 0, None, None, 0)\n", + "\n", + " # Alternative-Specific Constants.\n", + " ASC_P_MICRO = Beta('ASC_P_MICRO', 0, None, None, 0)\n", + " ASC_NO_TRIP = Beta('ASC_NO_TRIP', 0, None, None, 0)\n", + " ASC_S_CAR = Beta('ASC_S_CAR', 0, None, None, 0)\n", + " ASC_TRANSIT = Beta('ASC_TRANSIT', 0, None, None, 0)\n", + " ASC_CAR = Beta('ASC_CAR', 0, None, None, 0)\n", + " ASC_S_MICRO = Beta('ASC_S_MICRO', 0, None, None, 0)\n", + " ASC_RIDEHAIL = Beta('ASC_RIDEHAIL', 0, None, None, 0)\n", + " ASC_WALK = Beta('ASC_WALK', 0, None, None, 0)\n", + " ASC_UNKNOWN = Beta('ASC_UNKNOWN', 0, None, None, 0)\n", + " \n", + " # Return filtered locals dict.\n", + " return {k:v for k,v in locals().items() if not k.startswith('_')}\n", + "\n", + "\n", + "def get_utility_functions(v: dict):\n", + " V_P_MICRO = (\n", + " v['ASC_P_MICRO'] +\n", + " v['B_TIME'] * v['TT_P_MICRO']\n", + " + v['B_COST'] * v['CO_P_MICRO']\n", + " )\n", + "\n", + " V_NO_TRIP = (\n", + " v['ASC_NO_TRIP'] +\n", + " v['B_TIME'] * v['TT_NO_TRIP'] +\n", + " v['B_COST'] * v['CO_NO_TRIP']\n", + " )\n", + "\n", + " V_S_CAR = (\n", + " v['ASC_S_CAR'] +\n", + " v['B_TIME'] * v['TT_S_CAR'] +\n", + " v['B_COST'] * v['CO_S_CAR']\n", + " )\n", + "\n", + " V_TRANSIT = (\n", + " v['ASC_TRANSIT'] +\n", + " v['B_TIME'] * v['TT_TRANSIT'] +\n", + " v['B_COST'] * v['CO_TRANSIT']\n", + " )\n", + "\n", + " V_CAR = (\n", + " v['ASC_CAR'] +\n", + " v['B_TIME'] * v['TT_CAR'] +\n", + " v['B_COST'] * v['CO_CAR']\n", + " )\n", + "\n", + " V_S_MICRO = (\n", + " v['ASC_S_MICRO'] +\n", + " v['B_TIME'] 
* v['TT_S_MICRO'] +\n", + " v['B_COST'] * v['CO_S_MICRO']\n", + " )\n", + "\n", + " V_RIDEHAIL = (\n", + " v['ASC_RIDEHAIL'] +\n", + " v['B_TIME'] * v['TT_RIDEHAIL'] +\n", + " v['B_COST'] * v['CO_RIDEHAIL']\n", + " )\n", + "\n", + " V_WALK = (\n", + " v['ASC_WALK'] +\n", + " v['B_TIME'] * v['TT_WALK']\n", + " + v['B_COST'] * v['CO_WALK']\n", + " )\n", + "\n", + " V_UNKNOWN = (\n", + " v['ASC_UNKNOWN'] +\n", + " v['B_TIME'] * v['TT_UNKNOWN'] +\n", + " v['B_COST'] * v['CO_UNKNOWN']\n", + " )\n", + " \n", + " # Remember to exclude the input argument.\n", + " return {k:v for k,v in locals().items() if not k.startswith('_') and k != 'v'}\n", + "\n", + "\n", + "def get_utility_mapping(var: dict):\n", + " # Map alterative to utility functions.\n", + " return {\n", + " 1: var['V_P_MICRO'], \n", + " 2: var['V_NO_TRIP'],\n", + " 3: var['V_S_CAR'], \n", + " 4: var['V_TRANSIT'],\n", + " 5: var['V_CAR'], \n", + " 6: var['V_S_MICRO'],\n", + " 7: var['V_RIDEHAIL'], \n", + " 8: var['V_WALK'], \n", + " 9: var['V_UNKNOWN']\n", + " }\n", + "\n", + "\n", + "def get_availability_mapping(var: dict):\n", + " return {\n", + " 1: var['AV_P_MICRO'],\n", + " 2: var['AV_NO_TRIP'],\n", + " 3: var['AV_S_CAR'],\n", + " 4: var['AV_TRANSIT'],\n", + " 5: var['AV_CAR'],\n", + " 6: var['AV_S_MICRO'],\n", + " 7: var['AV_RIDEHAIL'],\n", + " 8: var['AV_WALK'],\n", + " 9: var['AV_UNKNOWN']\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# First, drop columns.\n", + "\n", + "train_data = drop_columns(train_data)\n", + "\n", + "# Next, scale time.\n", + "train_data = scale_time(train_data)\n", + "\n", + "# Scale cost.\n", + "train_data, scaler = scale_cost(train_data, SPLIT.TRAIN, None)\n", + "\n", + "# get dbs.\n", + "train_db = get_database(train_data, SPLIT.TRAIN)\n", + "\n", + "# get vars.\n", + "train_vars = get_variables()\n", + "\n", + "# exclude wrong points.\n", + "exclude_from_db(train_vars, train_db)\n", + "\n", + 
"train_params = get_params()\n", + "train_vars.update(train_params)\n", + "\n", + "train_V = get_utility_functions(train_vars)\n", + "train_vars.update(train_V)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "V = get_utility_mapping(train_vars)\n", + "av = get_availability_mapping(train_vars)\n", + "train_logprob = models.loglogit(V, av, train_vars['CHOICE'])\n", + "\n", + "model = bio.BIOGEME(train_db, train_logprob)\n", + "model.modelName = 'splitChoiceModel'" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "train_results = model.estimate()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Results for model splitChoiceModel\n", + "Nbr of parameters:\t\t11\n", + "Sample size:\t\t\t129291\n", + "Excluded data:\t\t\t2133\n", + "Final log likelihood:\t\t-0.07647159\n", + "Akaike Information Criterion:\t22.15294\n", + "Bayesian Information Criterion:\t129.621\n", + "\n" + ] + } + ], + "source": [ + "print(train_results.short_summary())" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ValueRob. Std errRob. t-testRob. p-value
ASC_CAR88.7937272.832838e-023.134444e+030.0
ASC_NO_TRIP-623.2883331.797693e+308-3.467156e-3061.0
ASC_P_MICRO83.2864685.705798e-021.459681e+030.0
ASC_RIDEHAIL88.7602042.370587e-023.744230e+030.0
ASC_S_CAR88.8067072.278816e-023.897055e+030.0
ASC_S_MICRO85.1029973.619103e-022.351494e+030.0
ASC_TRANSIT85.5809325.083868e-021.683382e+030.0
ASC_UNKNOWN0.0000001.797693e+3080.000000e+001.0
ASC_WALK102.9572971.408207e-017.311235e+020.0
B_COST-2879.4061372.303798e+01-1.249852e+020.0
B_TIME-107.1993082.970404e-01-3.608913e+020.0
\n", + "
" + ], + "text/plain": [ + " Value Rob. Std err Rob. t-test Rob. p-value\n", + "ASC_CAR 88.793727 2.832838e-02 3.134444e+03 0.0\n", + "ASC_NO_TRIP -623.288333 1.797693e+308 -3.467156e-306 1.0\n", + "ASC_P_MICRO 83.286468 5.705798e-02 1.459681e+03 0.0\n", + "ASC_RIDEHAIL 88.760204 2.370587e-02 3.744230e+03 0.0\n", + "ASC_S_CAR 88.806707 2.278816e-02 3.897055e+03 0.0\n", + "ASC_S_MICRO 85.102997 3.619103e-02 2.351494e+03 0.0\n", + "ASC_TRANSIT 85.580932 5.083868e-02 1.683382e+03 0.0\n", + "ASC_UNKNOWN 0.000000 1.797693e+308 0.000000e+00 1.0\n", + "ASC_WALK 102.957297 1.408207e-01 7.311235e+02 0.0\n", + "B_COST -2879.406137 2.303798e+01 -1.249852e+02 0.0\n", + "B_TIME -107.199308 2.970404e-01 -3.608913e+02 0.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(train_results.getEstimatedParameters())" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def get_utility_df(results, data):\n", + "\n", + " def compute_utilities(betas, row: pd.Series):\n", + " data = row.to_dict()\n", + "\n", + " utility_p_micro = betas['ASC_P_MICRO'] + (betas['B_TIME'] * data['tt_p_micro'])\n", + " utility_no_trip = betas['ASC_NO_TRIP'] + (betas['B_TIME'] * data['tt_no_trip']) + (betas['B_COST'] * data['scaled_cost_no_trip'])\n", + " utility_s_car = betas['ASC_S_CAR'] + (betas['B_COST'] * data['scaled_cost_s_car']) + (betas['B_TIME'] * data['tt_s_car'])\n", + " utility_transit = betas['ASC_TRANSIT'] + (betas['B_COST'] * data['scaled_cost_transit']) + (betas['B_TIME'] * data['tt_transit'])\n", + " utility_car = betas['ASC_CAR'] + (betas['B_COST'] * data['scaled_cost_car'] + (betas['B_TIME'] * data['tt_car']))\n", + " utility_s_micro = betas['ASC_S_MICRO'] + (betas['B_COST'] * data['scaled_cost_s_micro']) + (betas['B_TIME'] * data['tt_s_micro'])\n", + " utility_ridehail = betas['ASC_RIDEHAIL'] + (betas['B_COST'] * data['scaled_cost_ridehail']) + (betas['B_TIME'] * data['tt_ridehail'])\n", 
+ " utility_walk = betas['ASC_WALK'] + (betas['B_TIME'] * data['tt_walk'])\n", + " utility_unknown = betas['ASC_UNKNOWN'] + (betas['B_TIME'] * data['tt_unknown']) + (betas['B_COST'] * data['scaled_cost_unknown'])\n", + "\n", + " return {\n", + " 'utility_p_micro': utility_p_micro, 'utility_no_trip': utility_no_trip,\n", + " 'utility_s_car': utility_s_car, 'utility_transit': utility_transit,\n", + " 'utility_car': utility_car, 'utility_s_micro': utility_s_micro,\n", + " 'utility_ridehail': utility_ridehail, 'utility_walk': utility_walk, \n", + " 'utility_unknown': utility_unknown, \n", + " }\n", + " \n", + " betas = results.getBetaValues()\n", + "\n", + " u_data = data.apply(lambda x: compute_utilities(betas, x), axis=1).tolist()\n", + " return pd.DataFrame(u_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "test_data = drop_columns(test_data)\n", + "\n", + "# Next, scale time.\n", + "test_data = scale_time(test_data)\n", + "\n", + "# Scale cost.\n", + "test_data, _ = scale_cost(test_data, SPLIT.TEST, scaler)\n", + "\n", + "# get dbs.\n", + "test_db = get_database(test_data, SPLIT.TEST)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "test_utilities = get_utility_df(train_results, test_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
utility_p_microutility_no_triputility_s_carutility_transitutility_carutility_s_microutility_ridehailutility_walkutility_unknown
067.411217-623.28833363.82887564.73419163.81589564.95953982.76410944.675225-21.398065
1-91.860937-623.288333-71.415959-97.629531-71.428939-188.098667-48.301289-669.366009-119.831550
258.227962-623.28833375.25369255.37268456.01799850.36880355.9844763.505302-27.073506
348.651631-623.28833347.89930445.61047047.88632335.15353067.326805-39.426846-32.991877
471.481893-623.28833367.28546168.88388267.27248171.42719886.11387862.924686-18.882302
\n", + "
" + ], + "text/plain": [ + " utility_p_micro utility_no_trip utility_s_car utility_transit \\\n", + "0 67.411217 -623.288333 63.828875 64.734191 \n", + "1 -91.860937 -623.288333 -71.415959 -97.629531 \n", + "2 58.227962 -623.288333 75.253692 55.372684 \n", + "3 48.651631 -623.288333 47.899304 45.610470 \n", + "4 71.481893 -623.288333 67.285461 68.883882 \n", + "\n", + " utility_car utility_s_micro utility_ridehail utility_walk \\\n", + "0 63.815895 64.959539 82.764109 44.675225 \n", + "1 -71.428939 -188.098667 -48.301289 -669.366009 \n", + "2 56.017998 50.368803 55.984476 3.505302 \n", + "3 47.886323 35.153530 67.326805 -39.426846 \n", + "4 67.272481 71.427198 86.113878 62.924686 \n", + "\n", + " utility_unknown \n", + "0 -21.398065 \n", + "1 -119.831550 \n", + "2 -27.073506 \n", + "3 -32.991877 \n", + "4 -18.882302 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(test_utilities.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
utility_p_microutility_no_triputility_s_carutility_transitutility_carutility_s_microutility_ridehailutility_walkutility_unknown
02.149431e-072.319104e-3075.977672e-091.478107e-085.900582e-091.851711e-089.999997e-012.872153e-175.793521e-46
11.208607e-191.933302e-2509.150116e-113.775868e-229.032113e-111.935398e-611.000000e+001.883732e-2708.606027e-32
24.034777e-084.236948e-3049.999999e-012.321603e-094.426337e-091.558225e-114.280415e-096.919424e-323.629633e-45
37.753087e-091.173968e-3003.653787e-093.704379e-103.606667e-091.064937e-141.000000e+004.339885e-472.704892e-44
44.419870e-078.138306e-3096.651541e-093.289330e-086.565760e-094.184619e-079.999991e-018.493006e-112.516158e-46
\n", + "
" + ], + "text/plain": [ + " utility_p_micro utility_no_trip utility_s_car utility_transit \\\n", + "0 2.149431e-07 2.319104e-307 5.977672e-09 1.478107e-08 \n", + "1 1.208607e-19 1.933302e-250 9.150116e-11 3.775868e-22 \n", + "2 4.034777e-08 4.236948e-304 9.999999e-01 2.321603e-09 \n", + "3 7.753087e-09 1.173968e-300 3.653787e-09 3.704379e-10 \n", + "4 4.419870e-07 8.138306e-309 6.651541e-09 3.289330e-08 \n", + "\n", + " utility_car utility_s_micro utility_ridehail utility_walk \\\n", + "0 5.900582e-09 1.851711e-08 9.999997e-01 2.872153e-17 \n", + "1 9.032113e-11 1.935398e-61 1.000000e+00 1.883732e-270 \n", + "2 4.426337e-09 1.558225e-11 4.280415e-09 6.919424e-32 \n", + "3 3.606667e-09 1.064937e-14 1.000000e+00 4.339885e-47 \n", + "4 6.565760e-09 4.184619e-07 9.999991e-01 8.493006e-11 \n", + "\n", + " utility_unknown \n", + "0 5.793521e-46 \n", + "1 8.606027e-32 \n", + "2 3.629633e-45 \n", + "3 2.704892e-44 \n", + "4 2.516158e-46 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "u_np = test_utilities.values\n", + "choice_df = np.exp(u_np)/np.sum(np.exp(u_np), axis=1, keepdims=True)\n", + "\n", + "choice_df = pd.DataFrame(choice_df, columns=test_utilities.columns)\n", + "display(choice_df.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1 2 3 4 5 6 7 8 9]\n" + ] + } + ], + "source": [ + "from sklearn.metrics import f1_score\n", + "\n", + "y_pred = np.argmax(choice_df.values, axis=1) + 1\n", + "\n", + "print(np.unique(y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9759923080654546\n" + ] + } + ], + "source": [ + "y_true = test_data.chosen\n", + "score = f1_score(y_true, y_pred, average='weighted')\n", + "\n", + "print(score)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + 
"metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 5))\n", + "\n", + "sns.histplot(y_pred, ax=ax[0])\n", + "sns.histplot(y_true, ax=ax[1])\n", + "\n", + "labels = [\n", + " 'p_micro', \n", + " 'no_trip',\n", + " 's_car', \n", + " 'transit',\n", + " 'car', \n", + " 's_micro',\n", + " 'ridehail', \n", + " 'walk', \n", + " 'unknown'\n", + "]\n", + "\n", + "ax[0].set(\n", + " title='predicted label distribution',\n", + " xlabel='Labels',\n", + " xticks=range(1, 10),\n", + " xticklabels=labels\n", + ")\n", + "\n", + "ax[1].set(\n", + " title='true label distribution',\n", + " xlabel='Labels',\n", + " xticks=range(1, 10),\n", + " xticklabels=labels\n", + ")\n", + "\n", + "ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation=45)\n", + "ax[1].set_xticklabels(ax[0].get_xticklabels(), rotation=45)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "ab0c6e94c9422d07d42069ec9e3bb23090f5e156fc0e23cc25ca45a62375bf53" + }, + "kernelspec": { + "display_name": "emission", + "language": "python", + "name": "emission" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/replacement_mode_modeling/experimental_notebooks/optimal_interuser_splits.ipynb b/replacement_mode_modeling/experimental_notebooks/optimal_interuser_splits.ipynb new file mode 100644 index 0000000..4782f9e --- /dev/null +++ b/replacement_mode_modeling/experimental_notebooks/optimal_interuser_splits.ipynb @@ -0,0 +1,617 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + 
"metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('../data/filtered_data/preprocessed_data_openpath_prod_uprm_nicr.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1001, 52)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(df.section_mode_argmax.value_counts() < 2).any()" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import random\n", + "from scipy.special import kl_div\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/4x/l9lw50rn7qvf79m01f21x70mlpd6gh/T/ipykernel_85321/3793645385.py:1: DtypeWarning: Columns (38) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + " data = pd.read_csv('../data/ReplacedMode_Fix_02142024.csv')\n" + ] + } + ], + "source": [ + "data = pd.read_csv('../data/ReplacedMode_Fix_02142024.csv')\n", + "data.drop_duplicates(inplace=True)\n", + "\n", + "# data.sample(data.shape[0], random_state=SEED).reset_index(drop=True, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "ideal_tr, ideal_te = train_test_split(data, test_size=0.2, stratify=data.target, shuffle=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ideal KL: 3.704099704742548e-08\n" + ] + } + ], + "source": [ + "print(f\"Ideal KL: {kl_div(ideal_tr.target.value_counts(normalize=True), ideal_te.target.value_counts(normalize=True)).mean()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0025" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "2.5e-3" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def get_optimal_interuser_splits(data: pd.DataFrame, threshold=2.5e-3, maxiters=5000):\n", + " \n", + " ids = data.user_id.unique().tolist()\n", + "\n", + " best_kl = np.inf\n", + " ix = 0\n", + " best_train_ids = None\n", + "\n", + " try:\n", + " while True:\n", + "\n", + " if ix == maxiters:\n", + " break\n", + "\n", + " train_id, test_id = train_test_split(ids, test_size=0.2, shuffle=True)\n", + " train = data.loc[data.user_id.isin(train_id), :]\n", + " test = data.loc[data.user_id.isin(test_id), :]\n", + "\n", + " kl1 = kl_div(\n", + " train.section_mode_argmax.value_counts(normalize=True), \n", + " test.section_mode_argmax.value_counts(normalize=True)\n", + " ).mean()\n", + " \n", + " kl2 = 
kl_div(\n", + " train.target.value_counts(normalize=True), \n", + " test.target.value_counts(normalize=True)\n", + " ).mean()\n", + " \n", + " kl = kl1 + kl2 \n", + " \n", + " if kl < best_kl:\n", + " best_kl = kl\n", + " # No need to save test because test will be a complement of train.\n", + " best_train_ids = train_id\n", + " print(f'\\t\\t-> Best KL: {best_kl}')\n", + "\n", + " ix += 1\n", + "\n", + " if kl < threshold:\n", + " break\n", + "\n", + " except KeyboardInterrupt:\n", + " print(\"Stopped iterations. Best KL till now: \", best_kl)\n", + " \n", + " finally:\n", + " return best_train_ids" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\t\t-> Best KL: 0.019654163171699848\n", + "\t\t-> Best KL: 0.00698817617574597\n", + "\t\t-> Best KL: 0.005533063761614154\n", + "\t\t-> Best KL: 0.003655132674484631\n", + "\t\t-> Best KL: 0.002547459671179468\n", + "\t\t-> Best KL: 0.0022263571393444375\n" + ] + } + ], + "source": [ + "best_train = get_optimal_interuser_splits(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "test_data = data.loc[data.user_id.isin(best_train), :]\n", + "train_data = data.loc[~data.user_id.isin(best_train), :]\n", + "\n", + "fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(9, 6))\n", + "sns.histplot(train_data.section_mode_argmax, ax=ax[0][0], discrete=True).set_yscale('log')\n", + "sns.histplot(test_data.section_mode_argmax, ax=ax[0][1], discrete=True).set_yscale('log')\n", + "sns.histplot(train_data.target, ax=ax[1][0], discrete=True).set_yscale('log')\n", + "sns.histplot(test_data.target, ax=ax[1][1], discrete=True).set_yscale('log')\n", + "fig.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['7559c3f880f341e898a402eba96a855d', '1635c003b1f94a399ebebe21640ffced', '6656c04c6cba4c189fed805eaa529741', '4baf8c8af7b7445e9067854065e3e612', '42b3ee0bc02a481ab1a94644a8cd7a0d', 'f3a33641ffb6478f901350c55b6385f8', '14103cda12c94642974129989d39e50d', 'd7a732f4a8644bcbb8dedfc8be242fb2', '509b909390934e988eb120b58ed9bd8c', '3701bb586bf24d0caee8dd1d1421bb15', '802667b6371f45b29c7abb051244836a', 'd3dff742d07942ca805c2f72e49e12c5', 'feb6a3a8a2ef4f4a8754bd79f7154495', 'feb1d940cd3647d1a101580c2a3b3f8c', '90480ac60a3d475a88fbdab0a003dd5d', '3d981e617b304afab0f21ce8aa6c9786', 'c6e4db31c18b4355b02a7dd97deca70b', '8fdc9b926a674a9ea07d91df2c5e06f2', 'b41dd7d7c6d94fe6afe2fd26fa4ac0bd', 'e049a7b2a6cb44259f907abbb44c5abc', '3f7f2e536ba9481e92f8379b796ad1d0', '41c1182a404540a3820dff7de1c3d0e7', 'add706b73839413da13344c355dde0bb', 'fc51d1258e4649ecbfb0e6ecdaeca454', 'f446bf3102ff4bd99ea1c98f7d2f7af0', '840297ae39484e26bfebe83ee30c5b3e', 'dc1ed4d71e3645d0993885398d5628ca', 'ece8b0a509534e98a0d369f25de4a206', '43932257834649c29c5b9ccdc2416ebb', 'baa4ff1573ae411183e10aeb17c71c53', '53e78583db87421f8decb529ba859ca4', '8461560f8b4a4ca6af2cb569962dae32', 
'feabfccddd6c4e8e85179d7177042483', '7abe572148864412a33979592fa985fb', 'b346b83b9f7c4536b809d5f92074fdae', '0eb313ab00e6469da78cc2d2e94660fb', '8d0bfee173d9428bae97f609e50d5570', 'eec6936e1ac347ef9365881845ec74df', '8b0876430c2641bcaea954ea00520e64', 'e4cfb2a8f600426897569985e234636e', '8c7d261fe8284a42a777ffa6f380ba3b', '234f4f2366244fe682dccded2fa7cc4e', 'cde34edb8e3a4278a18e0adb062999e5', 'a0c3a7b410b24e18995f63369a31d123', '8a8332a53a1b4cdd9f3680434e91a6ef', 'a1954793b1454b2f8cf95917d7547169', 'd3735ba212dd4c768e1675dca7bdcb6f', '4b89612d7f1f4b368635c2bc48bd7993', 'fc68a5bb0a7b4b6386b3f08a69ead36f', 'd68a36934a2649278fb6d084e1d992de', '313d003df34b4bd9823b3474fc93f9f9', '85ddd3c34c58407392953c47a32f5428', '8c3c63abb3ec4fc3a61e7bf316ee4efd', '92bde0d0662f45ac864629f486cffe77', '39db1e03b46c43129aa8dbe3bbe16687', '0b3e78fa91d84aa6a3203440143c8c16', '14fe8002bbdc4f97acbd1a00de241bf6', '1b7d6dfea8464bcab9321018b10ec9c9', '2455a5992b174239a1c926a7de96d623', 'f0db3b1999c2410ba5933103eca9212f', '93c6e0f156a44e07b920ded664419dc6', '5ad862e79a6341f69f28c0096fe884da', '703a9cee8315441faff7eb63f2bfa93f', '405b221abe9e43bc86a57ca7fccf2227', '3f067105255e4b0ca1bab377fee7ef16', '44c10f66dec244d6b8644231d4a8fecb', 'a231added8674bef95092b32bc254ac8', 'e66e63b206784a559d977d4cb5f1ec34', '9910245fee4e4ccaab4cdd2312eb0d5d', '3dddfb70e7cd40f18a63478654182e9a', '91f3ca1c278247f79a806e49e9cc236f', '6373dfb8cb9b47e88e8f76adcfadde20', '26767f9f3da54e93b692f8be6acdac43', '3b25446778824941a4c70ae5774f4c68', 'bd9cffc8dbf1402da479f9f148ec9e60', '6d96909e5ca442ccb5679d9cdf3c8f5b', '15eb78dd6e104966ba6112589c29dc41', '5a93c47d6bf34a77a2f8267ef6898943', 'e88a8f520dde445484c0a9395e1a0599', '487e20ab774742378198f94f5b5b0b43', 'c7ce889c796f4e2a8859fa2d7d5068fe', 'e35e65107a34496db49fa5a0b41a1e9e', '6a0f3653b80a4c949e127d6504debb55', 'd5137ebd4f034dc193d216128bb7fc9a', '2fc212b9508e4dc7b5a20bc79e2e9e31', 'b2bbe715b6a14fd19f751cae8adf6b4e', '8a98e383a2d143e798fc23869694934a', 
'2f3b66a5f98546d4b7691fba57fa640f', '0154d71439284c34b865e5a417cd48af', '88041eddad7542ea8c92b30e5c64e198', '0d0ae3a556414d138c52a6040a203d24', '903742c353ce42c3ad9ab039fc418816', '1b9883393ab344a69bc1a0fab192a94c', '2cd5668ac9054e2eb2c88bb4ed94bc6d', '97953af1b97d4e268c52e1e54dcf421a', 'c9a686318e1448cc81c715fd7e0a5811', 'e06cf95717f448ecb81c440b1b2fe1ab', 'a60a64d82d1c439a901b683b73a74d73', '60e6a6f6ed2e4e838f2bbed6a427028d', '112ab4cb44b84e73815378b997575362', 'e192b8a00b6c422296851c93785deaf7', 'bf774cbe6c3040b0a022278d36a23f19', '531732fee3c24366a286d76eb534aebc', 'd929e7f8b7624d76bdb0ec9ada6cc650', 'f50537eb104e4213908f1862c8160a3e', 'c11da556596342e79a2c62d3b116ea42', '47b5d57bd4354276bb6d2dcd1438901d', 'ac604b44fdca482fb753034cb55d1351', '742fbefae7d745a9bdf644659d21e0fa', 'fc8f71a38c82458dbf9718c3ee11a0f3', 'f2799dc202bc4249b42a4fda8770d1b6', '2931e0a34319495bbb5898201a54feb5', '00db212bc8d044cd839241ab4065e603', '810be63d084746e3b7da9d943dd88e8c', '737ef8494f26407b8b2a6b1b1dc631a4', 'cba570ae38f341faa6257342727377b7', '7381a74ba4f34f40b332ebace7ee9527', '102ff4f7be044cf2bdef164ae3a78262', '950f4287bab5444aa0527cc23fb082b2', 'fbff5e08b7f24a94ab4b2d7371999ef7', '487ad897ba93404a8cbe5de7d1922691', 'b1aed24c863949bfbfa3a844ecf60593', 'd200a61757d84b1dab8fbac35ff52c28', '355e25bdfc244c5e85d358e39432bd44', '19a4c2cf718d40588eb96ac25a566353', 'f289f7001bd94db0b33a7d2e1cd28b19', '4e8b1b7f026c4384827f157225da13fa', '3aeb5494088542fdaf798532951aebb0', 'dfe5ca1bb0854b67a6ffccad9565d669', '15aa4ba144a34b8b8079ed7e049d84df', '8a0473cae53d4720a99c0696cc1fb407', '30e9b141d7894fbfaacecd2fa18929f9']\n" + ] + } + ], + "source": [ + "print(best_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "counts = data.groupby('user_id').size()\n", + "filtered = counts[counts >= 5]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble 
import RandomForestClassifier\n", + "\n", + "params = {\n", + " 'ccp_alpha': 0.0031503743500287396,\n", + " 'max_depth': int(5.879792418246912 * 10), \n", + " 'max_features': 0.16332372250446126, \n", + " 'min_samples_leaf': int(1.7742589153489061 * 10), \n", + " 'min_samples_split': int(2.391021401374942 * 10), \n", + " 'n_estimators': int(100 * 0.5038646539940661)\n", + "}\n", + "\n", + "clf = RandomForestClassifier(**params)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['source',\n", + " 'end_ts',\n", + " 'end_fmt_time',\n", + " 'end_loc',\n", + " 'raw_trip',\n", + " 'start_ts',\n", + " 'start_fmt_time',\n", + " 'start_loc',\n", + " 'duration',\n", + " 'distance',\n", + " 'start_place',\n", + " 'end_place',\n", + " 'cleaned_trip',\n", + " 'inferred_labels',\n", + " 'inferred_trip',\n", + " 'expectation',\n", + " 'confidence_threshold',\n", + " 'expected_trip',\n", + " 'user_input',\n", + " 'start:year',\n", + " 'start:month',\n", + " 'start:day',\n", + " 'start:hour',\n", + " 'start_local_dt_minute',\n", + " 'start_local_dt_second',\n", + " 'start_local_dt_weekday',\n", + " 'start_local_dt_timezone',\n", + " 'end:year',\n", + " 'end:month',\n", + " 'end:day',\n", + " 'end:hour',\n", + " 'end_local_dt_minute',\n", + " 'end_local_dt_second',\n", + " 'end_local_dt_weekday',\n", + " 'end_local_dt_timezone',\n", + " '_id',\n", + " 'user_id',\n", + " 'metadata_write_ts',\n", + " 'additions',\n", + " 'mode_confirm',\n", + " 'purpose_confirm',\n", + " 'distance_miles',\n", + " 'Mode_confirm',\n", + " 'Trip_purpose',\n", + " 'original_user_id',\n", + " 'program',\n", + " 'opcode',\n", + " 'Timestamp',\n", + " 'birth_year',\n", + " 'primary_job_commute_time',\n", + " 'income_category',\n", + " 'n_residence_members',\n", + " 'n_residents_u18',\n", + " 'n_residents_with_license',\n", + " 'n_motor_vehicles',\n", + " 'available_modes',\n", + " 'age',\n", + " 'gender_Man',\n", + " 
'gender_Man;Nonbinary/genderqueer/genderfluid',\n", + " 'gender_Nonbinary/genderqueer/genderfluid',\n", + " 'gender_Prefer not to say',\n", + " 'gender_Woman',\n", + " 'gender_Woman;Nonbinary/genderqueer/genderfluid',\n", + " 'has_drivers_license_No',\n", + " 'has_drivers_license_Prefer not to say',\n", + " 'has_drivers_license_Yes',\n", + " 'has_multiple_jobs_No',\n", + " 'has_multiple_jobs_Prefer not to say',\n", + " 'has_multiple_jobs_Yes',\n", + " \"highest_education_Bachelor's degree\",\n", + " 'highest_education_Graduate degree or professional degree',\n", + " 'highest_education_High school graduate or GED',\n", + " 'highest_education_Less than a high school graduate',\n", + " 'highest_education_Prefer not to say',\n", + " 'highest_education_Some college or associates degree',\n", + " 'primary_job_type_Full-time',\n", + " 'primary_job_type_Part-time',\n", + " 'primary_job_type_Prefer not to say',\n", + " 'primary_job_description_Clerical or administrative support',\n", + " 'primary_job_description_Custodial',\n", + " 'primary_job_description_Education',\n", + " 'primary_job_description_Food service',\n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming',\n", + " 'primary_job_description_Medical/healthcare',\n", + " 'primary_job_description_Other',\n", + " 'primary_job_description_Professional, managerial, or technical',\n", + " 'primary_job_description_Sales or service',\n", + " 'primary_job_commute_mode_Active transport',\n", + " 'primary_job_commute_mode_Car transport',\n", + " 'primary_job_commute_mode_Hybrid',\n", + " 'primary_job_commute_mode_Public transport',\n", + " 'primary_job_commute_mode_Unknown',\n", + " 'primary_job_commute_mode_WFH',\n", + " 'is_overnight_trip',\n", + " 'n_working_residents',\n", + " 'start_lat',\n", + " 'start_lng',\n", + " 'end_lat',\n", + " 'end_lng',\n", + " 'temperature_2m (°F)',\n", + " 'relative_humidity_2m (%)',\n", + " 'dew_point_2m (°F)',\n", + " 'rain (inch)',\n", + " 'snowfall 
(inch)',\n", + " 'wind_speed_10m (mp/h)',\n", + " 'wind_gusts_10m (mp/h)',\n", + " 'section_distance_argmax',\n", + " 'section_duration_argmax',\n", + " 'section_mode_argmax',\n", + " 'section_coordinates_argmax',\n", + " 'mph',\n", + " 'target',\n", + " 'av_s_micro',\n", + " 'av_ridehail',\n", + " 'av_unknown',\n", + " 'av_car',\n", + " 'av_transit',\n", + " 'av_walk',\n", + " 'av_s_car',\n", + " 'av_no_trip',\n", + " 'av_p_micro',\n", + " 'cost_p_micro',\n", + " 'cost_no_trip',\n", + " 'cost_s_car',\n", + " 'cost_transit',\n", + " 'cost_car',\n", + " 'cost_s_micro',\n", + " 'cost_ridehail',\n", + " 'cost_walk',\n", + " 'cost_unknown']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.drop(columns=[\n", + " 'source',\n", + " 'end_ts',\n", + " 'end_fmt_time',\n", + " 'end_loc',\n", + " 'raw_trip',\n", + " 'start_ts',\n", + " 'start_fmt_time',\n", + " 'start_loc',\n", + " 'duration',\n", + " 'distance',\n", + " 'start_place',\n", + " 'end_place',\n", + " 'cleaned_trip',\n", + " 'inferred_labels',\n", + " 'inferred_trip',\n", + " 'expectation',\n", + " 'confidence_threshold',\n", + " 'expected_trip',\n", + " 'user_input',\n", + " 'start:year',\n", + " 'start:month',\n", + " 'start:day',\n", + " 'start:hour',\n", + " 'start_local_dt_minute',\n", + " 'start_local_dt_second',\n", + " 'start_local_dt_weekday',\n", + " 'start_local_dt_timezone',\n", + " 'end:year',\n", + " 'end:month',\n", + " 'end:day',\n", + " 'end:hour',\n", + " 'end_local_dt_minute',\n", + " 'end_local_dt_second',\n", + " 'end_local_dt_weekday',\n", + " 'end_local_dt_timezone',\n", + " '_id',\n", + " 'user_id',\n", + " 'metadata_write_ts',\n", + " 'additions',\n", + " 'mode_confirm',\n", + " 'purpose_confirm',\n", + " 'distance_miles',\n", + " 'Mode_confirm',\n", + " 'Trip_purpose',\n", + " 'original_user_id',\n", + " 'program',\n", + " 'opcode',\n", + " 'Timestamp',\n", + " 'birth_year',\n", + " 'primary_job_commute_time',\n", + " 
'income_category',\n", + " 'n_residence_members',\n", + " 'n_residents_u18',\n", + " 'n_residents_with_license',\n", + " 'n_motor_vehicles',\n", + " 'available_modes',\n", + " 'age',\n", + " 'gender_Man',\n", + " 'gender_Man;Nonbinary/genderqueer/genderfluid',\n", + " 'gender_Nonbinary/genderqueer/genderfluid',\n", + " 'gender_Prefer not to say',\n", + " 'gender_Woman',\n", + " 'gender_Woman;Nonbinary/genderqueer/genderfluid',\n", + " 'has_drivers_license_No',\n", + " 'has_drivers_license_Prefer not to say',\n", + " 'has_drivers_license_Yes',\n", + " 'has_multiple_jobs_No',\n", + " 'has_multiple_jobs_Prefer not to say',\n", + " 'has_multiple_jobs_Yes',\n", + " \"highest_education_Bachelor's degree\",\n", + " 'highest_education_Graduate degree or professional degree',\n", + " 'highest_education_High school graduate or GED',\n", + " 'highest_education_Less than a high school graduate',\n", + " 'highest_education_Prefer not to say',\n", + " 'highest_education_Some college or associates degree',\n", + " 'primary_job_type_Full-time',\n", + " 'primary_job_type_Part-time',\n", + " 'primary_job_type_Prefer not to say',\n", + " 'primary_job_description_Clerical or administrative support',\n", + " 'primary_job_description_Custodial',\n", + " 'primary_job_description_Education',\n", + " 'primary_job_description_Food service',\n", + " 'primary_job_description_Manufacturing, construction, maintenance, or farming',\n", + " 'primary_job_description_Medical/healthcare',\n", + " 'primary_job_description_Other',\n", + " 'primary_job_description_Professional, managerial, or technical',\n", + " 'primary_job_description_Sales or service',\n", + " 'primary_job_commute_mode_Active transport',\n", + " 'primary_job_commute_mode_Car transport',\n", + " 'primary_job_commute_mode_Hybrid',\n", + " 'primary_job_commute_mode_Public transport',\n", + " 'primary_job_commute_mode_Unknown',\n", + " 'primary_job_commute_mode_WFH',\n", + " 'is_overnight_trip',\n", + " 'n_working_residents',\n", + " 
'start_lat',\n", + " 'start_lng',\n", + " 'end_lat',\n", + " 'end_lng',\n", + " 'temperature_2m (°F)',\n", + " 'relative_humidity_2m (%)',\n", + " 'dew_point_2m (°F)',\n", + " 'rain (inch)',\n", + " 'snowfall (inch)',\n", + " 'wind_speed_10m (mp/h)',\n", + " 'wind_gusts_10m (mp/h)',\n", + " 'section_distance_argmax',\n", + " 'section_duration_argmax',\n", + " 'section_mode_argmax',\n", + " 'section_coordinates_argmax',\n", + " 'mph',\n", + " 'target',\n", + " 'av_s_micro',\n", + " 'av_ridehail',\n", + " 'av_unknown',\n", + " 'av_car',\n", + " 'av_transit',\n", + " 'av_walk',\n", + " 'av_s_car',\n", + " 'av_no_trip',\n", + " 'av_p_micro',\n", + " 'cost_p_micro',\n", + " 'cost_no_trip',\n", + " 'cost_s_car',\n", + " 'cost_transit',\n", + " 'cost_car',\n", + " 'cost_s_micro',\n", + " 'cost_ridehail',\n", + " 'cost_walk',\n", + " 'cost_unknown'\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import StratifiedGroupKFold\n", + "\n", + "cv = StratifiedGroupKFold(n_splits=2)\n", + "\n", + "for tr_ix, te_ix in cv.split(data)\n", + "\n", + "clf.fit()" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "ab0c6e94c9422d07d42069ec9e3bb23090f5e156fc0e23cc25ca45a62375bf53" + }, + "kernelspec": { + "display_name": "emission", + "language": "python", + "name": "emission" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/replacement_mode_modeling/experimental_notebooks/rf_bayesian_optim.py b/replacement_mode_modeling/experimental_notebooks/rf_bayesian_optim.py new file mode 100644 index 0000000..6c911bd --- /dev/null +++ b/replacement_mode_modeling/experimental_notebooks/rf_bayesian_optim.py @@ -0,0 +1,280 
"""Bayesian hyper-parameter search for a random-forest replaced-mode classifier.

The script loads a trip-level CSV, holds out one user-grouped fold as a test
set, tunes a ``RandomForestClassifier`` with ``bayes_opt`` using grouped,
stratified cross-validation, then refits the best configuration on the full
training split and pickles it.
"""
import warnings
warnings.simplefilter(action='ignore', category=Warning)

import os
import pickle
from typing import Optional

import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, log_loss, r2_score
from sklearn.model_selection import StratifiedGroupKFold

SEED = 13210

# Label set used when scoring with log-loss. The cross-validation loss has
# always been computed against exactly this range.
# NOTE(review): presumably one label per replaced mode -- confirm against the
# notebook that encodes ``target``.
TARGET_LABELS = list(range(1, 10))

# Modes for which a travel-time feature ``tt_<mode>`` is derived from the
# matching availability column ``av_<mode>``.
MODES = ['p_micro', 'no_trip', 's_car', 'transit', 'car', 's_micro', 'ridehail', 'walk', 'unknown']


def _build_forest(params: dict) -> RandomForestClassifier:
    """Build a random forest from hyper-parameters expressed in optimizer units.

    The optimizer searches a rescaled space, so the integer parameters are
    multiplied back (x100 for ``n_estimators``, x10 for the others) and
    truncated. ``class_weight`` is a number in [0, 1] that selects between the
    two balanced weighting schemes; when absent, ``'balanced_subsample'`` is
    used.

    Args:
        params: Mapping with keys ``n_estimators``, ``max_depth``,
            ``min_samples_split``, ``min_samples_leaf``, ``max_features``,
            ``ccp_alpha`` and, optionally, ``class_weight``.

    Returns:
        An unfitted classifier seeded with ``SEED``.
    """
    class_weight = 'balanced_subsample' if params.get('class_weight', 0) < 0.5 else 'balanced'

    return RandomForestClassifier(
        n_estimators=int(params['n_estimators'] * 100),
        max_depth=int(params['max_depth'] * 10),
        min_samples_split=int(params['min_samples_split'] * 10),
        min_samples_leaf=int(params['min_samples_leaf'] * 10),
        # Keep the fraction strictly inside (0, 1) so it is never read as a count.
        max_features=max(min(params['max_features'], 0.999), 1e-3),
        ccp_alpha=params['ccp_alpha'],
        bootstrap=True,
        class_weight=class_weight,
        n_jobs=os.cpu_count(),
        random_state=SEED,
    )


def _aligned_proba(model, X) -> np.ndarray:
    """Return ``predict_proba`` output with one column per ``TARGET_LABELS`` entry.

    A forest fitted on a split that lacks some class emits fewer probability
    columns, which ``log_loss`` rejects when given the full label list. Classes
    the model has not seen get probability zero.

    Raises:
        ValueError: If the model was fitted on a label outside ``TARGET_LABELS``.
    """
    proba = model.predict_proba(X)
    column_of = {label: ix for ix, label in enumerate(TARGET_LABELS)}
    aligned = np.zeros((proba.shape[0], len(TARGET_LABELS)))

    for source_ix, label in enumerate(model.classes_):
        if label not in column_of:
            raise ValueError(f"Model was fitted on unexpected label {label!r}.")
        aligned[:, column_of[label]] = proba[:, source_ix]

    return aligned


class BayesianCV:
    """Holds the train/test split and runs the Bayesian hyper-parameter search.

    Attributes are set in ``__init__``:
        data: Training split, index reset, without ``tt_*`` columns.
        test: Held-out split, index reset, with ``tt_*`` columns estimated
            from models fitted on the training split.
    """

    # Columns removed before fitting the classifier.
    _DROP_COLUMNS = [
        'source', 'end_ts', 'end_fmt_time', 'end_loc', 'raw_trip', 'start_ts',
        'start_fmt_time', 'start_loc', 'duration', 'distance', 'start_place',
        'end_place', 'cleaned_trip', 'inferred_labels', 'inferred_trip', 'expectation',
        'confidence_threshold', 'expected_trip', 'user_input', 'start:year', 'start:month',
        'start:day', 'start_local_dt_minute', 'start_local_dt_second',
        'start_local_dt_weekday', 'start_local_dt_timezone', 'end:year', 'end:month', 'end:day',
        'end_local_dt_minute', 'end_local_dt_second', 'end_local_dt_weekday',
        'end_local_dt_timezone', '_id', 'user_id', 'metadata_write_ts', 'additions',
        'mode_confirm', 'purpose_confirm', 'Mode_confirm', 'Trip_purpose',
        'original_user_id', 'program', 'opcode', 'Timestamp', 'birth_year',
        'available_modes', 'section_coordinates_argmax', 'section_mode_argmax',
        'start_lat', 'start_lng', 'end_lat', 'end_lng'
    ]

    def __init__(self, data):
        """Split ``data`` by user and prepare the optimizer.

        Args:
            data: Trip-level frame containing at least ``target``, ``user_id``
                and the columns read by ``_get_duration_estimate``.
        """
        init_splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
        X = data.drop(columns=['target'])
        groups = data.user_id.values
        y = data.target.values

        # Only the first of the five folds is used, giving a single
        # user-grouped, stratified hold-out split.
        train_ix, test_ix = next(init_splitter.split(X, y, groups))
        train = data.iloc[train_ix, :]
        test = data.iloc[test_ix, :]

        print("Train-test split done.")

        # Estimate the test durations using the train data.
        params, train = self._get_duration_estimate(train, 'train', None)
        _, test = self._get_duration_estimate(test, 'test', params)

        # We drop the training duration estimates since we will be re-computing them during CV.
        train = train.drop(columns=[c for c in train.columns if 'tt_' in c])

        # This is our final train and test data.
        self.data = train.reset_index(drop=True)
        self.test = test.reset_index(drop=True)

        self._optimizer = self._setup_optimizer()

    def _drop_columns(self, df: pd.DataFrame):
        """Return a copy of ``df`` without the columns in ``_DROP_COLUMNS``.

        Raises:
            KeyError: If any listed column is absent from ``df``.
        """
        return df.drop(columns=self._DROP_COLUMNS, inplace=False)

    def _get_duration_estimate(self, df: pd.DataFrame, dset: str, model_dict: Optional[dict]):
        """Add per-mode travel-time features predicted by per-section-mode regressions.

        For ``dset == 'train'`` one ``LinearRegression`` per distinct
        ``section_mode_argmax`` is fitted to predict ``section_duration_argmax``
        from distance, age and (when present) ``mph``. For ``dset == 'test'``
        the models in ``model_dict`` are reused. In both cases each row's
        predicted duration is multiplied by every ``av_<mode>`` column to give
        ``tt_<mode>``.

        The input frame is not modified; a copy with the new columns is returned.

        Args:
            df: Rows to featurise.
            dset: ``'train'`` to fit models, ``'test'`` to reuse them.
            model_dict: ``{section_mode: {'model': fitted regressor}}``. May be
                ``None`` for training, in which case a new dict is created.

        Returns:
            Tuple ``(model_dict, df_with_tt_columns)``.

        Raises:
            ValueError: If ``dset`` is neither ``'train'`` nor ``'test'``.
            AttributeError: If ``dset == 'test'`` and ``model_dict`` is ``None``.
            KeyError: If ``df`` contains a section mode with no fitted model.
        """
        if dset not in ('train', 'test'):
            raise ValueError(f"dset must be 'train' or 'test', got {dset!r}.")

        if dset == 'test' and model_dict is None:
            raise AttributeError("Expected model dict for testing.")

        if model_dict is None:
            model_dict = dict()

        X_features = ['section_distance_argmax', 'age']

        if 'mph' in df.columns:
            X_features += ['mph']

        # Work on a copy so the slices handed in by callers are never mutated.
        df = df.copy()
        section_modes = df.section_mode_argmax.unique()

        if dset == 'train':
            for section_mode in section_modes:
                section_data = df.loc[df.section_mode_argmax == section_mode, :]

                model = LinearRegression(fit_intercept=True)
                model.fit(
                    section_data[X_features],
                    section_data[['section_duration_argmax']].values.ravel()
                )

                model_dict.setdefault(section_mode, dict())['model'] = model

        unseen = [mode for mode in section_modes if mode not in model_dict]
        if unseen:
            raise KeyError(f"No duration model fitted for section mode(s): {unseen}")

        # Float buffer, so predictions are never written into an integer column.
        predicted = np.zeros(len(df), dtype=float)

        for section_mode in section_modes:
            mask = (df.section_mode_argmax == section_mode).to_numpy()
            predicted[mask] = model_dict[section_mode]['model'].predict(df.loc[mask, X_features])

        # Broadcast the predicted duration onto every mode via its availability column.
        for mode in MODES:
            df['tt_' + mode] = df['av_' + mode] * predicted

        return model_dict, df

    def _setup_optimizer(self):
        """Create the seeded optimizer over the rescaled hyper-parameter space."""
        # Define search space. Integer parameters are stored rescaled and
        # expanded again in ``_build_forest``.
        hparam_dict = {
            # 25-300 (x100)
            'n_estimators': (0.25, 3),
            # 5-150 (x10)
            'max_depth': (0.5, 15),
            # 2-25 (x10)
            'min_samples_split': (0.2, 2.5),
            # 1-25 (x10)
            'min_samples_leaf': (0.1, 2.5),
            # as-is.
            'ccp_alpha': (0., 0.5),
            # as-is.
            'max_features': (0.1, 0.99),
            # Thresholded at 0.5 to choose the class-weight scheme.
            'class_weight': (0, 1),
        }

        return BayesianOptimization(
            f=self._surrogate,
            pbounds=hparam_dict,
            random_state=SEED
        )

    def _surrogate(self, n_estimators, max_depth, min_samples_split, min_samples_leaf, ccp_alpha, max_features, class_weight):
        """Objective for the optimizer: negative mean validation cross-entropy.

        Runs 5-fold user-grouped, stratified CV on the training split. The
        travel-time features are re-estimated inside every fold, so the
        validation rows never influence the duration regressions.

        Returns:
            The negated mean log-loss across folds (the optimizer maximises).
        """
        # Builds a surrogate model using the sampled hparams.
        model = _build_forest({
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'ccp_alpha': ccp_alpha,
            'max_features': max_features,
            'class_weight': class_weight,
        })

        # Use the train split and further split in train-val.
        X = self.data.drop(columns=['target'])
        y = self.data.target.values.ravel()
        users = X.user_id.values

        gkfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
        fold_crossentropy = []

        for train_ix, val_ix in gkfold.split(X, y, users):
            # Re-estimate durations.
            duration_models, X_train = self._get_duration_estimate(X.iloc[train_ix, :], 'train', None)
            _, X_val = self._get_duration_estimate(X.iloc[val_ix, :], 'test', duration_models)

            X_train = self._drop_columns(X_train)
            X_val = self._drop_columns(X_val)

            model.fit(X_train, y[train_ix])

            # Measure performance on valid split.
            fold_crossentropy.append(
                log_loss(
                    y_true=y[val_ix],
                    y_pred=_aligned_proba(model, X_val),
                    labels=TARGET_LABELS
                )
            )

        # Return the average negative crossentropy (since bayesian optimization aims to maximize an objective).
        return -np.mean(fold_crossentropy)

    def optimize(self):
        """Run the search (10 initial points, 100 iterations).

        Returns:
            Tuple ``(best_loss, best_params)`` where ``best_loss`` is the
            positive mean cross-entropy of the best configuration.
        """
        self._optimizer.maximize(n_iter=100, init_points=10)
        print("Done optimizing!")
        best_params = self._optimizer.max['params']
        best_loss = -self._optimizer.max['target']
        return best_loss, best_params


def train_final_model(params, cv_obj):
    """Fit the tuned forest on the full training split, report metrics, pickle it.

    Args:
        params: Best hyper-parameters in optimizer units, as returned by
            ``BayesianCV.optimize``.
        cv_obj: The ``BayesianCV`` instance holding ``data`` and ``test``.

    Returns:
        The fitted classifier, which is also written to ``./bayes_rf.pkl``.
    """
    # Construct the model using the params.
    model = _build_forest(params)

    X_tr = cv_obj.data.drop(columns=['target'])
    y_tr = cv_obj.data.target.values.ravel()

    X_te = cv_obj.test.drop(columns=['target'])
    y_te = cv_obj.test.target.values.ravel()

    # The test split already carries tt_* columns from BayesianCV.__init__;
    # only the training split needs them recomputed.
    _, X_tr = cv_obj._get_duration_estimate(X_tr, 'train', None)

    X_tr = cv_obj._drop_columns(X_tr)
    X_te = cv_obj._drop_columns(X_te)

    model.fit(X_tr, y_tr)

    print(f"Train loss: {log_loss(y_true=y_tr, y_pred=_aligned_proba(model, X_tr), labels=TARGET_LABELS)}")
    print(f"Train performance: {f1_score(y_true=y_tr, y_pred=model.predict(X_tr), average='weighted')}")
    print(f"Test loss: {log_loss(y_true=y_te, y_pred=_aligned_proba(model, X_te), labels=TARGET_LABELS)}")
    print(f"Test performance: {f1_score(y_true=y_te, y_pred=model.predict(X_te), average='weighted')}")

    with open('./bayes_rf.pkl', 'wb') as f:
        pickle.dump(model, f)

    return model


def main():
    """Load the dataset, tune the forest and train the final model."""
    data = pd.read_csv('../data/ReplacedMode_Fix_02142024.csv')
    bayes_cv = BayesianCV(data)
    best_loss, best_params = bayes_cv.optimize()
    print(f"Best loss: {best_loss}, best params: {str(best_params)}")
    train_final_model(best_params, bayes_cv)


if __name__ == "__main__":
    main()