From ddde926aeef8fb87c68aef151bb1e647b940e9c5 Mon Sep 17 00:00:00 2001
From: Aksel Olsen <akselkolsen@gmail.com>
Date: Mon, 6 May 2024 21:48:28 -0700
Subject: [PATCH] connect transit - update to take a simpler transit area
 summary (binary; either inside or outside). - Also update run directory with
 a 'variant' field, with a clearer domain - Draft Blueprint, No Project. We
 use that field as the source of the modelrun_alias variable, which was
 previously sourced from the alias field.

---
 scripts/metrics/metrics_connected.py     | 373 ++++++++++++++++++++++-
 scripts/metrics/metrics_lu_standalone.py |   7 +-
 scripts/metrics/metrics_utils.py         | 130 +++++++-
 3 files changed, 487 insertions(+), 23 deletions(-)

diff --git a/scripts/metrics/metrics_connected.py b/scripts/metrics/metrics_connected.py
index 9a0d3501..d1d1596c 100644
--- a/scripts/metrics/metrics_connected.py
+++ b/scripts/metrics/metrics_connected.py
@@ -44,7 +44,7 @@ def groupby_summaries(df, group_vars):
 
         # this assumed the FIRST group level is the area and not the transit service area one!!
         # We are in other words getting the within-group distribution by transit service area
-        # so, for HRAs, what is the share living in major transit areas; same for the region
+        # so, for HRAs, what is the share living in major transit areas vs outside; same for the region
 
         grp_summary_shares = (
             grp_summary
@@ -63,7 +63,8 @@ def groupby_summaries(df, group_vars):
     # where stops and headways will differ for np and dbp. The parcel crosswalk has different classifications
     # for np and dbp.
 
-    this_modelrun_alias = metrics_utils.classify_runid_alias(modelrun_alias)
+    #this_modelrun_alias = metrics_utils.classify_runid_alias(modelrun_alias)
+    
 
     transit_scenario_mapping = {
         "NoProject": "np",
@@ -82,7 +83,7 @@ def groupby_summaries(df, group_vars):
 
     # get the transit scenario to focus on (e.g., 'fbp' for final bluerprint)
     # we really need to have a distinct fbp/eir, etc designation from run id in the log
-    transit_scenario = transit_scenario_mapping.get(this_modelrun_alias,'fbp')
+    transit_scenario = transit_scenario_mapping.get(modelrun_alias,'dbp')
 
     # Define columns containing values of interest - more could be added as long as it is present and numeric
     val_cols = ["totemp", "RETEMPN", "MWTEMPN","OTHEMPN","HEREMPN","FPSEMPN", "tothh", "hhq1"]
@@ -173,7 +174,7 @@ def groupby_summaries(df, group_vars):
         names=[
             "year",
             "area_grouping",          # i.e. the area concept
-            # i.e.  the classification variable (5-way; 6-way)
+            # i.e.  the classification variable (5-way; 6-way; 2-way)
             "transit_grouping",
             "area_detail",            # i.e. the area concept domain levels
             "service_level_detail",   # i.e. the headway / stop type
@@ -319,4 +320,366 @@ def format_for_tableau(plan_metrics_df: pd.DataFrame,
     ]
     val_col = ["transitproximity_majorstop_shareof"]
 
-    return metrics_tableau_schema[id_cols + id_extra_cols + val_col]
\ No newline at end of file
+    return metrics_tableau_schema[id_cols + id_extra_cols + val_col]
+
+def format_for_tableau_v2(plan_metrics_df: pd.DataFrame,
+                       ) -> pd.DataFrame:
+    """Formats the output for Tableau to fit the format of
+    https://github.com/BayAreaMetro/bayarea_urbansim/blob/main/scripts/proximity2transit.py.
+
+    No row filtering is applied here: every row of the input is carried through.
+
+    Args:
+        plan_metrics_df (pd.DataFrame): The DataFrame containing transit proximity metrics.
+
+    Returns:
+        pd.DataFrame: The formatted DataFrame.
+    """
+    logging.debug("Formatting for Tableau")
+    logging.debug(f"Number of metrics: {len(plan_metrics_df)}")
+
+    tableau_column_map = {"year": "year", "modelrun_id": "modelrun_id",
+        "modelrun_alias": "modelrun_alias",
+        "area_grouping": "area_grouping",
+        "transit_grouping": "transit_grouping",
+        "area_detail": "area_detail",
+        "service_level_detail": "service_level_detail",
+        "variable": "type_code",
+        "value": "transitproximity_majorstop_shareof",
+    }
+    # tableau friendly names: high level
+    tableau_type_code_map = {
+        "MWTEMPN": "Jobs",
+        "RETEMPN": "Jobs",
+        "OTHEMPN":'Jobs',
+        "HEREMPN":'Jobs',
+        "FPSEMPN":'Jobs',
+        "totemp": "Jobs",
+        "hhq1": "Households",
+        "tothh": "Households",
+    }
+
+    # tableau friendly names: detail
+    tableau_type_subgroup_map = {
+        "MWTEMPN": "Manufacturing and Warehousing Jobs",
+        "RETEMPN": "Retail Jobs",
+        "OTHEMPN":'Government and Information Jobs',
+        "HEREMPN":'Health, Education and Recreational Jobs',
+        "FPSEMPN":'Financial and Professional Services Jobs',
+
+        "totemp": "All Jobs",
+        "hhq1": "Households with Low Incomes",
+        "tothh": "All Households",
+    }
+
+    # no subsetting in this version: the full frame is passed through unchanged
+    plan_metrics_df_for_tableau = plan_metrics_df
+
+    logging.debug(f'After filtering of output for tableau: {plan_metrics_df_for_tableau.head()}')
+
+    # then we deal with the schema and domains
+    metrics_tableau_schema = (
+        plan_metrics_df_for_tableau[
+        list(tableau_column_map)
+        ]
+        .rename(columns=tableau_column_map)
+        )
+
+    # assign group name (type codes missing from the map become NaN)
+    metrics_tableau_schema["type_group"] = metrics_tableau_schema.type_code.map(
+        tableau_type_code_map
+    )
+
+    # assign detailed group name (type codes missing from the map become NaN)
+    metrics_tableau_schema["type_subgroup"] = metrics_tableau_schema.type_code.map(
+        tableau_type_subgroup_map
+    )
+
+    # prepare output schema
+    id_cols = ["year","modelrun_id","modelrun_alias", "type_code", "type_group", "type_subgroup"]
+    id_extra_cols = [
+        "area_grouping",
+        "transit_grouping",
+        "area_detail",
+        "service_level_detail",
+    ]
+    val_col = ["transitproximity_majorstop_shareof"]
+
+    return metrics_tableau_schema[id_cols + id_extra_cols + val_col]
+
+
+def transit_service_area_share_v2(
+                                rtp: str,
+                                modelrun_alias: str,
+                                modelrun_id: str,
+                                modelrun_data: dict,
+                                output_path: str,
+                                append_output: bool
+                                ):
+    """Tabulate shares of jobs and households inside vs. outside major transit buffers.
+
+    For every summary year, the value columns are summed for each combination of
+    area grouping and scenario-specific transit buffer column, converted to
+    within-area shares, and written in Tableau format to
+    metrics_connected1_transitproximity_v2.csv under output_path.
+
+    Args:
+        rtp: "RTP2021" (tract10-based areas) or "RTP2025" (tract20-based areas).
+        modelrun_alias: run variant (e.g. "No Project", "Draft Blueprint"); selects the
+            transit buffer columns via transit_scenario_mapping, defaulting to "dbp".
+        modelrun_id: run identifier, copied to the output unchanged.
+        modelrun_data: maps year -> dict with a "parcel" DataFrame; must contain 2050.
+        output_path: output directory; must support "/" (e.g. a pathlib.Path).
+        append_output: if True, append to the csv without writing a header row.
+
+    Raises:
+        ValueError: if rtp is neither "RTP2021" nor "RTP2025".
+
+    NOTE(review): the parcel DataFrames held in modelrun_data are modified in place
+    (columns added and recoded) - confirm no later metric needs the raw codes.
+    """
+    # fail early: area_vars further down is only defined for these two plans
+    if rtp not in ("RTP2021", "RTP2025"):
+        raise ValueError(f"Unsupported rtp: {rtp!r}")
+
+    # use later for adding county from fips code
+    bayareafips = {
+        "06001": "Alameda",
+        "06013": "Contra Costa",
+        "06041": "Marin",
+        "06055": "Napa",
+        "06075": "San Francisco",
+        "06081": "San Mateo",
+        "06085": "Santa Clara",
+        "06097": "Sonoma",
+        "06095": "Solano",
+    }
+
+    # The transit_scenario_mapping relates the run type to which transit stop buffer universe is appropriate for the tabulation.
+    # e.g. final blueprint simulation runs should be matched with similar scenario transit stops / headways.
+
+    # Left hand side is modelrunid_alias - we want to relate that to the specific transit stop map to use
+    # where stops and headways will differ for np and dbp. The parcel crosswalk has different classifications
+    # for np and dbp.
+
+    transit_scenario_mapping = {
+        "NoProject": "np",
+        "No Project": "np", # allowing for variations in input here
+        "NP":'np',
+        "Plus": "dbp",
+        "Final Blueprint": "fbp",
+        "Draft Blueprint": "dbp",
+        "DBP": "dbp",
+        "Alt1": "fbp",
+        "Alt2": "fbp",
+        "EIR Alt 1":'fbp',
+        "EIR Alt 2":'fbp',
+        "Current": "cur", # refers to existing conditions transit stops
+    }
+
+    # set Boolean to track if a base year run is already processed
+    is_baseyear_processed = False
+
+    logging.info(f"Calculating connected for {modelrun_alias} / {modelrun_id}")
+    logging.debug(f"Modelrun data years: {modelrun_data.keys()}")
+    logging.debug(f"Modelrun data 2050 datasets: {modelrun_data[2050].keys()}")
+
+    # convenience function for easy groupby percentages
+    def pct(x):
+        return x / x.sum()
+
+    # convenience function for indicator specific groupby summaries, with variable groups
+    def groupby_summaries(df, group_vars):
+
+        grp_summary = (
+            df.groupby(group_vars).agg(
+                agg_mapping
+            )
+        )
+        logging.debug(
+            f'Summarizing parcels with respect to: {"; ".join(group_vars)}')
+        logging.debug(f"{grp_summary.head()}")
+
+        # this assumes the FIRST group level is the area and not the transit service area one!!
+        # We are in other words getting the within-group distribution by transit service area
+        # so, for HRAs, what is the share living in major transit areas vs outside; same for the region
+
+        grp_summary_shares = (
+            grp_summary
+            .groupby(level=group_vars[0],
+                    group_keys=False)
+            .apply(pct)
+            .round(3)
+        )
+
+        return grp_summary_shares
+
+    # Define columns containing values of interest - more could be added as long as it is present and numeric
+    val_cols = ["totemp", "RETEMPN", "MWTEMPN","OTHEMPN","HEREMPN","FPSEMPN", "tothh", "hhq1"]
+
+    # use for groupby later to say what to do with each col - we just sum them
+    agg_mapping = {col: 'sum' for col in val_cols}
+
+    # modelrun_alias is used as passed in; the former classify_runid_alias() shorthand
+    # step is no longer applied here.
+    #TODO: this should be fixed on the inventory side - using the description field for details
+
+    SUMMARY_YEARS = sorted(modelrun_data.keys())
+
+    # for storing yearly data
+    container = {}
+
+    for year in SUMMARY_YEARS:
+
+        # set boolean for baseyear status
+        is_baseyear = int(year) in [2015, 2020, 2023]
+
+        if is_baseyear:
+            if is_baseyear_processed:
+                # a base year was already tabulated during this call - one is enough
+                logging.info('A baseyear NP run is encountered - we skip.')
+                continue
+            if modelrun_alias == 'Draft Blueprint':
+                logging.info('Skipping baseyear Draft Blueprint run.')
+                continue
+            if modelrun_alias == 'No Project':
+                is_baseyear_processed = True  # Set only for No Project within a base year
+                logging.info('Encountered a baseyear No Project run - only processing one base year.')
+            transit_scenario = "cur"  # Existing stops buffers for base year
+        else:
+            # future years use the scenario buffers (e.g. 'np', 'dbp'); unmapped aliases fall back to 'dbp'
+            transit_scenario = transit_scenario_mapping.get(modelrun_alias,'dbp')
+
+        parcel_output = modelrun_data[year]["parcel"]
+        # report shape of parcel_output df
+        len_parcels = len(parcel_output)
+
+        logging.debug('Cols of parcels {} in connected func: {}'.format(year,parcel_output.columns))
+        logging.debug(f"Parcel output has {len_parcels:,} rows")
+
+        # adding county field from tract ids (tract20 is read for both rtp values - TODO confirm it exists for RTP2021)
+        parcel_output['county'] = parcel_output.tract20.map(lambda x: f'{x:011.0f}').str.slice(0,5).map(bayareafips).fillna('MISSING')
+        logging.info(f"counts of parcel by county: {parcel_output['county'].value_counts()}")
+
+        # Identify the passed scenario-specific columns (fbp no project, current)
+        # this returns different classifications for each - like the 5-way or 6-way service level (cat5, cat6)
+        # several may be returned depending on how many are in the crosswalk
+        # we summarize run data for each classification variable
+
+        transit_svcs_cols = parcel_output.filter(
+            regex=transit_scenario
+        ).columns.tolist()
+
+        logging.debug(
+            f'Transit scenario specific for {transit_scenario=} classifier columns: {"; ".join(transit_svcs_cols)}'
+        )
+
+        # Fill missing values and convert specific columns to integers for consistency
+        for col in val_cols:
+            parcel_output[col] = parcel_output[col].fillna(0).round(0).astype(int)
+
+        # Now for the summaries - a variable (e.g. tothh) for a geographic area (e.g. region, hra, etc.)
+        # is distributed by transit service area (e.g. cat5, cat6, etc.): what share of households
+        # live in major transit areas? what share of jobs "live" in major transit areas?
+
+        # we define area_vars to hold different areas of interest to use
+        # for the groupby summaries
+        if rtp=="RTP2021":
+            # RTP2021 was tract10-based
+            area_vars = {'region': "Region", 'tract10_epc': 'CoCs',
+                         'county':'County',
+                        'tract10_hra': 'HRAs', 'area_type': 'area_type'}
+
+            parcel_output['tract10_hra'] = parcel_output['tract10_hra'].replace({0:'Not HRA',1:'HRAs'})
+            parcel_output['tract10_epc'] = parcel_output['tract10_epc'].replace({0:'Not EPC',1:'EPCs'})
+        elif rtp=="RTP2025":
+            area_vars = {'region': "Region", 'tract20_epc': 'CoCs',
+                         'county':'County',
+                        'tract20_hra': 'HRAs', 'area_type': 'area_type'}
+
+            parcel_output['tract20_hra'] = parcel_output['tract20_hra'].replace({-1:'Not HRA',0:'Not HRA',1:'HRAs'})
+            parcel_output['tract20_epc'] = parcel_output['tract20_epc'].replace({-1:'Not EPC',0:'Not EPC',1:'EPCs'})
+
+        parcel_output[transit_svcs_cols] = (parcel_output[transit_svcs_cols]
+                                            .fillna(0)
+                                            .replace({-1:'Outside Major Transit Buffer',
+                                                      0:'Outside Major Transit Buffer',
+                                                      1:'Inside Major Transit Buffer'}))
+
+        # Add constant useful for consistent handling of groupby summaries
+        # across the whole region - we can use this as an area grouping variable
+        # analogous to any other area grouping variable for consistent treatment
+        parcel_output['region'] = 'Region'
+
+        # main event - loop through combinations of area types and transit service area classifications
+        for combo in product(*[area_vars, transit_svcs_cols]):
+
+            # run summary on this combination
+            logging.debug(f"Running parcel summaries for {'-'.join(combo)}")
+            this_summary_servicelevel = groupby_summaries(
+                parcel_output, list(combo)
+            )
+
+            # new key, including year - making a new tuple
+            this_key = (year,) + combo
+
+            # store in the dict
+            container[this_key] = this_summary_servicelevel
+            logging.debug(f"Summary for {this_key} is {this_summary_servicelevel}")
+
+    # after the loop we collect the dict of dataframes into a single dataframe
+    container_df = pd.concat(
+        container,
+        names=[
+            "year",
+            "area_grouping",          # i.e. the area concept
+            # i.e.  the classification variable (2-way: inside / outside)
+            "transit_grouping",
+            "area_detail",            # i.e. the area concept domain levels
+            "service_level_detail",   # i.e. inside / outside the major transit buffer
+        ],
+    )
+
+    # turn to long format series w multi-index
+    container_df.columns = container_df.columns.set_names('variable')
+    container_df = container_df.stack()
+
+    logging.debug(f"head for (wide) {container_df.head()}")
+
+    container_df = (
+        container_df.round(3)
+        .reset_index(name="value")
+        #.rename(columns={f"level_{container_df.index.nlevels-1}": "variable"})
+    )
+    logging.debug(f"head for (long) {container_df.head()}")
+
+
+    container_df["metric"] = "C1"
+    container_df["modelrun_alias"] = modelrun_alias
+    container_df["modelrun_id"] = modelrun_id
+
+
+    # Finally, clean up area values a bit
+
+    container_df['area_grouping'] = container_df.area_grouping.map(area_vars)
+
+    # basic high level summary - regional totals, major stops only
+    #qry_string_cat5_basic = 'area_grouping=="Region" & transit_grouping.str.contains("cat5$") & service_level_detail=="majorstop"'
+
+    # detailed version, without limiting to major stops but all headway categories
+    #qry_string_cat5_detail = 'transit_grouping.str.contains("cat5$") '
+
+    # look through the two query strings and filter the larger frame as appropriate - a basic and a detailed one
+    #for det, qrystr in {'basic':qry_string_cat5_basic,'detail':qry_string_cat5_detail}.items():
+
+
+    updated_metrics_tableau_schema = format_for_tableau_v2(
+        container_df
+    )
+
+    filename = "metrics_connected1_transitproximity_v2.csv"
+    filepath = output_path / filename
+
+    updated_metrics_tableau_schema.to_csv(filepath, mode='a' if append_output else 'w', header=not append_output, index=False)
+    logging.info("{} {:,} lines to {}".format("Appended" if append_output else "Wrote", len(updated_metrics_tableau_schema), filepath))
+
diff --git a/scripts/metrics/metrics_lu_standalone.py b/scripts/metrics/metrics_lu_standalone.py
index 61e86b55..43d54936 100644
--- a/scripts/metrics/metrics_lu_standalone.py
+++ b/scripts/metrics/metrics_lu_standalone.py
@@ -142,7 +142,10 @@ def main():
 
         # directory is relative to MODEL_RUNS_DIR
         run_directory_path = MODEL_RUNS_DIR / row['directory']
-        modelrun_alias = row['alias']
+        
+        # switch to 'variant' column which is less verbose with respect to "No Project [some detail]"
+        #modelrun_alias = row['alias']
+        modelrun_alias = row['variant']
         modelrun_id = row['directory']
 
         logging.info(f"Processing run modelrun_alias:[{modelrun_alias}] modelrun_id:[{modelrun_id}] run_directory_path:{run_directory_path}")
@@ -223,7 +226,7 @@ def main():
                 args.rtp, modelrun_alias, modelrun_id, BOX_DIR, OUTPUT_PATH, append_output)
             
         if (args.only == None) or (args.only == 'connected'):
-            metrics_connected.transit_service_area_share(
+            metrics_connected.transit_service_area_share_v2(
                 args.rtp, modelrun_alias, modelrun_id, modelrun_data, OUTPUT_PATH, append_output)
 
         if (args.only == None) or (args.only == 'healthy'):
diff --git a/scripts/metrics/metrics_utils.py b/scripts/metrics/metrics_utils.py
index d38918ff..7824b78a 100644
--- a/scripts/metrics/metrics_utils.py
+++ b/scripts/metrics/metrics_utils.py
@@ -3,6 +3,7 @@
 import logging
 from datetime import datetime
 import pathlib
+import getpass
 import os
 
 # make global so we only read once
@@ -12,6 +13,7 @@
 
 rtp2025_transit_service_df      = pd.DataFrame() # parcel -> transit service
 rtp2025_taz_crosswalk_df        = pd.DataFrame() # taz1 -> epc
+rtp2025_parcel_taz_crosswalk_df = pd.DataFrame() # parcel -> taz1
 
 rtp2025_np_parcel_inundation_df    = pd.DataFrame() # parcel -> parcel sea level rise inundation
 rtp2025_dbp_parcel_inundation_df    = pd.DataFrame() # parcel -> parcel sea level rise inundation
@@ -55,7 +57,12 @@
 # set the path for M: drive
 # from OSX, M:/ may be mounted to /Volumes/Data/Models
 M_DRIVE = pathlib.Path("/Volumes/Data/Models") if os.name != "nt" else pathlib.Path("M:/")
-
+USERNAME = getpass.getuser()  # login name, used to pick a per-user Box location
+HOME_DIR = pathlib.Path.home()
+if USERNAME.lower() in ['lzorn']:  # users whose Box folder is not under their home directory
+    BOX_DIR = pathlib.Path("E:/Box")
+else:
+    BOX_DIR = HOME_DIR / 'Box'  # default: Box folder directly under the home directory
 
 # --------------------------------------
 # Data Loading Based on Model Run Plan
@@ -96,6 +103,8 @@ def load_data_for_runs(
     global rtp2025_urban_area_crosswalk_df
     global rtp2025_transit_service_df
     global rtp2025_taz_crosswalk_df
+
+    global rtp2025_parcel_taz_crosswalk_df
     global rtp2025_np_parcel_inundation_df
     global rtp2025_dbp_parcel_inundation_df
 
@@ -130,11 +139,27 @@ def load_data_for_runs(
             logging.info("  Read {:,} rows from crosswalk {}".format(len(rtp2025_urban_area_crosswalk_df), URBAN_AREA_CROSSWALK_FILE))
             logging.debug("  rtp2025_urban_area_crosswalk_df.head():\n{}".format(rtp2025_urban_area_crosswalk_df.head()))
 
-        # transit service areas
+        # transit service areas (used through April 2024 - with n-category transit service areas including headway differentiation)
+        # We used this with transit_service_area_share().
+
+        # if len(rtp2025_transit_service_df) == 0:
+        #     import geopandas as gpd
+        #     PARCEL_TRANSITSERVICE_FILE = M_DRIVE / "Data" / "GIS layers" / "JobsHousingTransitProximity" / "update_2024" / "outputs" / "p10_topofix_classified.parquet"
+        #     rtp2025_transit_service_df = pd.read_parquet(PARCEL_TRANSITSERVICE_FILE)
+        #     transit_cols_keep = ['PARCEL_ID','area_type','Service_Level_np_cat5', 'Service_Level_fbp_cat5', 'Service_Level_current_cat5']
+        #     rtp2025_transit_service_df = rtp2025_transit_service_df[transit_cols_keep]
+        #     logging.info("  Read {:,} rows from crosswalk {}".format(len(rtp2025_transit_service_df), PARCEL_TRANSITSERVICE_FILE))
+        #     logging.debug("  rtp2025_transit_service_df.head():\n{}".format(rtp2025_transit_service_df.head()))
+
+        # simpler version with binary 1/0 classification instead of headway differentiation. We use with transit_service_area_share_v2().
+        
         if len(rtp2025_transit_service_df) == 0:
-            PARCEL_TRANSITSERVICE_FILE = M_DRIVE / "Data" / "GIS layers" / "JobsHousingTransitProximity" / "update_2024" / "outputs" / "p10_topofix_classified.parquet"
-            rtp2025_transit_service_df = pd.read_parquet(PARCEL_TRANSITSERVICE_FILE)
-            transit_cols_keep = ['PARCEL_ID','area_type','Service_Level_np_cat5', 'Service_Level_fbp_cat5', 'Service_Level_current_cat5']
+            import geopandas as gpd
+            PARCEL_TRANSITSERVICE_FILE = BOX_DIR / 'Plan Bay Area 2050+' / 'Blueprint' / \
+                'Draft Blueprint Modeling and Metrics' / \
+                'transportation' / "p10_x_transit_area_identity.csv"
+            rtp2025_transit_service_df = pd.read_csv(PARCEL_TRANSITSERVICE_FILE)
+            transit_cols_keep = ['parcel_id','cur','np', 'dbp']
             rtp2025_transit_service_df = rtp2025_transit_service_df[transit_cols_keep]
             logging.info("  Read {:,} rows from crosswalk {}".format(len(rtp2025_transit_service_df), PARCEL_TRANSITSERVICE_FILE))
             logging.debug("  rtp2025_transit_service_df.head():\n{}".format(rtp2025_transit_service_df.head()))
@@ -241,6 +266,7 @@ def load_data_for_runs(
             logging.debug("final rtp2025_tract_crosswalk_df.dtypes():\n{}".format(rtp2025_tract_crosswalk_df.dtypes))
             # columns are: parcel_id, tract10, tract20, tract20_epc, tract20_growth_geo, tract20_tra, tract20_hra, tract10_DispRisk
 
+
         if len(rtp2025_taz_crosswalk_df) == 0:
 
             # taz-based lookups
@@ -249,6 +275,43 @@ def load_data_for_runs(
             logging.info("  Read {:,} rows from crosswalk {}".format(len(rtp2025_taz_crosswalk_df), TAZ_EPC_CROSSWALK_FILE))
             logging.debug("  rtp2025_taz_crosswalk_df.head():\n{}".format(rtp2025_taz_crosswalk_df.head()))
 
+        if len(rtp2025_parcel_taz_crosswalk_df)==0:
+
+            # parcels to taz crosswalk - we need this for the area_type (suburban/urban/rural) taz-based classification
+
+            PARCEL_TAZ_CROSSWALK_FILE = M_DRIVE /  "urban_modeling" / "baus" / "BAUS Inputs" / "basis_inputs" / "crosswalks" / "2020_08_17_parcel_to_taz1454sub.csv"
+            rtp2025_parcel_taz_crosswalk_df = pd.read_csv(PARCEL_TAZ_CROSSWALK_FILE, usecols=['PARCEL_ID', 'ZONE_ID'])
+            rtp2025_parcel_taz_crosswalk_df.columns = rtp2025_parcel_taz_crosswalk_df.columns.str.lower()
+            logging.info("  Read {:,} rows from crosswalk {}".format(len(rtp2025_parcel_taz_crosswalk_df), PARCEL_TAZ_CROSSWALK_FILE))
+            logging.debug("  rtp2025_parcel_taz_crosswalk_df.head():\n{}".format(rtp2025_parcel_taz_crosswalk_df.head()))
+            
+            # taz-based lookups to area_type (urban/suburban/rural)
+            TAZ_AREATYPE_CROSSWALK_FILE = METRICS_DIR / "metrics_input_files" / "taz_urban_suburban.csv"
+            rtp2025_taz_areatype_crosswalk_df = pd.read_csv(TAZ_AREATYPE_CROSSWALK_FILE, usecols=['TAZ1454','area_type'])
+            logging.info("  Read {:,} rows from taz areatype crosswalk {}".format(len(rtp2025_taz_areatype_crosswalk_df), TAZ_AREATYPE_CROSSWALK_FILE))
+            logging.debug("  rtp2025_taz_areatype_crosswalk_df.head():\n{}".format(rtp2025_taz_areatype_crosswalk_df.head()))
+
+           
+            rtp2025_parcel_taz_crosswalk_df = pd.merge(
+                left     = rtp2025_parcel_taz_crosswalk_df,
+                right    = rtp2025_taz_areatype_crosswalk_df,
+                left_on  = 'zone_id',
+                right_on = 'TAZ1454',
+                how      = 'left',
+                validate = 'many_to_one',
+                indicator=True
+            )
+            logging.debug("rtp2025_parcel_taz_crosswalk_df._merge.value_counts():\n{}".format(
+                          rtp2025_parcel_taz_crosswalk_df._merge.value_counts()))
+            rtp2025_parcel_taz_crosswalk_df.drop(columns=['_merge'], inplace=True)
+
+            # fillna with zero
+            rtp2025_parcel_taz_crosswalk_df.fillna(0, inplace=True)
+
+            logging.debug("rtp2025_parcel_taz_crosswalk_df.head():\n{}".format(rtp2025_parcel_taz_crosswalk_df))
+            logging.debug("rtp2025_parcel_taz_crosswalk_df.dtypes():\n{}".format(rtp2025_parcel_taz_crosswalk_df.dtypes))
+            
+            
         if len(rtp2025_np_parcel_inundation_df) == 0:
             PARCEL_INUNDATION_FILE = METRICS_DIR / "metrics_input_files" / "slr_parcel_inundation_PBA50Plus_NP.csv"
             rtp2025_np_parcel_inundation_df = pd.read_csv(PARCEL_INUNDATION_FILE)
@@ -355,11 +418,27 @@ def load_data_for_runs(
             logging.debug("final rtp2021_tract_crosswalk_df.head():\n{}".format(rtp2021_tract_crosswalk_df))
             # columns are: parcel_id, tract10, tract10_epc, tract10_DispRisk, tract10_hra, tract10_growth_geo, tract10_tra
 
-        # transit service areas # works for both RTP2021 and RTP2025
+        # transit service areas (used through April 2024 - with n-category transit service areas including headway differentiation)
+        # We used this with transit_service_area_share().
+
+        # if len(rtp2025_transit_service_df) == 0:
+        #     import geopandas as gpd
+        #     PARCEL_TRANSITSERVICE_FILE = M_DRIVE / "Data" / "GIS layers" / "JobsHousingTransitProximity" / "update_2024" / "outputs" / "p10_topofix_classified.parquet"
+        #     rtp2025_transit_service_df = pd.read_parquet(PARCEL_TRANSITSERVICE_FILE)
+        #     transit_cols_keep = ['PARCEL_ID','area_type','Service_Level_np_cat5', 'Service_Level_fbp_cat5', 'Service_Level_current_cat5']
+        #     rtp2025_transit_service_df = rtp2025_transit_service_df[transit_cols_keep]
+        #     logging.info("  Read {:,} rows from crosswalk {}".format(len(rtp2025_transit_service_df), PARCEL_TRANSITSERVICE_FILE))
+        #     logging.debug("  rtp2025_transit_service_df.head():\n{}".format(rtp2025_transit_service_df.head()))
+
+        # simpler version with binary 1/0 classification instead of headway differentiation. We use with transit_service_area_share_v2().
+        
         if len(rtp2025_transit_service_df) == 0:
-            PARCEL_TRANSITSERVICE_FILE = M_DRIVE / "Data" / "GIS layers" / "JobsHousingTransitProximity" / "update_2024" / "outputs" / "p10_topofix_classified.parquet"
-            rtp2025_transit_service_df = pd.read_parquet(PARCEL_TRANSITSERVICE_FILE)
-            transit_cols_keep = ['PARCEL_ID','area_type','Service_Level_np_cat5', 'Service_Level_fbp_cat5', 'Service_Level_current_cat5']
+            import geopandas as gpd
+            PARCEL_TRANSITSERVICE_FILE = BOX_DIR / 'Plan Bay Area 2050+' / 'Blueprint' / \
+                'Draft Blueprint Modeling and Metrics' / \
+                'transportation' / "p10_x_transit_area_identity.csv"
+            rtp2025_transit_service_df = pd.read_csv(PARCEL_TRANSITSERVICE_FILE)
+            transit_cols_keep = ['parcel_id','cur','np', 'dbp']
             rtp2025_transit_service_df = rtp2025_transit_service_df[transit_cols_keep]
             logging.info("  Read {:,} rows from crosswalk {}".format(len(rtp2025_transit_service_df), PARCEL_TRANSITSERVICE_FILE))
             logging.debug("  rtp2025_transit_service_df.head():\n{}".format(rtp2025_transit_service_df.head()))
@@ -476,13 +555,27 @@ def load_data_for_runs(
                 left     = parcel_df,
                 right    = rtp2025_transit_service_df,
                 how      = "left",
-                left_on  = "parcel_id",
-                right_on ="PARCEL_ID",
+                on       = "parcel_id",
+                #right_on = "PARCEL_ID", # not needed with the current crosswalk
                 validate = "one_to_one"
             )
 
             logging.debug("parcel_df.dtypes:\n{}".format(parcel_df.dtypes))
-            logging.debug("Head after merge with rtp2025_tract_crosswalk_df:\n{}".format(parcel_df.head()))
+            logging.debug("Head after merge with rtp2025_transit_service_df:\n{}".format(parcel_df.head()))
+
+            # add area_type (urban/suburban/rural) lookups
+            parcel_df = pd.merge(
+                left     = parcel_df,
+                right    = rtp2025_parcel_taz_crosswalk_df,
+                how      = "left",
+                on       = "parcel_id",
+                #right_on = "PARCEL_ID", # not needed with the current crosswalk
+                validate = "one_to_one"
+            )
+
+            logging.debug("parcel_df.dtypes:\n{}".format(parcel_df.dtypes))
+            logging.debug("Head after merge with rtp2025_parcel_taz_crosswalk_df:\n{}".format(parcel_df.head()))
+
 
             # add parcel lookup for 2020 urban area footprint
             parcel_df = pd.merge(
@@ -550,8 +643,8 @@ def load_data_for_runs(
                 left     = parcel_df,
                 right    = rtp2025_transit_service_df,
                 how      = "left",
-                left_on  = "parcel_id",
-                right_on ="PARCEL_ID",
+                on       = "parcel_id",
+                #right_on ="PARCEL_ID",
                 validate = "one_to_one"
             )
 
@@ -598,11 +691,16 @@ def load_data_for_runs(
                                 'MWTEMPN', 'RETEMPN', 'FPSEMPN', 'HEREMPN', 'OTHEMPN',
                                 # tract-level columns
                                 'tract10_epc', 'tract10_DispRisk', 'tract10_hra', 'tract10_growth_geo', 'tract10_tra',
+                                
                                 # transit-related columns
-                                'area_type','Service_Level_np_cat5', 'Service_Level_fbp_cat5', 'Service_Level_current_cat5',
+                                #'area_type','Service_Level_np_cat5', 'Service_Level_fbp_cat5', 'Service_Level_current_cat5',
+                                
+                                # use after may 3 2024
+                                'np','cur','dbp',
+                                
                                 # sea level rise column
                                 "inundation"]
-            
+
             parcel_df = parcel_df[columns_to_keep]
             logging.debug("parcel_df:\n{}".format(parcel_df.head(30)))