From ddde926aeef8fb87c68aef151bb1e647b940e9c5 Mon Sep 17 00:00:00 2001 From: Aksel Olsen Date: Mon, 6 May 2024 21:48:28 -0700 Subject: [PATCH] connect transit - update to take a simpler transit area summary (binary: either inside or outside). - Also update run directory with a 'variant' field, which has a clearer domain - Draft Blueprint, No Project. We now use that field as the source of the modelrun_alias variable, which was previously sourced from the alias field. --- scripts/metrics/metrics_connected.py | 373 ++++++++++++++++++++++- scripts/metrics/metrics_lu_standalone.py | 7 +- scripts/metrics/metrics_utils.py | 130 +++++++- 3 files changed, 487 insertions(+), 23 deletions(-) diff --git a/scripts/metrics/metrics_connected.py b/scripts/metrics/metrics_connected.py index 9a0d3501..d1d1596c 100644 --- a/scripts/metrics/metrics_connected.py +++ b/scripts/metrics/metrics_connected.py @@ -44,7 +44,7 @@ def groupby_summaries(df, group_vars): # this assumed the FIRST group level is the area and not the transit service area one!! # We are in other words getting the within-group distribution by transit service area - # so, for HRAs, what is the share living in major transit areas; same for the region + # so, for HRAs, what is the share living in major transit areas vs outside; same for the region grp_summary_shares = ( grp_summary @@ -63,7 +63,8 @@ def groupby_summaries(df, group_vars): # where stops and headways will differ for np and dbp. The parcel crosswalk has different classifications # for np and dbp. 
- this_modelrun_alias = metrics_utils.classify_runid_alias(modelrun_alias) + #this_modelrun_alias = metrics_utils.classify_runid_alias(modelrun_alias) + transit_scenario_mapping = { "NoProject": "np", @@ -82,7 +83,7 @@ def groupby_summaries(df, group_vars): # get the transit scenario to focus on (e.g., 'fbp' for final bluerprint) # we really need to have a distinct fbp/eir, etc designation from run id in the log - transit_scenario = transit_scenario_mapping.get(this_modelrun_alias,'fbp') + transit_scenario = transit_scenario_mapping.get(modelrun_alias,'dbp') # Define columns containing values of interest - more could be added as long as it is present and numeric val_cols = ["totemp", "RETEMPN", "MWTEMPN","OTHEMPN","HEREMPN","FPSEMPN", "tothh", "hhq1"] @@ -173,7 +174,7 @@ def groupby_summaries(df, group_vars): names=[ "year", "area_grouping", # i.e. the area concept - # i.e. the classification variable (5-way; 6-way) + # i.e. the classification variable (5-way; 6-way; 2-way) "transit_grouping", "area_detail", # i.e. the area concept domain levels "service_level_detail", # i.e. the headway / stop type @@ -319,4 +320,366 @@ def format_for_tableau(plan_metrics_df: pd.DataFrame, ] val_col = ["transitproximity_majorstop_shareof"] - return metrics_tableau_schema[id_cols + id_extra_cols + val_col] \ No newline at end of file + return metrics_tableau_schema[id_cols + id_extra_cols + val_col] + +def format_for_tableau_v2(plan_metrics_df: pd.DataFrame #qrystring: str, + ) -> pd.DataFrame: + """Formats the output for Tableau to fit the format of + https://github.com/BayAreaMetro/bayarea_urbansim/blob/main/scripts/proximity2transit.py. + + Args: + plan_metrics_df (pd.DataFrame): The DataFrame containing transit proximity metrics. + qrystring (str): A query string to filter the full DataFrame to smaller subsets. + + Returns: + pd.DataFrame: The formatted DataFrame. 
+ """ + logging.debug(f"Formatting for Tableau") + #logging.debug(f"Query string: {qrystring}") + logging.debug(f"Number of metrics: {len(plan_metrics_df)}") + + tableau_column_map = { 'year':'year','modelrun_id':'modelrun_id', + "modelrun_alias": "modelrun_alias", + "area_grouping": "area_grouping", + "transit_grouping": "transit_grouping", + "area_detail": "area_detail", + "service_level_detail": "service_level_detail", + "variable": "type_code", + "value": "transitproximity_majorstop_shareof", + } + # tableau friendly names: high level + tableau_type_code_map = { + "MWTEMPN": "Jobs", + "RETEMPN": "Jobs", + "OTHEMPN":'Jobs', + "HEREMPN":'Jobs', + "FPSEMPN":'Jobs', + "totemp": "Jobs", + "hhq1": "Households", + "tothh": "Households", + } + + # tableau friendly names: detail + tableau_type_subgroup_map = { + "MWTEMPN": "Manufacturing and Warehousing Jobs", + "RETEMPN": "Retail Jobs", + "OTHEMPN":'Government and Information Jobs', + "HEREMPN":'Health, Education and Recreational Jobs', + "FPSEMPN":'Financial and Professional Services Jobs', + + "totemp": "All Jobs", + "hhq1": "Households with Low Incomes", + "tothh": "All Households", + } + + # first, we subset away most of the categories + plan_metrics_df_for_tableau = plan_metrics_df#.query(qrystring) + + logging.debug(f'After filtering of output for tableau: {plan_metrics_df_for_tableau.head()}') + + # then we deal with the schema and domains + metrics_tableau_schema = ( + plan_metrics_df_for_tableau[ + tableau_column_map.keys() + ] + .rename(columns=tableau_column_map) + ) + + # assign group name + metrics_tableau_schema["type_group"] = metrics_tableau_schema.type_code.map( + tableau_type_code_map + ) + + # assign detailed group name + metrics_tableau_schema["type_subgroup"] = metrics_tableau_schema.type_code.map( + tableau_type_subgroup_map + ) + + # prepare output schema + id_cols = ["year","modelrun_id","modelrun_alias", "type_code", "type_group", "type_subgroup"] + id_extra_cols = [ + "area_grouping", + 
"transit_grouping", + "area_detail", + "service_level_detail", + ] + val_col = ["transitproximity_majorstop_shareof"] + + return metrics_tableau_schema[id_cols + id_extra_cols + val_col] + + +def transit_service_area_share_v2( + rtp: str, + modelrun_alias: str, + modelrun_id: str, + modelrun_data: dict, + output_path: str, + append_output: bool + ): + + + # use later for adding county from fips code + bayareafips = { + "06001": "Alameda", + "06013": "Contra Costa", + "06041": "Marin", + "06055": "Napa", + "06075": "San Francisco", + "06081": "San Mateo", + "06085": "Santa Clara", + "06097": "Sonoma", + "06095": "Solano", + } + + # The transit_scenario_mapping relates the run type to which transit stop buffer universe is appropriate for the tabulation. + # e.g. final blueprint simulation runs should be matched with similar scenario transit stops / headways. + + # Left hand side is modelrunid_alias - we want to relate that to the specific transit stop map to use + # where stops and headways will differ for np and dbp. The parcel crosswalk has different classifications + # for np and dbp. 
+ + transit_scenario_mapping = { + "NoProject": "np", + "No Project": "np", # allowing for variations in input here + "NP":'np', + "Plus": "dbp", + "Final Blueprint": "fbp", + "Draft Blueprint": "dbp", + "DBP": "dbp", + "Alt1": "fbp", + "Alt2": "fbp", + "EIR Alt 1":'fbp', + "EIR Alt 2":'fbp', + "Current": "cur", # refers to existing conditions transit stops + } + + # set Boolean to track if a base year run is already processed + is_baseyear_processed = False + + logging.info(f"Calculating connected for {modelrun_alias} / {modelrun_id}") + logging.debug(f"Modelrun data years: {modelrun_data.keys()}") + logging.debug(f"Modelrun data 2050 datasets: {modelrun_data[2050].keys()}") + + # convenience function for easy groupby percentages + def pct(x): + return x / x.sum() + + # convenience function for indicator specific groupby summaries, with variable groups + def groupby_summaries(df, group_vars): + + grp_summary = ( + df.groupby(group_vars).agg( + agg_mapping + ) + ) + logging.debug( + f'Summarizing parcels with respect to: {"; ".join(group_vars)}') + logging.debug(f"{grp_summary.head()}") + + # this assumed the FIRST group level is the area and not the transit service area one!! 
+ # We are in other words getting the within-group distribution by transit service area + # so, for HRAs, what is the share living in major transit areas; same for the region + + grp_summary_shares = ( + grp_summary + .groupby(level=group_vars[0], + group_keys=False) + .apply(pct) + .round(3) + ) + + return grp_summary_shares + + + # Define columns containing values of interest - more could be added as long as it is present and numeric + val_cols = ["totemp", "RETEMPN", "MWTEMPN","OTHEMPN","HEREMPN","FPSEMPN", "tothh", "hhq1"] + + # use for groupby later to say what do do with each col - we just sum them + agg_mapping = {col: 'sum' for col in val_cols} + + # get a shorthand alias from the often more verbose one + #TODO: this should be fixed on the inventory side - using the description field for details + #modelrun_alias = metrics_utils.classify_runid_alias(modelrun_alias) + + + SUMMARY_YEARS = sorted(modelrun_data.keys()) + + # for storing yearly data + container = {} + + + for year in SUMMARY_YEARS: + + # set boolean for baseyear status + is_baseyear = int(year) in [2015, 2020, 2023] + + if is_baseyear and is_baseyear_processed: + # if both true, we can skip this iteration + logging.info('A baseyear NP run is encountered - we skip.') + continue + + if is_baseyear: + transit_scenario = "cur" # Existing stops buffers for base year + + if modelrun_alias == 'Draft Blueprint': + # Skip Draft Blueprint for base year + logging.info('Skipping baseyear Draft Blueprint run.') + continue + elif modelrun_alias == 'No Project': + is_baseyear_processed = True # Set only for No Project within a base year + logging.info('Encountered a baseyear No Project run - only processing one base year.') + + # continue processing + # get the transit scenario geographies to focus on (e.g., 'fbp' for final bluerprint), 'cur' for current / baseyear conditions + transit_scenario = transit_scenario_mapping.get(modelrun_alias,'dbp') + + # set transit_scenario to cur (existing stops buffers) - 
this will override any value to 'cur' + if is_baseyear: + # overrides to "cur" if is_baseyear - meaning np only uses np buffers in the future year + transit_scenario = "cur" + if modelrun_alias=='No Project': + logging.info('A baseyear NP run is encountered - setting is_baseyear_processed to True for subsequent skipping. We only need one baseyear run.') + is_baseyear_processed = True + elif modelrun_alias=='Draft Blueprint': + logging.info('A baseyear DBP run is encountered - we skip.') + continue + + + parcel_output = modelrun_data[year]["parcel"] + # report shape of parcel_output df + len_parcels = len(parcel_output) + + logging.debug('Cols of parcels {} in connected func: {}'.format(year,parcel_output.columns)) + logging.debug(f"Parcel output has {len_parcels:,} rows") + + # adding county field from tract ids + parcel_output['county'] = parcel_output.tract20.map(lambda x: f'{x:011.0f}').str.slice(0,5).map(bayareafips).fillna('MISSING') + logging.info(f'counts of parcel by county: {parcel_output["county"].value_counts()}') + + # Identify the passed scenario-specific columns (fbp no project, current) + # this returns different classifications for each - like the 5-way or 6-way service level (cat5, cat6) + # several may be returned depending on how many are in the crosswalk + # we summarize run data for each classification variable + + transit_svcs_cols = parcel_output.filter( + regex=transit_scenario + ).columns.tolist() + + logging.debug( + f'Transit scenario specific for {transit_scenario=} classifier columns: {"; ".join(transit_svcs_cols)}' + ) + + # Fill missing values and convert specific columns to integers for consistency + for col in val_cols: + parcel_output[col] = parcel_output[col].fillna(0).round(0).astype(int) + + # convenience function for easy groupby percentages + + # Now for the summaries - a variable (e.g. tothh) for a geographic area (e.g. region, hra, etc.) + # is distributed by transit service area (e.g. 
cat5, cat6, etc.): what share of households + # live in major transit areas? what share of jobs "live" in major transit areas? + + + # we define area_vars to hold different areas of interest to use + # for the groupby summaries + if rtp=="RTP2021": + # RTP2021 was tract10-based + area_vars = {'region': "Region", 'tract10_epc': 'CoCs', + 'county':'County', + 'tract10_hra': 'HRAs', 'area_type': 'area_type'} + + parcel_output['tract10_hra'] = parcel_output['tract10_hra'].replace({0:'Not HRA',1:'HRAs'}) + parcel_output['tract10_epc'] = parcel_output['tract10_epc'].replace({0:'Not EPC',1:'EPCs'}) + if rtp=="RTP2025": + area_vars = {'region': "Region", 'tract20_epc': 'CoCs', + 'county':'County', + 'tract20_hra': 'HRAs', 'area_type': 'area_type'} + + parcel_output['tract20_hra'] = parcel_output['tract20_hra'].replace({-1:'Not HRA',0:'Not HRA',1:'HRAs'}) + parcel_output['tract20_epc'] = parcel_output['tract20_epc'].replace({-1:'Not EPC',0:'Not EPC',1:'EPCs'}) + + parcel_output[transit_svcs_cols] = (parcel_output[transit_svcs_cols] + .fillna(0) + .replace({-1:'Outside Major Transit Buffer', + 0:'Outside Major Transit Buffer', + 1:'Inside Major Transit Buffer'})) + + # Add constant useful for constistent handling of groupby summaries + # across the whole region - we can use this as an area grouping variable + # analogous to any other area grouping variable for consistent treatment + parcel_output['region'] = 'Region' + + # main event - loop through combinations of area types and transit service area classifications + for combo in product(*[area_vars, transit_svcs_cols]): + + # run summary on this combination + logging.debug(f"Running parcel summaries for {'-'.join(combo)}") + this_summary_servicelevel = groupby_summaries( + parcel_output, list(combo) + ) + + # new key, including year - making a new tuple + this_key = (year,) + combo + + # store in the dict + container[this_key] = this_summary_servicelevel + logging.debug(f"Summary for {this_key} is 
{this_summary_servicelevel}") + + + + # after the loop we collect the dict of dataframes into a single dataframe + container_df = pd.concat( + container, + names=[ + "year", + "area_grouping", # i.e. the area concept + # i.e. the classification variable (5-way; 6-way) + "transit_grouping", + "area_detail", # i.e. the area concept domain levels + "service_level_detail", # i.e. the headway / stop type + ], + ) + + # turn to long format series w multi-index + container_df.columns = container_df.columns.set_names('variable') + container_df = container_df.stack() + + logging.debug(f"head for (wide) {container_df.head()}") + + container_df = ( + container_df.round(3) + .reset_index(name="value") + #.rename(columns={f"level_{container_df.index.nlevels-1}": "variable"}) + ) + logging.debug(f"head for (long) {container_df.head()}") + + + container_df["metric"] = "C1" + container_df["modelrun_alias"] = modelrun_alias + container_df["modelrun_id"] = modelrun_id + + + # Finally, clean up area values a bit + + container_df['area_grouping'] = container_df.area_grouping.map(area_vars) + + # basic high level summary - regional totals, major stops only + #qry_string_cat5_basic = 'area_grouping=="Region" & transit_grouping.str.contains("cat5$") & service_level_detail=="majorstop"' + + # detailed version, without limiting to major stops but all headway categories + #qry_string_cat5_detail = 'transit_grouping.str.contains("cat5$") ' + + # look through the two query strings and filter the larger frame as appropriate - a basic and a detailed one + #for det, qrystr in {'basic':qry_string_cat5_basic,'detail':qry_string_cat5_detail}.items(): + + + updated_metrics_tableau_schema = format_for_tableau_v2( + container_df + ) + + filename = f"metrics_connected1_transitproximity_v2.csv" + filepath = output_path / filename + + updated_metrics_tableau_schema.to_csv(filepath, mode='a' if append_output else 'w', header=False if append_output else True, index=False) + logging.info("{} {:,} lines to 
{}".format("Appended" if append_output else "Wrote", len(updated_metrics_tableau_schema), filepath)) + diff --git a/scripts/metrics/metrics_lu_standalone.py b/scripts/metrics/metrics_lu_standalone.py index 61e86b55..43d54936 100644 --- a/scripts/metrics/metrics_lu_standalone.py +++ b/scripts/metrics/metrics_lu_standalone.py @@ -142,7 +142,10 @@ def main(): # directory is relative to MODEL_RUNS_DIR run_directory_path = MODEL_RUNS_DIR / row['directory'] - modelrun_alias = row['alias'] + + # switch to 'variant' column which is less verbose with respect to "No Project [some detail]" + #modelrun_alias = row['alias'] + modelrun_alias = row['variant'] modelrun_id = row['directory'] logging.info(f"Processing run modelrun_alias:[{modelrun_alias}] modelrun_id:[{modelrun_id}] run_directory_path:{run_directory_path}") @@ -223,7 +226,7 @@ def main(): args.rtp, modelrun_alias, modelrun_id, BOX_DIR, OUTPUT_PATH, append_output) if (args.only == None) or (args.only == 'connected'): - metrics_connected.transit_service_area_share( + metrics_connected.transit_service_area_share_v2( args.rtp, modelrun_alias, modelrun_id, modelrun_data, OUTPUT_PATH, append_output) if (args.only == None) or (args.only == 'healthy'): diff --git a/scripts/metrics/metrics_utils.py b/scripts/metrics/metrics_utils.py index d38918ff..7824b78a 100644 --- a/scripts/metrics/metrics_utils.py +++ b/scripts/metrics/metrics_utils.py @@ -3,6 +3,7 @@ import logging from datetime import datetime import pathlib +import getpass import os # make global so we only read once @@ -12,6 +13,7 @@ rtp2025_transit_service_df = pd.DataFrame() # parcel -> transit service rtp2025_taz_crosswalk_df = pd.DataFrame() # taz1 -> epc +rtp2025_parcel_taz_crosswalk_df = pd.DataFrame() # parcel -> taz1 rtp2025_np_parcel_inundation_df = pd.DataFrame() # parcel -> parcel sea level rise inundation rtp2025_dbp_parcel_inundation_df = pd.DataFrame() # parcel -> parcel sea level rise inundation @@ -55,7 +57,12 @@ # set the path for M: drive # from 
OSX, M:/ may be mounted to /Volumes/Data/Models M_DRIVE = pathlib.Path("/Volumes/Data/Models") if os.name != "nt" else pathlib.Path("M:/") - +USERNAME = getpass.getuser() +HOME_DIR = pathlib.Path.home() +if USERNAME.lower() in ['lzorn']: + BOX_DIR = pathlib.Path("E:/Box") +else: + BOX_DIR = HOME_DIR / 'Box' # -------------------------------------- # Data Loading Based on Model Run Plan @@ -96,6 +103,8 @@ def load_data_for_runs( global rtp2025_urban_area_crosswalk_df global rtp2025_transit_service_df global rtp2025_taz_crosswalk_df + + global rtp2025_parcel_taz_crosswalk_df global rtp2025_np_parcel_inundation_df global rtp2025_dbp_parcel_inundation_df @@ -130,11 +139,27 @@ def load_data_for_runs( logging.info(" Read {:,} rows from crosswalk {}".format(len(rtp2025_urban_area_crosswalk_df), URBAN_AREA_CROSSWALK_FILE)) logging.debug(" rtp2025_urban_area_crosswalk_df.head():\n{}".format(rtp2025_urban_area_crosswalk_df.head())) - # transit service areas + # transit service areas (used through April 2024 - with n-category transit service areas including headway differentiation) + # We used it with the original transit_service_area_share(). + + # if len(rtp2025_transit_service_df) == 0: + # import geopandas as gpd + # PARCEL_TRANSITSERVICE_FILE = M_DRIVE / "Data" / "GIS layers" / "JobsHousingTransitProximity" / "update_2024" / "outputs" / "p10_topofix_classified.parquet" + # rtp2025_transit_service_df = pd.read_parquet(PARCEL_TRANSITSERVICE_FILE) + # transit_cols_keep = ['PARCEL_ID','area_type','Service_Level_np_cat5', 'Service_Level_fbp_cat5', 'Service_Level_current_cat5'] + # rtp2025_transit_service_df = rtp2025_transit_service_df[transit_cols_keep] + # logging.info(" Read {:,} rows from crosswalk {}".format(len(rtp2025_transit_service_df), PARCEL_TRANSITSERVICE_FILE)) + # logging.debug(" rtp2025_transit_service_df.head():\n{}".format(rtp2025_transit_service_df.head())) + + # simpler version with binary 1/0 classification instead of headway differentiation. 
We use with transit_service_area_share_v2(). + if len(rtp2025_transit_service_df) == 0: - PARCEL_TRANSITSERVICE_FILE = M_DRIVE / "Data" / "GIS layers" / "JobsHousingTransitProximity" / "update_2024" / "outputs" / "p10_topofix_classified.parquet" - rtp2025_transit_service_df = pd.read_parquet(PARCEL_TRANSITSERVICE_FILE) - transit_cols_keep = ['PARCEL_ID','area_type','Service_Level_np_cat5', 'Service_Level_fbp_cat5', 'Service_Level_current_cat5'] + import geopandas as gpd + PARCEL_TRANSITSERVICE_FILE = BOX_DIR / 'Plan Bay Area 2050+' / 'Blueprint' / \ + 'Draft Blueprint Modeling and Metrics' / \ + 'transportation' / "p10_x_transit_area_identity.csv" + rtp2025_transit_service_df = pd.read_csv(PARCEL_TRANSITSERVICE_FILE) + transit_cols_keep = ['parcel_id','cur','np', 'dbp'] rtp2025_transit_service_df = rtp2025_transit_service_df[transit_cols_keep] logging.info(" Read {:,} rows from crosswalk {}".format(len(rtp2025_transit_service_df), PARCEL_TRANSITSERVICE_FILE)) logging.debug(" rtp2025_transit_service_df.head():\n{}".format(rtp2025_transit_service_df.head())) @@ -241,6 +266,7 @@ def load_data_for_runs( logging.debug("final rtp2025_tract_crosswalk_df.dtypes():\n{}".format(rtp2025_tract_crosswalk_df.dtypes)) # columns are: parcel_id, tract10, tract20, tract20_epc, tract20_growth_geo, tract20_tra, tract20_hra, tract10_DispRisk + if len(rtp2025_taz_crosswalk_df) == 0: # taz-based lookups @@ -249,6 +275,43 @@ def load_data_for_runs( logging.info(" Read {:,} rows from crosswalk {}".format(len(rtp2025_taz_crosswalk_df), TAZ_EPC_CROSSWALK_FILE)) logging.debug(" rtp2025_taz_crosswalk_df.head():\n{}".format(rtp2025_taz_crosswalk_df.head())) + if len(rtp2025_parcel_taz_crosswalk_df)==0: + + # parcels to taz crosswalk - we need this for the area_type (suburban/urban/rural) taz-based classification + + PARCEL_TAZ_CROSSWALK_FILE = M_DRIVE / "urban_modeling" / "baus" / "BAUS Inputs" / "basis_inputs" / "crosswalks" / "2020_08_17_parcel_to_taz1454sub.csv" + 
rtp2025_parcel_taz_crosswalk_df = pd.read_csv(PARCEL_TAZ_CROSSWALK_FILE, usecols=['PARCEL_ID', 'ZONE_ID']) + rtp2025_parcel_taz_crosswalk_df.columns = rtp2025_parcel_taz_crosswalk_df.columns.str.lower() + logging.info(" Read {:,} rows from crosswalk {}".format(len(rtp2025_parcel_taz_crosswalk_df), PARCEL_TAZ_CROSSWALK_FILE)) + logging.debug(" rtp2025_parcel_taz_crosswalk_df.head():\n{}".format(rtp2025_parcel_taz_crosswalk_df.head())) + + # taz-based lookups to area_type (urban/suburban/rural) + TAZ_AREATYPE_CROSSWALK_FILE = METRICS_DIR / "metrics_input_files" / "taz_urban_suburban.csv" + rtp2025_taz_areatype_crosswalk_df = pd.read_csv(TAZ_AREATYPE_CROSSWALK_FILE, usecols=['TAZ1454','area_type']) + logging.info(" Read {:,} rows from taz areatype crosswalk {}".format(len(rtp2025_taz_areatype_crosswalk_df), TAZ_AREATYPE_CROSSWALK_FILE)) + logging.debug(" rtp2025_taz_areatype_crosswalk_df.head():\n{}".format(rtp2025_taz_areatype_crosswalk_df.head())) + + + rtp2025_parcel_taz_crosswalk_df = pd.merge( + left = rtp2025_parcel_taz_crosswalk_df, + right = rtp2025_taz_areatype_crosswalk_df, + left_on = 'zone_id', + right_on = 'TAZ1454', + how = 'left', + validate = 'many_to_one', + indicator=True + ) + logging.debug("rtp2025_parcel_taz_crosswalk_df._merge.value_counts():\n{}".format( + rtp2025_parcel_taz_crosswalk_df._merge.value_counts())) + rtp2025_parcel_taz_crosswalk_df.drop(columns=['_merge'], inplace=True) + + # fillna with zero + rtp2025_parcel_taz_crosswalk_df.fillna(0, inplace=True) + + logging.debug("rtp2025_parcel_taz_crosswalk_df.head():\n{}".format(rtp2025_parcel_taz_crosswalk_df)) + logging.debug("rtp2025_parcel_taz_crosswalk_df.dtypes():\n{}".format(rtp2025_parcel_taz_crosswalk_df.dtypes)) + + if len(rtp2025_np_parcel_inundation_df) == 0: PARCEL_INUNDATION_FILE = METRICS_DIR / "metrics_input_files" / "slr_parcel_inundation_PBA50Plus_NP.csv" rtp2025_np_parcel_inundation_df = pd.read_csv(PARCEL_INUNDATION_FILE) @@ -355,11 +418,27 @@ def load_data_for_runs( 
logging.debug("final rtp2021_tract_crosswalk_df.head():\n{}".format(rtp2021_tract_crosswalk_df)) # columns are: parcel_id, tract10, tract10_epc, tract10_DispRisk, tract10_hra, tract10_growth_geo, tract10_tra - # transit service areas # works for both RTP2021 and RTP2025 + # transit service areas (used through April 2024 - with n-category transit service areas including headway differentiation) + # We used it with the original transit_service_area_share(). + + # if len(rtp2025_transit_service_df) == 0: + # import geopandas as gpd + # PARCEL_TRANSITSERVICE_FILE = M_DRIVE / "Data" / "GIS layers" / "JobsHousingTransitProximity" / "update_2024" / "outputs" / "p10_topofix_classified.parquet" + # rtp2025_transit_service_df = pd.read_parquet(PARCEL_TRANSITSERVICE_FILE) + # transit_cols_keep = ['PARCEL_ID','area_type','Service_Level_np_cat5', 'Service_Level_fbp_cat5', 'Service_Level_current_cat5'] + # rtp2025_transit_service_df = rtp2025_transit_service_df[transit_cols_keep] + # logging.info(" Read {:,} rows from crosswalk {}".format(len(rtp2025_transit_service_df), PARCEL_TRANSITSERVICE_FILE)) + # logging.debug(" rtp2025_transit_service_df.head():\n{}".format(rtp2025_transit_service_df.head())) + + # simpler version with binary 1/0 classification instead of headway differentiation. We use it with transit_service_area_share_v2(). 
+ if len(rtp2025_transit_service_df) == 0: - PARCEL_TRANSITSERVICE_FILE = M_DRIVE / "Data" / "GIS layers" / "JobsHousingTransitProximity" / "update_2024" / "outputs" / "p10_topofix_classified.parquet" - rtp2025_transit_service_df = pd.read_parquet(PARCEL_TRANSITSERVICE_FILE) - transit_cols_keep = ['PARCEL_ID','area_type','Service_Level_np_cat5', 'Service_Level_fbp_cat5', 'Service_Level_current_cat5'] + import geopandas as gpd + PARCEL_TRANSITSERVICE_FILE = BOX_DIR / 'Plan Bay Area 2050+' / 'Blueprint' / \ + 'Draft Blueprint Modeling and Metrics' / \ + 'transportation' / "p10_x_transit_area_identity.csv" + rtp2025_transit_service_df = pd.read_csv(PARCEL_TRANSITSERVICE_FILE) + transit_cols_keep = ['parcel_id','cur','np', 'dbp'] rtp2025_transit_service_df = rtp2025_transit_service_df[transit_cols_keep] logging.info(" Read {:,} rows from crosswalk {}".format(len(rtp2025_transit_service_df), PARCEL_TRANSITSERVICE_FILE)) logging.debug(" rtp2025_transit_service_df.head():\n{}".format(rtp2025_transit_service_df.head())) @@ -476,13 +555,27 @@ def load_data_for_runs( left = parcel_df, right = rtp2025_transit_service_df, how = "left", - left_on = "parcel_id", - right_on ="PARCEL_ID", + on = "parcel_id", + #right_on = "PARCEL_ID", # not needed with the current crosswalk validate = "one_to_one" ) logging.debug("parcel_df.dtypes:\n{}".format(parcel_df.dtypes)) - logging.debug("Head after merge with rtp2025_tract_crosswalk_df:\n{}".format(parcel_df.head())) + logging.debug("Head after merge with rtp2025_transit_service_df:\n{}".format(parcel_df.head())) + + # add area_type (urban/suburban/rural) lookups + parcel_df = pd.merge( + left = parcel_df, + right = rtp2025_parcel_taz_crosswalk_df, + how = "left", + on = "parcel_id", + #right_on = "PARCEL_ID", # not needed with the current crosswalk + validate = "one_to_one" + ) + + logging.debug("parcel_df.dtypes:\n{}".format(parcel_df.dtypes)) + logging.debug("Head after merge with 
rtp2025_parcel_taz_crosswalk_df:\n{}".format(parcel_df.head())) + # add parcel lookup for 2020 urban area footprint parcel_df = pd.merge( @@ -550,8 +643,8 @@ def load_data_for_runs( left = parcel_df, right = rtp2025_transit_service_df, how = "left", - left_on = "parcel_id", - right_on ="PARCEL_ID", + on = "parcel_id", + #right_on ="PARCEL_ID", validate = "one_to_one" ) @@ -598,11 +691,16 @@ def load_data_for_runs( 'MWTEMPN', 'RETEMPN', 'FPSEMPN', 'HEREMPN', 'OTHEMPN', # tract-level columns 'tract10_epc', 'tract10_DispRisk', 'tract10_hra', 'tract10_growth_geo', 'tract10_tra', + # transit-related columns - 'area_type','Service_Level_np_cat5', 'Service_Level_fbp_cat5', 'Service_Level_current_cat5', + #'area_type','Service_Level_np_cat5', 'Service_Level_fbp_cat5', 'Service_Level_current_cat5', + + # use after may 3 2024 + 'np','cur','dbp', + # sea level rise column "inundation"] - + parcel_df = parcel_df[columns_to_keep] logging.debug("parcel_df:\n{}".format(parcel_df.head(30)))