Skip to content

Commit

Permalink
Merge pull request #1082 from cal-itp/stage-xml
Browse files Browse the repository at this point in the history
Fix duplicates in GTFS data availability; XML for April 2024 open data
  • Loading branch information
tiffanychu90 authored Apr 24, 2024
2 parents 92dc961 + 24ea1e3 commit e9d8466
Show file tree
Hide file tree
Showing 14 changed files with 49 additions and 589 deletions.
2 changes: 2 additions & 0 deletions gtfs_digest/_section2_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ def pct_vp_journey(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
Operator Level
"""
def trips_by_gtfs(df):
df = df.loc[df.time_period=="all_day"]

by_date_category = (
pd.crosstab(
df.service_date,
Expand Down
38 changes: 22 additions & 16 deletions gtfs_digest/merge_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def concatenate_crosswalk_organization(

def merge_in_standardized_route_names(
df: pd.DataFrame,
set_typology: bool
) -> pd.DataFrame:

keep_cols = [
Expand Down Expand Up @@ -159,27 +160,30 @@ def merge_in_standardized_route_names(
on = ["schedule_gtfs_dataset_key",
"route_id", "service_date"],
how = "left",
)
).drop_duplicates()

if set_typology:
primary_typology_df = set_primary_typology(df2)

# primary typology
primary_typology_df = set_primary_typology(df2)
df2 = pd.merge(
df2,
primary_typology_df,
on = route_time_cols,
how = "inner"
)

df3 = pd.merge(
df2,
primary_typology_df,
on = route_time_cols,
how = "inner"
)
# Clean up

# After merging, we can replace route_id with recent_route_id2
drop_cols = ["route_desc", "combined_name", "route_id2"] + [
c for c in df3.columns if "is_" in c and
c for c in df2.columns if "is_" in c and
c not in ["is_early", "is_late", "is_ontime"]
]
df4 = time_series_utils.parse_route_combined_name(df3).drop(
columns = drop_cols)

return df4

df3 = time_series_utils.parse_route_combined_name(df2).drop(
columns = drop_cols).drop_duplicates().reset_index(drop=True)

return df3


def set_primary_typology(df: pd.DataFrame) -> pd.DataFrame:
Expand Down Expand Up @@ -260,7 +264,8 @@ def set_primary_typology(df: pd.DataFrame) -> pd.DataFrame:
sched_rt_category = df.sched_rt_category.map(
gtfs_schedule_wrangling.sched_rt_category_dict)
).pipe(
merge_in_standardized_route_names
merge_in_standardized_route_names,
set_typology=True
).merge(
df_crosswalk,
on = ["schedule_gtfs_dataset_key", "name", "service_date"],
Expand All @@ -283,7 +288,8 @@ def set_primary_typology(df: pd.DataFrame) -> pd.DataFrame:
segment_speeds = concatenate_segment_speeds_by_route_direction(
analysis_date_list
).pipe(
merge_in_standardized_route_names
merge_in_standardized_route_names,
set_typology=False
).astype({"direction_id": "int64"}) #Int64 doesn't work for gdf

utils.geoparquet_gcs_export(
Expand Down
6 changes: 3 additions & 3 deletions gtfs_digest/merge_operator_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def concatenate_operator_stats(
date_list: list
) -> pd.DataFrame:
FILE = GTFS_DATA_DICT.digest_tables.operator_scheduled_stats
FILE = GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats

df = time_series_utils.concatenate_datasets_across_dates(
SCHED_GCS,
Expand All @@ -31,7 +31,7 @@ def concatenate_operator_stats(
def concatenate_operator_routes(
date_list: list
) -> gpd.GeoDataFrame:
FILE = GTFS_DATA_DICT.operator_routes.operator_routes
FILE = GTFS_DATA_DICT.schedule_tables.operator_routes

df = time_series_utils.concatenate_datasets_across_dates(
SCHED_GCS,
Expand Down Expand Up @@ -118,7 +118,7 @@ def operator_category_counts_by_date() -> pd.DataFrame:

gdf = concatenate_operator_routes(
analysis_date_list
).pipe(merge_in_standardized_route_names)
).pipe(merge_in_standardized_route_names, set_typology=False)

utils.geoparquet_gcs_export(
gdf,
Expand Down
Loading

0 comments on commit e9d8466

Please sign in to comment.