Skip to content

Commit

Permalink
Merge pull request #1082 from cal-itp/stage-xml
Browse files Browse the repository at this point in the history
Fix duplicates in GTFS data availability; XML for April 2024 open data
  • Loading branch information
tiffanychu90 authored Apr 24, 2024
2 parents 92dc961 + 24ea1e3 commit e9d8466
Show file tree
Hide file tree
Showing 14 changed files with 49 additions and 589 deletions.
2 changes: 2 additions & 0 deletions gtfs_digest/_section2_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ def pct_vp_journey(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
Operator Level
"""
def trips_by_gtfs(df):
df = df.loc[df.time_period=="all_day"]

by_date_category = (
pd.crosstab(
df.service_date,
Expand Down
38 changes: 22 additions & 16 deletions gtfs_digest/merge_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def concatenate_crosswalk_organization(

def merge_in_standardized_route_names(
df: pd.DataFrame,
set_typology: bool
) -> pd.DataFrame:

keep_cols = [
Expand Down Expand Up @@ -159,27 +160,30 @@ def merge_in_standardized_route_names(
on = ["schedule_gtfs_dataset_key",
"route_id", "service_date"],
how = "left",
)
).drop_duplicates()

if set_typology:
primary_typology_df = set_primary_typology(df2)

# primary typology
primary_typology_df = set_primary_typology(df2)
df2 = pd.merge(
df2,
primary_typology_df,
on = route_time_cols,
how = "inner"
)

df3 = pd.merge(
df2,
primary_typology_df,
on = route_time_cols,
how = "inner"
)
# Clean up

# After merging, we can replace route_id with recent_route_id2
drop_cols = ["route_desc", "combined_name", "route_id2"] + [
c for c in df3.columns if "is_" in c and
c for c in df2.columns if "is_" in c and
c not in ["is_early", "is_late", "is_ontime"]
]
df4 = time_series_utils.parse_route_combined_name(df3).drop(
columns = drop_cols)

return df4

df3 = time_series_utils.parse_route_combined_name(df2).drop(
columns = drop_cols).drop_duplicates().reset_index(drop=True)

return df3


def set_primary_typology(df: pd.DataFrame) -> pd.DataFrame:
Expand Down Expand Up @@ -260,7 +264,8 @@ def set_primary_typology(df: pd.DataFrame) -> pd.DataFrame:
sched_rt_category = df.sched_rt_category.map(
gtfs_schedule_wrangling.sched_rt_category_dict)
).pipe(
merge_in_standardized_route_names
merge_in_standardized_route_names,
set_typology=True
).merge(
df_crosswalk,
on = ["schedule_gtfs_dataset_key", "name", "service_date"],
Expand All @@ -283,7 +288,8 @@ def set_primary_typology(df: pd.DataFrame) -> pd.DataFrame:
segment_speeds = concatenate_segment_speeds_by_route_direction(
analysis_date_list
).pipe(
merge_in_standardized_route_names
merge_in_standardized_route_names,
set_typology=False
).astype({"direction_id": "int64"}) #Int64 doesn't work for gdf

utils.geoparquet_gcs_export(
Expand Down
6 changes: 3 additions & 3 deletions gtfs_digest/merge_operator_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def concatenate_operator_stats(
date_list: list
) -> pd.DataFrame:
FILE = GTFS_DATA_DICT.digest_tables.operator_scheduled_stats
FILE = GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats

df = time_series_utils.concatenate_datasets_across_dates(
SCHED_GCS,
Expand All @@ -31,7 +31,7 @@ def concatenate_operator_stats(
def concatenate_operator_routes(
date_list: list
) -> gpd.GeoDataFrame:
FILE = GTFS_DATA_DICT.operator_routes.operator_routes
FILE = GTFS_DATA_DICT.schedule_tables.operator_routes

df = time_series_utils.concatenate_datasets_across_dates(
SCHED_GCS,
Expand Down Expand Up @@ -118,7 +118,7 @@ def operator_category_counts_by_date() -> pd.DataFrame:

gdf = concatenate_operator_routes(
analysis_date_list
).pipe(merge_in_standardized_route_names)
).pipe(merge_in_standardized_route_names, set_typology=False)

utils.geoparquet_gcs_export(
gdf,
Expand Down
Loading

0 comments on commit e9d8466

Please sign in to comment.