Skip to content

Commit

Permalink
enable utils_format.py and load.gtfsfeed_to_df to complete successfully (#13)
Browse files Browse the repository at this point in the history

* incl service_id in trips for cal dates step

* notes, todo comments, and a helper function to replace the various string elements of agency id

* use _generate_unique_agency_id in all related steps in all helper functions

* more todo notations and final df_list update

* todo re: df appending in load.py

* note about assertions in headways

* remove todo tag and implement single run of subsetting dfs instead of multiple times per comment pull/13#discussion_r112552991

* remove todo tag and implement single run of subsetting dfs instead of multiple times per comment pull/13#discussion_r112552991
  • Loading branch information
kuanb authored and sablanchard committed Apr 20, 2017
1 parent e308e1a commit e13406e
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 42 deletions.
5 changes: 5 additions & 0 deletions urbanaccess/gtfs/headways.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,15 @@ def headways(gtfsfeeds_df,headway_timerange):
route stop headways in units of minutes
with relevant route and stop information
"""

# TODO: Assertions in code during runtime should be handled in some other way.
# For example, class descriptors may be appropriate
# (http://stackoverflow.com/questions/944592/best-practice-for-python-assert)
time_error_statement = ('{} starttime and endtime are not in the correct format. '
'Format should be 24 hour clock in following format: 08:00:00 or 17:00:00'.format(headway_timerange))
assert isinstance(headway_timerange,list) and len(headway_timerange) == 2, time_error_statement
assert headway_timerange[0] < headway_timerange[1], time_error_statement

for t in headway_timerange:
assert isinstance(t,str), time_error_statement
assert len(t) == 8, time_error_statement
Expand Down
3 changes: 3 additions & 0 deletions urbanaccess/gtfs/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,9 @@ def gtfsfeed_to_df(gtfsfeed_path=None,validation=False,verbose=True,bbox=None,re
trips_df=trips_df[['trip_id','route_id']],
info_to_append='route_type_to_stop_times')

# TODO: Appending seems improper; given the initial dataframes are empty we should either override or
# otherwise replace. Appending leaves the opportunity open for accidental repeats runs of this function to
# produce unintended side effects (double long dataframes, etc.).
merged_stops_df = merged_stops_df.append(stops_df,ignore_index=True)
merged_routes_df = merged_routes_df.append(routes_df,ignore_index=True)
merged_trips_df = merged_trips_df.append(trips_df,ignore_index=True)
Expand Down
134 changes: 92 additions & 42 deletions urbanaccess/gtfs/utils_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,8 @@ def calendar_dates_agencyid(calendar_dates_df=None, routes_df=None,trips_df=None
tmp1 = pd.merge(routes_df, agency_df, how='left', on='agency_id', sort=False, copy=False)
tmp2 = pd.merge(trips_df, tmp1, how='left', on='route_id', sort=False, copy=False)
merged_df = pd.merge(calendar_dates_df, tmp2, how='left', on='service_id', sort=False, copy=False)
merged_df['unique_agency_id'] = sub(r'\s+', '_', merged_df['agency_name']).str.replace('&','and').lower()

merged_df['unique_agency_id'] = _generate_unique_agency_id(merged_df, 'agency_name')
merged_df.drop_duplicates(subset='service_id', keep='first', inplace=True)

merged_df = pd.merge(calendar_dates_df, merged_df[['unique_agency_id', 'service_id']], how='left',
Expand Down Expand Up @@ -237,8 +238,8 @@ def calendar_agencyid(calendar_df=None,routes_df=None,trips_df=None,agency_df=No
"""
tmp1 = pd.merge(routes_df, agency_df, how='left', on='agency_id', sort=False, copy=False)
tmp2 = pd.merge(trips_df, tmp1, how='left', on='route_id', sort=False, copy=False)
merged_df = pd.merge(calendar_df['service_id'], tmp2, how='left', on='service_id', sort=False, copy=False)
merged_df['unique_agency_id'] = sub(r'\s+', '_', merged_df['agency_name']).str.replace('&','and').lower()
merged_df = pd.merge(calendar_df[['service_id']], tmp2, how='left', on='service_id', sort=False, copy=False)
merged_df['unique_agency_id'] = _generate_unique_agency_id(merged_df, 'agency_name')
merged_df.drop_duplicates(subset='service_id', keep='first', inplace=True)

merged_df = pd.merge(calendar_df, merged_df[['unique_agency_id', 'service_id']], how='left',
Expand All @@ -265,7 +266,7 @@ def trips_agencyid(trips_df=None,routes_df=None, agency_df=None):
"""
tmp1 = pd.merge(routes_df, agency_df, how='left', on='agency_id', sort=False, copy=False)
merged_df = pd.merge(trips_df[['trip_id', 'route_id']], tmp1, how='left', on='route_id', sort=False, copy=False)
merged_df['unique_agency_id'] = sub(r'\s+', '_', merged_df['agency_name']).str.replace('&','and').lower()
merged_df['unique_agency_id'] = _generate_unique_agency_id(merged_df, 'agency_name')
merged_df.drop_duplicates(subset='trip_id', keep='first', inplace=True)

merged_df = pd.merge(trips_df, merged_df[['unique_agency_id', 'trip_id']], how='left', on='trip_id',
Expand Down Expand Up @@ -297,7 +298,7 @@ def stops_agencyid(stops_df=None, trips_df=None, routes_df=None,stop_times_df=No
tmp2 = pd.merge(trips_df, tmp1, how='left', on='route_id', sort=False, copy=False)
tmp3 = pd.merge(stop_times_df, tmp2, how='left', on='trip_id', sort=False, copy=False)
merged_df = pd.merge(stops_df[['stop_id']], tmp3, how='left', on='stop_id', sort=False, copy=False)
merged_df['unique_agency_id'] = sub(r'\s+', '_', merged_df['agency_name']).str.replace('&','and').lower()
merged_df['unique_agency_id'] = _generate_unique_agency_id(merged_df, 'agency_name')
merged_df.drop_duplicates(subset='stop_id', keep='first', inplace=True)

merged_df = pd.merge(stops_df, merged_df[['unique_agency_id', 'stop_id']], how='left', on='stop_id',
Expand All @@ -320,7 +321,7 @@ def routes_agencyid(routes_df=None, agency_df=None):
merged_df : pandas.DataFrame
"""
merged_df = pd.merge(routes_df[['route_id', 'agency_id']], agency_df, how='left', on='agency_id', sort=False, copy=False)
merged_df['unique_agency_id'] = sub(r'\s+', '_', merged_df['agency_name']).str.replace('&','and').lower()
merged_df['unique_agency_id'] = _generate_unique_agency_id(merged_df, 'agency_name')

merged_df = pd.merge(routes_df, merged_df[['unique_agency_id', 'route_id']], how='left', on='route_id',
sort=False, copy=False)
Expand All @@ -346,9 +347,9 @@ def stop_times_agencyid(stop_times_df=None, routes_df=None,trips_df=None, agency
merged_df : pandas.DataFrame
"""
tmp1 = pd.merge(routes_df, agency_df, how='left', on='agency_id', sort=False, copy=False)
tmp2 = pd.merge(trips_df['trip_id'], tmp1, how='left', on='route_id', sort=False, copy=False)
tmp2 = pd.merge(trips_df[['trip_id', 'route_id']], tmp1, how='left', on='route_id', sort=False, copy=False)
merged_df = pd.merge(stop_times_df, tmp2, how='left', on='trip_id', sort=False, copy=False)
merged_df['unique_agency_id'] = sub(r'\s+', '_', merged_df['agency_name']).str.replace('&','and').lower()
merged_df['unique_agency_id'] = _generate_unique_agency_id(merged_df, 'agency_name')
merged_df.drop_duplicates(subset='trip_id', keep='first',inplace=True)

merged_df = pd.merge(stop_times_df, merged_df[['unique_agency_id','trip_id']], how='left', on='trip_id', sort=False, copy=False)
Expand Down Expand Up @@ -388,12 +389,16 @@ def add_unique_agencyid(agency_df=None,stops_df=None,routes_df=None,trips_df=Non

df_list = [stops_df,routes_df,trips_df,stop_times_df,calendar_df,calendar_dates_df]

if ((os.path.exists(os.path.join(feed_folder,'agency.txt')) == False or
'agency_id' not in agency_df.columns) and
nulls_as_folder == True):
for df in df_list:
path_absent = os.path.exists(os.path.join(feed_folder,'agency.txt')) == False
agency_absent = 'agency_id' not in agency_df.columns
if ((path_absent or agency_absent) and nulls_as_folder == True):

for index, df in enumerate(df_list):
# TODO: We seem to be repeating this pattern in a number of places - either do it once or use a helper function
unique_agency_id = sub(r'\s+', '_', os.path.split(feed_folder)[1]).replace('&','and').lower()
df['unique_agency_id'] = unique_agency_id
df_list[index] = df

log('The agency.txt or agency_id column was not found. The unique agency id: {} was generated using the name of the folder containing the GTFS feed text files.'.format(unique_agency_id))

elif os.path.exists(os.path.join(feed_folder,'agency.txt')) == False and nulls_as_folder == False:
Expand All @@ -406,53 +411,89 @@ def add_unique_agencyid(agency_df=None,stops_df=None,routes_df=None,trips_df=Non

if len(agency_df['agency_name']) == 1:
assert agency_df['agency_name'].isnull().values == False

# TODO: Again, this need to be moved into a helper function
unique_agency_id = sub(r'\s+', '_', agency_df['agency_name'][0]).replace('&','and').lower()
for df in df_list:

for index, df in enumerate(df_list):
df['unique_agency_id'] = unique_agency_id
df_list[index] = df
log('The unique agency id: {} was generated using the name of the agency in the agency.txt file.'.format(unique_agency_id))

elif len(agency_df['agency_name']) > 1:
# TODO: Assertions shouldn't be in runtime - validation should
# either be prior to model execution or handled gracefully
# through caught errors/exceptions
assert agency_df[['agency_id','agency_name']].isnull().values.any() == False

calendar_dates_df = calendar_dates_agencyid(calendar_dates_df=calendar_dates_df,
routes_df=routes_df[['route_id', 'agency_id']],
trips_df=trips_df[['trip_id', 'route_id']],
agency_df=agency_df[['agency_id','agency_name']])

calendar_df = calendar_agencyid(calendar_df=calendar_df,
routes_df=routes_df[['route_id', 'agency_id']],
trips_df=trips_df[['trip_id', 'route_id']],
agency_df=agency_df[['agency_id','agency_name']])
trips_df = trips_agencyid(trips_df=trips_df,
routes_df=routes_df[['route_id', 'agency_id']],
agency_df=agency_df[['agency_id','agency_name']])

stops_df = stops_agencyid(stops_df=stops_df,
trips_df=trips_df[['trip_id', 'route_id']],
routes_df=routes_df[['route_id', 'agency_id']],
stop_times_df=stop_times_df[['trip_id', 'stop_id']],
agency_df=agency_df[['agency_id','agency_name']])

routes_df = routes_agencyid(routes_df=routes_df,
agency_df=agency_df[['agency_id','agency_name']])

stop_times_df = stop_times_agencyid(stop_times_df=stop_times_df,
routes_df=routes_df[['route_id', 'agency_id']],
trips_df=trips_df[['trip_id', 'route_id']],
agency_df=agency_df[['agency_id','agency_name']])

# only generate subset dataframes once, instead of for each keyword argument
# in the below helper function
subset_agency_df = agency_df[['agency_id','agency_name']]
subset_routes_df = routes_df[['route_id', 'agency_id']]
subset_stop_times_df = stop_times_df[['trip_id', 'stop_id']]
subset_trips_df = trips_df[['trip_id', 'route_id']]
subset_trips_df_w_sid = trips_df[['trip_id', 'route_id', 'service_id']]

# TODO: In each of the steps, the functions foo_agencyid ought be prepended with an underscore (e.g.
# foo_agencyid() to _foo_agencyid()) in order to signify that these are helper functions for this
# step, and not exported out of this .py file
calendar_dates_replacement_df = calendar_dates_agencyid(
calendar_dates_df=calendar_dates_df,
routes_df=subset_routes_df,
trips_df=subset_trips_df_w_sid,
agency_df=subset_agency_df)

calendar_replacement_df = calendar_agencyid(
calendar_df=calendar_df,
routes_df=subset_routes_df,
trips_df=subset_trips_df_w_sid,
agency_df=subset_agency_df)

trips_replacement_df = trips_agencyid(
trips_df=trips_df,
routes_df=subset_routes_df,
agency_df=subset_agency_df)

stops_replacement_df = stops_agencyid(
stops_df=stops_df,
trips_df=subset_trips_df,
routes_df=subset_routes_df,
stop_times_df=subset_stop_times_df,
agency_df=subset_agency_df)

routes_replacement_df = routes_agencyid(
routes_df=routes_df,
agency_df=subset_agency_df)

stop_times_replacement_df = stop_times_agencyid(
stop_times_df=stop_times_df,
routes_df=subset_routes_df,
trips_df=subset_trips_df,
agency_df=subset_agency_df)

# need to update the df_list object with these new variable overrides
df_list = [stops_replacement_df,
routes_replacement_df,
trips_replacement_df,
stop_times_replacement_df,
calendar_replacement_df,
calendar_dates_replacement_df]

log('agency.txt agency_name column has more than one agency name listed. Unique agency id was assigned using the agency id and associated agency name.')

for df in df_list:
for index, df in enumerate(df_list):
if df['unique_agency_id'].isnull().values.any():
# TODO: These string conversions seem to follow a pattern, could be part of the helper function?
unique_agency_id = sub(r'\s+', '_', os.path.split(feed_folder)[1]).replace('&','and').lower()

df['unique_agency_id'].fillna(''.join(['multiple_operators_', unique_agency_id]), inplace=True)
log('There are {} null values ({}% of total) without a unique agency id. '
'These records will be labeled as multiple_operators_ with the GTFS file folder '
'name'.format(df['unique_agency_id'].isnull().sum(),len(df),round((float(df['unique_agency_id'].isnull().sum()) / float(len(df)) *100))))
df_list[index] = df

log('Unique agency id operation complete. Took {:,.2f} seconds'.format(time.time()-start_time))
return stops_df,routes_df,trips_df,stop_times_df,calendar_df,calendar_dates_df
return df_list

def timetoseconds(df=None,time_cols=None):
"""
Expand Down Expand Up @@ -691,3 +732,12 @@ def append_route_type(stops_df=None, stop_times_df=None,routes_df=None,trips_df=

return stop_times_df

# helper/utility functions
def _generate_unique_agency_id(df, col_name):
col = df[col_name].astype(str)
# replace all runs of spaces with a single underscore
col_snake_case = col.str.replace(r'\s+', '_')
# replace all ampersands
col_snake_no_amps = col_snake_case.str.replace('&','and')
return col_snake_no_amps.str.lower()

0 comments on commit e13406e

Please sign in to comment.