From 69c7e8c55260c1c4c7eebdd697c227caa82d74ba Mon Sep 17 00:00:00 2001 From: Kuan Butts Date: Thu, 20 Apr 2017 14:33:47 -0700 Subject: [PATCH 1/4] need to reset index and drop index name if exists in col of dataframe --- urbanaccess/gtfs/network.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index a056b59..71e49de 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -290,8 +290,14 @@ def interpolatestoptimes(stop_times_df, calendar_selected_trips_df, day): df_for_interpolation['stop_sequence_merge'] = ( df_for_interpolation[~trailing]['stop_sequence']) + # Need to check if existing index in column names and drop if so (else + # a ValueError where Pandas can't insert b/c col already exists will occur) + drop_bool = False + if _check_if_index_name_in_cols(df_for_interpolation): + drop_bool = True + df_for_interpolation.reset_index(inplace=True, drop=drop_bool) + # Merge back into original index - df_for_interpolation.reset_index(inplace=True) interpolated_df = pd.merge(df_for_interpolation, melted, 'left', on=['stop_sequence_merge', 'unique_trip_id']) interpolated_df.set_index('index', inplace=True) @@ -763,4 +769,10 @@ def load_processed_gtfs_data(dir=config.settings.data_folder,filename=None): if 'calendar_dates' in store.keys(): gtfsfeeds_df.calendar_dates = hdf5_to_df(dir=dir,filename=filename,key='calendar_dates') - return gtfsfeeds_df \ No newline at end of file + return gtfsfeeds_df + +# helper functions +def _check_if_index_name_in_cols(df): + cols = df.columns.values + iname = df.index.name + return (iname in cols) From b7c32d69e91b3f3f36e827f0bd65382b459a4e82 Mon Sep 17 00:00:00 2001 From: Kuan Butts Date: Thu, 20 Apr 2017 15:30:25 -0700 Subject: [PATCH 2/4] set index if _check_if_index_name_in_cols true to ensure no snag on line 306 --- urbanaccess/gtfs/network.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index 71e49de..f1f53c6 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -292,8 +292,11 @@ def interpolatestoptimes(stop_times_df, calendar_selected_trips_df, day): # Need to check if existing index in column names and drop if so (else # a ValueError where Pandas can't insert b/c col already exists will occur) - drop_bool = False if _check_if_index_name_in_cols(df_for_interpolation): + # move the current index to own col named 'index' + col_name_to_copy = df_for_interpolation.index.name + col_to_copy = df_for_interpolation[col_name_to_copy].copy() + df_for_interpolation['index'] = col_to_copy drop_bool = True df_for_interpolation.reset_index(inplace=True, drop=drop_bool) @@ -303,9 +306,19 @@ def interpolatestoptimes(stop_times_df, calendar_selected_trips_df, day): interpolated_df.set_index('index', inplace=True) interpolated_times = interpolated_df[['departure_time_sec_interpolate']] - final_stop_times_df = pd.merge(stop_times_df, interpolated_times, - how='left', left_index=True, - right_index=True, sort=False, copy=False) + # default value for final_stop_times + final_stop_times_df = stop_times_df + + # if empty just duplicate departure_time_sec col + if interpolated_times.empty: + departures = final_stop_times_df['departure_time_sec'].copy() + final_stop_times_df['departure_time_sec_interpolate'] = departures + + # if df not empty, override the default final_stop_times with merge result + else: + final_stop_times_df = pd.merge(stop_times_df, interpolated_times, + how='left', left_index=True, + right_index=True, sort=False, copy=False) # fill in nulls in interpolated departure time column using trips that did not need interpolation in order to create # one column with both original and interpolated times From 79791899dbd7ebefe1f84542a5ef423254e3f958 Mon Sep 17 00:00:00 2001 From: Kuan Butts Date: Thu, 20 Apr 2017 15:33:41 -0700 Subject: [PATCH 3/4] dropped interpolated_times empty check, it over complicates the function just to skip the merge step, when there really is not significant perf gains to be achieved through this --- urbanaccess/gtfs/network.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index f1f53c6..21339ed 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -306,19 +306,9 @@ def interpolatestoptimes(stop_times_df, calendar_selected_trips_df, day): interpolated_df.set_index('index', inplace=True) interpolated_times = interpolated_df[['departure_time_sec_interpolate']] - # default value for final_stop_times - final_stop_times_df = stop_times_df - - # if empty just duplicate departure_time_sec col - if interpolated_times.empty: - departures = final_stop_times_df['departure_time_sec'].copy() - final_stop_times_df['departure_time_sec_interpolate'] = departures - - # if df not empty, override the default final_stop_times with merge result - else: - final_stop_times_df = pd.merge(stop_times_df, interpolated_times, - how='left', left_index=True, - right_index=True, sort=False, copy=False) + final_stop_times_df = pd.merge(stop_times_df, interpolated_times, + how='left', left_index=True, + right_index=True, sort=False, copy=False) # fill in nulls in interpolated departure time column using trips that did not need interpolation in order to create # one column with both original and interpolated times From cd254626e396ebac56a165f9093f45e5bb8910a0 Mon Sep 17 00:00:00 2001 From: Kuan Butts Date: Thu, 20 Apr 2017 15:36:52 -0700 Subject: [PATCH 4/4] readd drop_bool = False --- urbanaccess/gtfs/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index 21339ed..17e6bf2 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -292,6 +292,7 @@ def interpolatestoptimes(stop_times_df, calendar_selected_trips_df, day): # Need to check if existing index in column names and drop if so (else # a ValueError where Pandas can't insert b/c col already exists will occur) + drop_bool = False if _check_if_index_name_in_cols(df_for_interpolation): # move the current index to own col named 'index' col_name_to_copy = df_for_interpolation.index.name