From e8a2bb0ff13e90fababc01c50f98d0d185d01357 Mon Sep 17 00:00:00 2001
From: sablanchard
Date: Thu, 1 Apr 2021 11:39:33 -0700
Subject: [PATCH 01/38] update contribution guidelines

---
 CONTRIBUTING.md | 68 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 60 insertions(+), 8 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index b91e6e2..5c082b3 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,20 +1,68 @@
+Thanks for using UrbanAccess!
+
+This is an open source project that's part of the Urban Data Science Toolkit. Development and maintenance are a collaboration between UrbanSim Inc, U.C. Berkeley's Urban Analytics Lab, and other contributors.
+
+## If you have a problem:
+
+- Take a look at the [open issues](https://github.com/UDST/urbanaccess/issues) and [closed issues](https://github.com/UDST/urbanaccess/issues?q=is%3Aissue+is%3Aclosed) to see if there's already a related discussion
+
+- Open a new issue describing the problem -- if possible, include any error messages, a full reproducible example of the code that generated the error, the operating system and version of Python you're using, and versions of any libraries that may be relevant
+
+## Feature proposals:
+
+- Take a look at the [open issues](https://github.com/UDST/urbanaccess/issues) and [closed issues](https://github.com/UDST/urbanaccess/issues?q=is%3Aissue+is%3Aclosed) to see if there's already a related discussion
+
+- Post your proposal as a new issue, so we can discuss it (some proposals may not be a good fit for the project)
+
+## Contributing code:
+
+- Create a new branch of `UDST/urbanaccess/dev`, or fork the repository to your own account
+
+- Make your changes, following the existing styles for code and inline documentation
+
+- Add [tests](https://github.com/UDST/urbanaccess/tree/dev/urbanaccess/tests) if possible
+  - We use Pytest as the test suite
+
+- Run the tests and address any issues that are flagged. If issues are flagged that are not related to the PR, note that in a new comment in the PR
+  - Run the Pytest test suite: `py.test`
+  - UrbanAccess currently supports Python 2.7, 3.5, 3.6, 3.7, and 3.8. Tests will be run in these environments when the PR is created, but any issues flagged in these environments should also be addressed
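+  - To iterate more quickly, you can also run a single test module, e.g.: `py.test urbanaccess/tests/test_gtfs_load.py`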
+  - UrbanAccess also uses a series of integration tests to test entire workflows; run the integration tests:
+    - Run:
+      ```
+      cd demo
+      jupyter nbconvert --to python simple_example.ipynb
+      cd ../urbanaccess/tests/integration
+      python remove_nb_magic.py -in simple_example.py -out simple_example_clean.py
+      cd ../../../demo
+      python simple_example_clean.py
+      cd ../urbanaccess/tests/integration
+      python integration_madison.py
+      python integration_sandiego.py
+      ```
+  - Run the pycodestyle Python style guide checker: `pycodestyle --max-line-length=100 urbanaccess`
+
+- Open a pull request to the `UDST/urbanaccess` `dev` branch, including a writeup of your changes -- take a look at some of the closed PRs for examples
+
+- Current maintainers will review the code, suggest changes, and hopefully merge it and schedule it for an upcoming release
+
+## Updating the documentation:
+
+- See instructions in `docs/README.md`
+
 ## Preparing a release:
 
 - Make a new branch for release prep
 
-- Update the version number and changelog
+- Update the version number and changelog:
   - `CHANGELOG.md`
   - `setup.py`
   - `urbanaccess/__init__.py`
-  - `docs/source/conf.py`
   - `docs/source/index.rst`
+  - `docs/source/conf.py`
 
 - Make sure all the tests are passing, and check if updates are needed to `README.md` or to the documentation
 
-- Open a pull request to the master branch to finalize it
-
-- After merging, tag the release on GitHub and follow the distribution procedures below
+- Open a pull request to the `dev` branch to finalize it and wait for a PR review and approval
+- After the PR has been approved, it can be merged to `dev`. Then a release PR can be created from `dev` to merge into `master`. Once merged, tag the release on GitHub and follow the distribution procedures below.
 
 ## Distributing a release on PyPI (for pip installation):
@@ -24,17 +72,21 @@
 
 - Run `python setup.py sdist bdist_wheel --universal`
 
-- This should create a `dist` directory containing two package files -- delete any old ones before the next step
+- This should create a `dist` directory containing a gzip package file -- delete any old ones before the next step
 
 - Run `twine upload dist/*` -- this will prompt you for your pypi.org credentials
 
-- Check https://pypi.org/project/osmnet/ for the new version
+- Check https://pypi.org/project/urbanaccess/ for the new version
 
 ## Distributing a release on Conda Forge (for conda installation):
 
-- The [conda-forge/urbanaccess-feedstock](https://github.com/conda-forge/urbanaccess-feedstock) repository controls the Conda Forge release
+- The [conda-forge/urbanaccess-feedstock](https://github.com/conda-forge/urbanaccess-feedstock) repository controls the Conda Forge release, including which GitHub users have maintainer status for the repo
 
 - Conda Forge bots usually detect new releases on PyPI and set in motion the appropriate feedstock updates, which a current maintainer will need to approve and merge
 
+- Maintainers can add on additional changes before merging the PR, for example to update the requirements or edit the list of maintainers
+
+- You can also fork the feedstock and open a PR manually.
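+  (For a manual update, bumping the version number and sha256 checksum in the feedstock's `recipe/meta.yaml` is typically all that's needed.)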
It seems like this must be done from a personal account (not a group account like UDST) so that the bots can be granted permission for automated cleanup + - Check https://anaconda.org/conda-forge/urbanaccess for the new version (may take a few minutes for it to appear) \ No newline at end of file From ed05642c1915a396bab98ad10f84a11ee8835773 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 11:42:37 -0700 Subject: [PATCH 02/38] minor formatting, updates to docstrings, and prints for clarity --- urbanaccess/utils.py | 77 ++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/urbanaccess/utils.py b/urbanaccess/utils.py index d5f0420..b6e68ac 100644 --- a/urbanaccess/utils.py +++ b/urbanaccess/utils.py @@ -123,18 +123,18 @@ def _get_logger(level=None, name=None, filename=None): def create_hdf5(dir=None, filename=None, overwrite_hdf5=False): """ - Create a empty hdf5 file + Create an empty HDF5 file Parameters ---------- dir : string, optional - directory to save hdf5 file, if None defaults to dir set in + directory to save HDF5 file, if None defaults to dir set in config.settings.data_folder filename : string, optional - name of the hdf5 file to save with .h5 extension, if None defaults + name of the HDF5 file to save with .h5 extension, if None defaults to urbanaccess.h5 overwrite_hdf5 : bool, optional - if true any existing hdf5 file with the specified name in the + if true any existing HDF5 file with the specified name in the specified directory will be overwritten Returns @@ -145,35 +145,35 @@ def create_hdf5(dir=None, filename=None, overwrite_hdf5=False): dir = config.settings.data_folder else: if not isinstance(dir, str): - raise ValueError('Directory must be a string') + raise ValueError('Directory must be a string.') try: if not os.path.exists(dir): os.makedirs(dir) except Exception: - raise ValueError('Unable to make directory {}'.format(dir)) + raise ValueError('Unable to make directory {}.'.format(dir)) if filename is None: filename = 'urbanaccess.h5' else: if not isinstance(filename, str): - raise ValueError('Filename must be a string') + raise ValueError('Filename must be a string.') hdf5_save_path = '{}/{}'.format(dir, filename) if not filename.endswith('.h5'): - raise ValueError('hdf5 filename extension must be "h5"') + raise ValueError('HDF5 filename extension must be "h5".') if not os.path.exists(hdf5_save_path): store = pd.HDFStore(hdf5_save_path) store.close() - log('New {} hdf5 store created in dir: {}'.format(filename, dir)) + log(' New {} HDF5 store created in dir: {}.'.format(filename, dir)) elif overwrite_hdf5 and os.path.exists(hdf5_save_path): store = pd.HDFStore(hdf5_save_path) store.close() - log('Existing {} hdf5 store in dir: has been overwritten.'.format( - hdf5_save_path)) + log(' Existing {} HDF5 store in dir: {} has been ' + 'overwritten.'.format(filename, dir)) else: - log('Using existing {} hdf5 store.'.format(hdf5_save_path)) + log(' Using existing HDF5 store: {}.'.format(hdf5_save_path)) return hdf5_save_path @@ -181,65 +181,65 @@ def create_hdf5(dir=None, filename=None, overwrite_hdf5=False): def df_to_hdf5(data=None, key=None, overwrite_key=False, dir=None, filename=None, overwrite_hdf5=False): """ - Write a pandas dataframe to a table in a hdf5 file + Write a pandas dataframe to a table in a HDF5 file Parameters ---------- data : pandas.DataFrame - pandas dataframe to save to a hdf5 table + pandas dataframe to save to a HDF5 table key : string - name of table to save dataframe as in 
the hdf5 file + name of table to save dataframe as in the HDF5 file overwrite_key : bool, optional if true any existing table with the specified key name will be overwritten dir : string - directory to save hdf5 file + directory to save HDF5 file filename : string - name of the hdf5 file to save with .h5 extension + name of the HDF5 file to save with .h5 extension overwrite_hdf5 : bool, optional - if true any existing hdf5 file with the specified name in the + if true any existing HDF5 file with the specified name in the specified directory will be overwritten Returns ------- None """ - hdf5_save_path = create_hdf5(dir=dir, filename=filename, - overwrite_hdf5=overwrite_hdf5) + hdf5_save_path = create_hdf5( + dir=dir, filename=filename, overwrite_hdf5=overwrite_hdf5) store = pd.HDFStore(hdf5_save_path, mode='r') if not ''.join(['/', key]) in store.keys(): store.close() data.to_hdf(hdf5_save_path, key=key, mode='a', format='table') - log('{} saved in {} hdf5 store.'.format(key, hdf5_save_path)) + log(' DataFrame: {} saved in HDF5 store: {}.'.format( + key, hdf5_save_path)) elif ''.join(['/', key]) in store.keys() and overwrite_key: store.close() data.to_hdf(hdf5_save_path, key=key, mode='a', format='table') - log('Existing {} overwritten in {} hdf5 store.'.format(key, - hdf5_save_path)) + log(' Existing DataFrame: {} overwritten in HDF5 store: {}.'.format( + key, hdf5_save_path)) else: store.close() - log( - 'Key {} already exists in {} hdf5 store. Set to overwrite_key = ' - 'True to replace.'.format( - key, hdf5_save_path)) + log(' Key {} already exists in HDF5 store: {}. ' + 'Set to overwrite_key = True to replace existing ' + 'data in key.'.format(key, hdf5_save_path)) def hdf5_to_df(dir=None, filename=None, key=None): """ - Read data from a hdf5 file to a pandas dataframe + Read data from a HDF5 file to a pandas dataframe Parameters ---------- dir : string - directory of the hdf5 file to read from + directory of the HDF5 file to read from filename : string - name of the hdf5 file with .h5 extension to read from + name of the HDF5 file with .h5 extension to read from key : string - table inside the hdf5 file to return as a pandas dataframe + table inside the HDF5 file to return as a pandas dataframe Returns ------- @@ -249,20 +249,20 @@ def hdf5_to_df(dir=None, filename=None, key=None): dir = config.settings.data_folder else: if not isinstance(dir, str): - raise ValueError('Directory must be a string') + raise ValueError('Directory must be a string.') if filename is None: filename = 'urbanaccess_net.h5' else: if not isinstance(filename, str): - raise ValueError('Filename must be a string') + raise ValueError('Filename must be a string.') hdf5_load_path = '{}/{}'.format(dir, filename) if not filename.endswith('.h5'): - raise ValueError('hdf5 filename extension must be "h5"') + raise ValueError('HDF5 filename extension must be "h5".') if not os.path.exists(hdf5_load_path): - raise ValueError('Unable to find directory or file: {}'.format( + raise ValueError('Unable to find directory or file: {}.'.format( hdf5_load_path)) with pd.HDFStore(hdf5_load_path) as store: @@ -271,10 +271,9 @@ def hdf5_to_df(dir=None, filename=None, key=None): hdf5_load_path, store.keys())) try: df = store[key] - ('Returned {} as dataframe'.format(key)) + log(' Successfully returned: {} as DataFrame.'.format(key)) except Exception: - raise ValueError( - 'Unable to find key: {}. Keys found: {}'.format(key, - store.keys())) + raise ValueError('Unable to find key: {}. 
Keys found: {}.'.format( + key, store.keys())) return df From 823a80bf81da20e846d8a560387dabf3face692b Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 11:44:05 -0700 Subject: [PATCH 03/38] replace '{}/{}'.format() -> os.path.join() --- urbanaccess/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/urbanaccess/utils.py b/urbanaccess/utils.py index b6e68ac..a886594 100644 --- a/urbanaccess/utils.py +++ b/urbanaccess/utils.py @@ -159,7 +159,7 @@ def create_hdf5(dir=None, filename=None, overwrite_hdf5=False): if not isinstance(filename, str): raise ValueError('Filename must be a string.') - hdf5_save_path = '{}/{}'.format(dir, filename) + hdf5_save_path = os.path.join(dir, filename) if not filename.endswith('.h5'): raise ValueError('HDF5 filename extension must be "h5".') @@ -257,7 +257,7 @@ def hdf5_to_df(dir=None, filename=None, key=None): if not isinstance(filename, str): raise ValueError('Filename must be a string.') - hdf5_load_path = '{}/{}'.format(dir, filename) + hdf5_load_path = os.path.join(dir, filename) if not filename.endswith('.h5'): raise ValueError('HDF5 filename extension must be "h5".') From 02a2016706b203dce3011a4c2ca10d535a4ed653 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 11:44:49 -0700 Subject: [PATCH 04/38] address TODO to simplify read HDF5 store key print --- urbanaccess/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/urbanaccess/utils.py b/urbanaccess/utils.py index a886594..7626ea7 100644 --- a/urbanaccess/utils.py +++ b/urbanaccess/utils.py @@ -266,9 +266,7 @@ def hdf5_to_df(dir=None, filename=None, key=None): hdf5_load_path)) with pd.HDFStore(hdf5_load_path) as store: - # TODO: fix print statement to only display current key, not all keys - log('Successfully read store: {} with the following keys: {}'.format( - hdf5_load_path, store.keys())) + log(' Reading HDF5 store: {}...'.format(hdf5_load_path)) try: df = store[key] log(' Successfully returned: {} as DataFrame.'.format(key)) From e0e86b02ef5491b7c982de9fd6d7ad5ffb0b477c Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 11:46:24 -0700 Subject: [PATCH 05/38] add prints for saving and loading HDF5 files and minor updates to prints and docstrings --- urbanaccess/network.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/urbanaccess/network.py b/urbanaccess/network.py index 849afdc..b1089bc 100644 --- a/urbanaccess/network.py +++ b/urbanaccess/network.py @@ -1,4 +1,5 @@ import time +import os import geopy from geopy import distance @@ -483,27 +484,28 @@ def save_network(urbanaccess_network, filename, overwrite_key=False, overwrite_hdf5=False): """ Write a urbanaccess_network integrated nodes and edges to a node and edge - table in a hdf5 file + table in a HDF5 file Parameters ---------- urbanaccess_network : object urbanaccess_network object with net_edges and net_nodes DataFrames filename : string - name of the hdf5 file to save with .h5 extension + name of the HDF5 file to save with .h5 extension dir : string, optional - directory to save hdf5 file + directory to save HDF5 file overwrite_key : bool, optional if true any existing table with the specified key name will be overwritten overwrite_hdf5 : bool, optional - if true any existing hdf5 file with the specified name in the + if true any existing HDF5 file with the specified name in the specified directory will be overwritten Returns ------- None """ + log('Writing HDF5 store...') if urbanaccess_network is None 
or urbanaccess_network.net_edges.empty or \ urbanaccess_network.net_nodes.empty: raise ValueError('Either no urbanaccess_network specified or ' @@ -515,19 +517,21 @@ def save_network(urbanaccess_network, filename, df_to_hdf5(data=urbanaccess_network.net_nodes, key='nodes', overwrite_key=overwrite_key, dir=dir, filename=filename, overwrite_hdf5=overwrite_hdf5) + log("Saved HDF5 store: {} with tables: ['net_edges', 'net_nodes'].".format( + os.path.join(dir, filename))) def load_network(dir=config.settings.data_folder, filename=None): """ - Read an integrated network node and edge data from a hdf5 file to - a urbanaccess_network object + Read an integrated network node and edge data from a HDF5 file to + an urbanaccess_network object Parameters ---------- dir : string, optional - directory to read hdf5 file + directory to read HDF5 file filename : string - name of the hdf5 file to read with .h5 extension + name of the HDF5 file to read with .h5 extension Returns ------- @@ -536,7 +540,10 @@ def load_network(dir=config.settings.data_folder, filename=None): ua_network.net_edges : object ua_network.net_nodes : object """ + log('Loading HDF5 store...') ua_network.net_edges = hdf5_to_df(dir=dir, filename=filename, key='edges') ua_network.net_nodes = hdf5_to_df(dir=dir, filename=filename, key='nodes') + log("Read HDF5 store: {} tables: ['net_edges', 'net_nodes'].".format( + os.path.join(dir, filename))) return ua_network From 295cdc958c0cd40fba125f8032f0d7b94c5dc1c0 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 11:48:57 -0700 Subject: [PATCH 06/38] address YAMLLoadWarning by replacing yaml.load(f) -> yaml.safe_load(f) --- urbanaccess/config.py | 2 +- urbanaccess/gtfsfeeds.py | 2 +- urbanaccess/tests/test_gtfsfeeds.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/urbanaccess/config.py b/urbanaccess/config.py index cba8f25..273030c 100644 --- a/urbanaccess/config.py +++ b/urbanaccess/config.py @@ -108,7 +108,7 @@ def from_yaml(cls, configdir='configs', yaml_file = os.path.join(configdir, yamlname) with open(yaml_file, 'r') as f: - yaml_config = yaml.load(f) + yaml_config = yaml.safe_load(f) settings = cls(data_folder=yaml_config.get('data_folder', 'data'), logs_folder=yaml_config.get('logs_folder', 'logs'), diff --git a/urbanaccess/gtfsfeeds.py b/urbanaccess/gtfsfeeds.py index 26acb02..92b4062 100644 --- a/urbanaccess/gtfsfeeds.py +++ b/urbanaccess/gtfsfeeds.py @@ -63,7 +63,7 @@ def from_yaml(cls, gtfsfeeddir=os.path.join(config.settings.data_folder, yaml_file = os.path.join(gtfsfeeddir, yamlname) with open(yaml_file, 'r') as f: - yaml_config = yaml.load(f) + yaml_config = yaml.safe_load(f) if not isinstance(yaml_config, dict): raise ValueError('{} yamlname is not a dict'.format(yamlname)) diff --git a/urbanaccess/tests/test_gtfsfeeds.py b/urbanaccess/tests/test_gtfsfeeds.py index dacf5be..7dc7858 100644 --- a/urbanaccess/tests/test_gtfsfeeds.py +++ b/urbanaccess/tests/test_gtfsfeeds.py @@ -87,7 +87,7 @@ def test_to_yaml_feed(tmpdir, feed_dict3): yaml_path = os.path.join(tmpdir.strpath, 'gtfsfeeds.yaml') with open(yaml_path, 'r') as f: - yaml_config = yaml.load(f) + yaml_config = yaml.safe_load(f) assert yaml_config['gtfs_feeds'] == feed_dict3 # clear feeds from global memory feeds.remove_feed(remove_all=True) From 56d66023b72ad9b698010f06b36fafbada8cdf49 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 11:50:13 -0700 Subject: [PATCH 07/38] ensure lists and DFs are returned in correct sort order for unit tests --- 
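Note: the motivation for this change is that `list.sort()` sorts in place
and returns `None`, so fixtures that returned `expected_keys.sort()` and
assertions that compared against `list(...).sort()` were effectively
asserting `None == None` and always passed. `sorted()` returns a new list
instead. A minimal sketch of the difference (plain Python semantics, not
project code):

    keys = ['stops', 'routes', 'trips']
    print(keys.sort())   # None -- sort() mutates the list, returns None
    print(sorted(keys))  # ['routes', 'stops', 'trips'] -- a new sorted list
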
urbanaccess/tests/test_gtfs_load.py | 21 ++++++++++----------- urbanaccess/tests/test_gtfs_utils_format.py | 6 ++++++ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/urbanaccess/tests/test_gtfs_load.py b/urbanaccess/tests/test_gtfs_load.py index f813f27..4733376 100644 --- a/urbanaccess/tests/test_gtfs_load.py +++ b/urbanaccess/tests/test_gtfs_load.py @@ -15,7 +15,7 @@ def expected_urbanaccess_gtfs_df_keys(): expected_keys = ['stops', 'routes', 'trips', 'stop_times', 'calendar', 'calendar_dates', 'stop_times_int', 'headways'] - return expected_keys.sort() + return sorted(expected_keys) @pytest.fixture @@ -119,8 +119,8 @@ def test_loadgtfsfeed_to_df_wo_calendar( urbanaccess_gtfs_df_info = vars(loaded_feeds) expected_dfs = ['stops', 'routes', 'trips', 'stop_times', 'calendar_dates'] - assert expected_urbanaccess_gtfs_df_keys == list( - urbanaccess_gtfs_df_info.keys()).sort() + assert expected_urbanaccess_gtfs_df_keys == sorted(list( + urbanaccess_gtfs_df_info.keys())) for key, value in urbanaccess_gtfs_df_info.items(): assert isinstance(value, pd.core.frame.DataFrame) # check that df is not empty @@ -143,8 +143,8 @@ def test_loadgtfsfeed_to_df_wo_calendar_dates( urbanaccess_gtfs_df_info = vars(loaded_feeds) expected_dfs = ['stops', 'routes', 'trips', 'stop_times', 'calendar'] - assert expected_urbanaccess_gtfs_df_keys == list( - urbanaccess_gtfs_df_info.keys()).sort() + assert expected_urbanaccess_gtfs_df_keys == sorted(list( + urbanaccess_gtfs_df_info.keys())) for key, value in urbanaccess_gtfs_df_info.items(): assert isinstance(value, pd.core.frame.DataFrame) # check that df is not empty @@ -167,8 +167,8 @@ def test_loadgtfsfeed_to_df_w_calendar_and_calendar_dates( urbanaccess_gtfs_df_info = vars(loaded_feeds) expected_dfs = ['stops', 'routes', 'trips', 'stop_times', 'calendar', 'calendar_dates'] - assert expected_urbanaccess_gtfs_df_keys == list( - urbanaccess_gtfs_df_info.keys()).sort() + assert expected_urbanaccess_gtfs_df_keys == sorted(list( + urbanaccess_gtfs_df_info.keys())) for key, value in urbanaccess_gtfs_df_info.items(): assert isinstance(value, pd.core.frame.DataFrame) # check that df is not empty @@ -222,10 +222,9 @@ def test_loadgtfsfeed_to_df_wo_agency( append_definitions=False) assert isinstance(loaded_feeds, urbanaccess_gtfs_df) urbanaccess_gtfs_df_info = vars(loaded_feeds) - expected_dfs = ['stops', 'routes', 'trips', 'stop_times', - 'calendar'] - assert expected_urbanaccess_gtfs_df_keys == list( - urbanaccess_gtfs_df_info.keys()).sort() + expected_dfs = ['stops', 'routes', 'trips', 'stop_times', 'calendar'] + assert expected_urbanaccess_gtfs_df_keys == sorted(list( + urbanaccess_gtfs_df_info.keys())) for key, value in urbanaccess_gtfs_df_info.items(): assert isinstance(value, pd.core.frame.DataFrame) # check that df is not empty diff --git a/urbanaccess/tests/test_gtfs_utils_format.py b/urbanaccess/tests/test_gtfs_utils_format.py index d22d04e..b2e7abd 100644 --- a/urbanaccess/tests/test_gtfs_utils_format.py +++ b/urbanaccess/tests/test_gtfs_utils_format.py @@ -924,6 +924,9 @@ def test_remove_whitespace_from_values(trips_feed_w_invalid_values): df=raw_df, textfile='trips.txt', col_list=['trip_id', 'service_id', 'route_id']) + # re-sort cols so they are in same order for test + expected_df.sort_index(axis=1, inplace=True) + result.sort_index(axis=1, inplace=True) assert result.equals(expected_df) # test when no col_list is used @@ -942,4 +945,7 @@ def test_read_gtfs_trips_w_invalid_values(trips_feed_w_invalid_values): raw_df, expected_df, feed_path = 
trips_feed_w_invalid_values result = utils_format._read_gtfs_trips( textfile_path=feed_path, textfile='trips.txt') + # re-sort cols so they are in same order for test + expected_df.sort_index(axis=1, inplace=True) + result.sort_index(axis=1, inplace=True) assert result.equals(expected_df) From 8201d7df6f48be4bb1881a9c3d2cf89c0046ff92 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 12:27:03 -0700 Subject: [PATCH 08/38] minor formatting, print updates for simplification, and docstring updates --- urbanaccess/gtfs/network.py | 364 ++++++++++++++++-------------------- urbanaccess/osm/load.py | 2 +- 2 files changed, 167 insertions(+), 199 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index 306958b..fdbea57 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -11,14 +11,16 @@ pd.options.mode.chained_assignment = None -def create_transit_net(gtfsfeeds_dfs, day, - timerange, - calendar_dates_lookup=None, - overwrite_existing_stop_times_int=False, - use_existing_stop_times_int=False, - save_processed_gtfs=False, - save_dir=config.settings.data_folder, - save_filename=None): +def create_transit_net( + gtfsfeeds_dfs, + day, + timerange, + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=config.settings.data_folder, + save_filename=None): """ Create a travel time weight network graph in units of minutes from GTFS data @@ -26,16 +28,16 @@ def create_transit_net(gtfsfeeds_dfs, day, Parameters ---------- gtfsfeeds_dfs : object - gtfsfeeds_dfs object with DataFrames of stops, routes, trips, + urbanaccess_gtfs_df object with DataFrames of stops, routes, trips, stop_times, calendar, calendar_dates (optional) and stop_times_int (optional) - day : {'friday', 'monday', 'saturday', 'sunday', 'thursday', 'tuesday', - 'wednesday'} + day : {'monday', 'tuesday', 'wednesday', 'thursday', + 'friday', 'saturday', 'sunday'} day of the week to extract transit schedule from that corresponds to the day in the GTFS calendar timerange : list time range to extract transit schedule from in a list with time - 1 and time 2. it is suggested the time range + 1 and time 2 as strings. 
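+        For example: ['07:00:00', '10:00:00'].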
It is suggested the time range specified is large enough to allow for travel from one end of the transit network to the other but small enough to represent a relevant travel time period such as a 3 hour window @@ -59,11 +61,11 @@ def create_transit_net(gtfsfeeds_dfs, day, gtfsfeeds_dfs object it will be used instead of re-calculated save_processed_gtfs : bool, optional if true, all processed GTFS DataFrames will - be stored to disk in a hdf5 file + be stored to disk in a HDF5 file save_dir : str, optional - directory to save the hdf5 file + directory to save the HDF5 file save_filename : str, optional - name to save the hdf5 file as + name to save the HDF5 file as Returns ------- @@ -107,9 +109,9 @@ def create_transit_net(gtfsfeeds_dfs, day, error_msg_case_2 = 'calendar or calendar_dates' raise ValueError(error_msg.format(error_msg_case_2)) if not isinstance(overwrite_existing_stop_times_int, bool): - raise ValueError('overwrite_existing_stop_times_int must be bool') + raise ValueError('overwrite_existing_stop_times_int must be bool.') if not isinstance(use_existing_stop_times_int, bool): - raise ValueError('use_existing_stop_times_int must be bool') + raise ValueError('use_existing_stop_times_int must be bool.') if not isinstance(save_processed_gtfs, bool): raise ValueError('save_processed_gtfs must be bool') @@ -133,7 +135,7 @@ def create_transit_net(gtfsfeeds_dfs, day, calendar_dates_lookup=calendar_dates_lookup) if gtfsfeeds_dfs.stop_times_int.empty or \ - overwrite_existing_stop_times_int or use_existing_stop_times_int\ + overwrite_existing_stop_times_int or use_existing_stop_times_int \ is False: gtfsfeeds_dfs.stop_times_int = _interpolate_stop_times( stop_times_df=gtfsfeeds_dfs.stop_times, @@ -158,17 +160,12 @@ def create_transit_net(gtfsfeeds_dfs, day, endtime=timerange[1]) final_edge_table = _format_transit_net_edge( - stop_times_df=selected_interpolated_stop_times_df[['unique_trip_id', - 'stop_id', - 'unique_stop_id', - 'timediff', - 'stop_sequence', - 'unique_agency_id', - 'trip_id']]) - - transit_edges = _convert_imp_time_units(df=final_edge_table, - time_col='weight', - convert_to='minutes') + stop_times_df=selected_interpolated_stop_times_df[ + ['unique_trip_id', 'stop_id', 'unique_stop_id', 'timediff', + 'stop_sequence', 'unique_agency_id', 'trip_id']]) + + transit_edges = _convert_imp_time_units( + df=final_edge_table, time_col='weight', convert_to='minutes') final_selected_stops = _stops_in_edge_table_selector( input_stops_df=gtfsfeeds_dfs.stops, @@ -176,11 +173,11 @@ def create_transit_net(gtfsfeeds_dfs, day, transit_nodes = _format_transit_net_nodes(df=final_selected_stops) - transit_edges = _route_type_to_edge(transit_edge_df=transit_edges, - stop_time_df=gtfsfeeds_dfs.stop_times) + transit_edges = _route_type_to_edge( + transit_edge_df=transit_edges, stop_time_df=gtfsfeeds_dfs.stop_times) - transit_edges = _route_id_to_edge(transit_edge_df=transit_edges, - trips_df=gtfsfeeds_dfs.trips) + transit_edges = _route_id_to_edge( + transit_edge_df=transit_edges, trips_df=gtfsfeeds_dfs.trips) # assign node and edge net type transit_nodes['net_type'] = 'transit' @@ -190,7 +187,7 @@ def create_transit_net(gtfsfeeds_dfs, day, ua_network.transit_edges = transit_edges ua_network.transit_nodes = transit_nodes - log('Successfully created transit network. Took {:,.2f} seconds'.format( + log('Successfully created transit network. 
Took {:,.2f} seconds.'.format( time.time() - start_time)) return ua_network @@ -210,8 +207,8 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, calendar DataFrame input_calendar_dates_df : pandas.DataFrame calendar_dates DataFrame - day : {'friday', 'monday', 'saturday', 'sunday', 'thursday', 'tuesday', - 'wednesday'} + day : {'monday', 'tuesday', 'wednesday', 'thursday', + 'friday', 'saturday', 'sunday'} day of the week to extract transit schedule that corresponds to the day in the GTFS calendar calendar_dates_lookup : dict, optional @@ -230,37 +227,39 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, """ start_time = time.time() - valid_days = ['friday', 'monday', 'saturday', 'sunday', - 'thursday', 'tuesday', 'wednesday'] + valid_days = ['monday', 'tuesday', 'wednesday', 'thursday', + 'friday', 'saturday', 'sunday'] if day not in valid_days: + valid_days_str = str(valid_days).replace('[', '').replace(']', '') raise ValueError('Incorrect day specified. Must be one of lowercase ' - 'strings: friday, monday, saturday, sunday, ' - 'thursday, tuesday, wednesday.') + 'strings: {}.'.format(valid_days_str)) # check format of calendar_dates_lookup if calendar_dates_lookup is not None: if not isinstance(calendar_dates_lookup, dict): - raise ValueError('calendar_dates_lookup parameter is not a dict') + raise ValueError( + 'calendar_dates_lookup parameter must be a dictionary.') for key in calendar_dates_lookup.keys(): if not isinstance(key, str): - raise ValueError('calendar_dates_lookup key {} must be a ' - 'string'.format(key)) + raise ValueError('calendar_dates_lookup key: {} ' + 'must be a string.'.format(key)) if isinstance(calendar_dates_lookup[key], str): value = [calendar_dates_lookup[key]] else: if not isinstance(calendar_dates_lookup[key], list): raise ValueError( - 'calendar_dates_lookup value {} must be a string or a ' - 'list of strings'.format( + 'calendar_dates_lookup value: {} must be a string or ' + 'a list of strings.'.format( calendar_dates_lookup[key])) else: value = calendar_dates_lookup[key] for string in value: if not isinstance(string, str): - raise ValueError('{} must be a string'.format(value)) + raise ValueError('calendar_dates_lookup value: {} ' + 'must contain strings.'.format(value)) # create unique service ids df_list = [input_trips_df, input_calendar_df] @@ -270,27 +269,29 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, for index, df in enumerate(df_list): df['unique_service_id'] = (df['service_id'].str.cat( - df['unique_agency_id'].astype('str'), - sep='_')) + df['unique_agency_id'].astype('str'), sep='_')) df_list[index] = df # select service ids where day specified has a 1 = service runs on that day - log('Using calendar to extract service_ids to select trips.') + log('Using calendar to extract service_ids to select trips...') input_calendar_df = input_calendar_df[(input_calendar_df[day] == 1)] input_calendar_df = input_calendar_df[['unique_service_id']] num_cal_service_ids_extracted = len(input_calendar_df) - log('{:,} service_ids were extracted from calendar'.format( + log('{:,} service_ids were extracted from calendar.'.format( num_cal_service_ids_extracted)) # generate information needed to tell user the status of their trips in # terms of service_ids in calendar and calendar_dates tables - trips_in_calendar = input_trips_df.loc[input_trips_df[ - 'unique_service_id'].isin( - input_calendar_df['unique_service_id'])] - trips_notin_calendar = input_trips_df.loc[~input_trips_df[ - 
'unique_service_id'].isin(input_calendar_df['unique_service_id'])] + trips_in_calendar = input_trips_df.loc[ + input_trips_df['unique_service_id'].isin( + input_calendar_df['unique_service_id'])] + trips_notin_calendar = input_trips_df.loc[ + ~input_trips_df['unique_service_id'].isin( + input_calendar_df['unique_service_id'])] - pct_trips_in_calendar = round(len(trips_in_calendar) / len( + cnt_input_trips_df = len(input_trips_df) + cnt_trips_in_calendar = len(trips_in_calendar) + pct_trips_in_calendar = round(cnt_trips_in_calendar / len( input_trips_df) * 100, 2) feeds_wtrips_in_cal = trips_in_calendar['unique_feed_id'].unique() @@ -308,11 +309,10 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, feed_id_not_in_cal = [x for x in feeds_wotrips_in_cal if x not in feeds_wtrips_in_cal] for feed_id in feed_id_not_in_cal: - log( - '0 trip(s) 0 percent of {:,} total trip records were ' - 'found in calendar for GTFS feed: {}'.format( - len(input_trips_df), - ' '.join(feed_id.split('_')[:-1]))) + trip_feed_name = ' '.join(feed_id.split('_')[:-1]) + log('0 trip(s) 0 percent of {:,} total trip records were ' + 'found in calendar for GTFS feed: {}.'.format( + cnt_input_trips_df, trip_feed_name)) if len(trips_notin_calendar) > 0 and calendar_dates_lookup is None: warning_msg = ( @@ -325,46 +325,41 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, 'in doubt do not use the calendar_dates_lookup parameter.') log(warning_msg, level=lg.WARNING) - if len(feeds_wtrips_in_cal) != len( - feeds_wotrips_in_cal) and calendar_dates_lookup is None: + if len(feeds_wtrips_in_cal) != len(feeds_wotrips_in_cal) and \ + calendar_dates_lookup is None: for feed_id in feeds_wotrips_in_cal: - log( - '{:,} trip(s) {:.2f} percent of {:,} total trip records were ' - 'not found in calendar for GTFS feed: {}'.format( - len(trips_in_calendar), - pct_trips_in_calendar, - len(input_trips_df), - ' '.join(feed_id.split('_')[:-1]))) + trip_feed_name = ' '.join(feed_id.split('_')[:-1]) + log('{:,} trip(s) {:.2f} percent of {:,} total trip records were ' + 'not found in calendar for GTFS feed: {}.'.format( + cnt_trips_in_calendar, pct_trips_in_calendar, + cnt_input_trips_df, trip_feed_name)) if feed_id not in feeds_wtrips_in_cal: log('Warning: GTFS feed: {} no trips were selected using ' 'calendar. It is suggested you use the ' - 'calendar_dates_lookup parameter to utilize this feeds ' - 'calendar_dates file.'.format( - ' '.join(feed_id.split('_')[:-1])), + 'calendar_dates_lookup parameter to utilize this feed\'s ' + 'calendar_dates file.'.format(trip_feed_name), level=lg.WARNING) # look for service_ids inside of calendar_dates if calendar does not # supply enough service_ids to select trips by if len(trips_notin_calendar) > 0 and calendar_dates_lookup is not None: - log('Using calendar_dates to supplement service_ids extracted from ' - 'calendar to select trips.') + 'calendar to select trips...') subset_result_df = pd.DataFrame() if input_calendar_dates_df.empty: - raise ValueError( - 'calendar_dates_df is empty. Unable to use the ' - 'calendar_dates_lookup parameter') + raise ValueError('calendar_dates_df is empty. 
Unable to use the ' + 'calendar_dates_lookup parameter.') for col_name_key, string_value in calendar_dates_lookup.items(): if col_name_key not in input_calendar_dates_df.columns: - raise ValueError('{} column not found in calendar_dates ' - 'dataframe'.format(col_name_key)) + raise ValueError('Column: {} not found in calendar_dates ' + 'dataframe.'.format(col_name_key)) if col_name_key not in input_calendar_dates_df.select_dtypes( include=[object]).columns: - raise ValueError('{} column is not object type'.format( + raise ValueError('Column: {} must be object type.'.format( col_name_key)) if not isinstance(string_value, list): @@ -372,21 +367,19 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, for text in string_value: # TODO: modify this in order to allow subset based on gtfs - # feed name or a or/and condition + # feed name or a or/and condition subset_result = input_calendar_dates_df[ input_calendar_dates_df[col_name_key].str.match( text, case=False, na=False)] - if len(subset_result) != 0: + cnt_subset_result = len(subset_result) + if cnt_subset_result != 0: feed_id_list = subset_result['unique_feed_id'].unique() for index, id in enumerate(feed_id_list): feed_id_list[index] = ' '.join(id.split('_')[:-1]) - log('Found {:,} records that matched query: column: {} ' - 'and string: {} for GTFS feed(s): {}'.format(len( - subset_result), - col_name_key, - text, - feed_id_list)) + log('Found {:,} record(s) that matched query: column: {} ' + 'and string: {} for GTFS feed(s): {}.'.format( + cnt_subset_result, col_name_key, text, feed_id_list)) subset_result_df = subset_result_df.append(subset_result) @@ -394,11 +387,11 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, subset_result_df = subset_result_df[['unique_service_id']] num_caldates_service_ids_extracted = len(subset_result_df) - log('An additional {:,} service_ids were extracted from ' - 'calendar_dates. Total service_ids extracted: {:,}'.format( - num_caldates_service_ids_extracted, - num_caldates_service_ids_extracted + - num_cal_service_ids_extracted)) + tot_service_ids_extracted = \ + num_caldates_service_ids_extracted + num_cal_service_ids_extracted + log('An additional {:,} service_id(s) were extracted from ' + 'calendar_dates. Total service_id(s) extracted: {:,}.'.format( + num_caldates_service_ids_extracted, tot_service_ids_extracted)) input_calendar_df = input_calendar_df.append(subset_result_df) input_calendar_df.drop_duplicates(inplace=True) @@ -416,22 +409,18 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, calendar_selected_trips_df.reset_index(drop=True, inplace=True) calendar_selected_trips_df.drop('unique_service_id', axis=1, inplace=True) + calendar_selected_trips_count = len(calendar_selected_trips_df) if calendar_dates_lookup is None: log('{:,} of {:,} total trips were extracted representing calendar ' - 'day: {}. Took {:,.2f} seconds'.format(len( - calendar_selected_trips_df), - len(input_trips_df), - day, - time.time() - start_time)) + 'day: {}. Took {:,.2f} seconds.'.format( + calendar_selected_trips_count, cnt_input_trips_df, day, + time.time() - start_time)) else: log('{:,} of {:,} total trips were extracted representing calendar ' - 'day: {} and calendar_dates search parameters: {}. Took {:,' - '.2f} seconds'.format(len( - calendar_selected_trips_df), - len(input_trips_df), - day, - calendar_dates_lookup, - time.time() - start_time)) + 'day: {} and calendar_dates search parameters: {}. 
' + 'Took {:,.2f} seconds.'.format( + calendar_selected_trips_count, cnt_input_trips_df, day, + calendar_dates_lookup, time.time() - start_time)) return calendar_selected_trips_df @@ -461,20 +450,18 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): for index, df in enumerate(df_list): df['unique_trip_id'] = (df['trip_id'].str.cat( - df['unique_agency_id'].astype('str'), - sep='_')) + df['unique_agency_id'].astype('str'), sep='_')) df_list[index] = df # sort stop times inplace based on first to last stop in # sequence -- required as the linear interpolator runs # from first value to last value if stop_times_df['stop_sequence'].isnull().sum() > 1: - log('WARNING: There are {:,} ' - 'stop_sequence records missing in the stop_times DataFrame. ' - 'Please check these missing values. In order for interpolation ' - 'to proceed correctly, ' - 'all records must have a stop_sequence value.'.format( - stop_times_df['stop_sequence'].isnull().sum()), + log('WARNING: There are {:,} stop_sequence records missing in the ' + 'stop_times DataFrame. Please check these missing values. ' + 'In order for interpolation to proceed correctly, all records ' + 'must have a stop_sequence value.'.format( + stop_times_df['stop_sequence'].isnull().sum()), level=lg.WARNING) stop_times_df.sort_values(by=['unique_trip_id', 'stop_sequence'], @@ -503,22 +490,20 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): log('Note: Processing may take a long time depending' ' on the number of records. ' - 'Total unique trips to assess: {:,}'.format( - len(stop_times_df['unique_trip_id'].unique())), + 'Total unique trips to assess: {:,}.'.format( + len(stop_times_df['unique_trip_id'].unique())), level=lg.WARNING) log('Starting departure stop time interpolation...') - log( - 'Departure time records missing from trips following the ' + log('Departure time records missing from trips following the ' 'specified schedule: {:,} ({:.2f} percent of {:,} total ' - 'records)'.format( - missing_stop_times_count, - (missing_stop_times_count / len(stop_times_df)) * 100, - len(stop_times_df['departure_time_sec']))) + 'records.)'.format( + missing_stop_times_count, + (missing_stop_times_count / len(stop_times_df)) * 100, + len(stop_times_df['departure_time_sec']))) log('Interpolating...') else: - log('There are no departure time records missing from trips ' 'following the specified schedule. 
There are no records to ' 'interpolate.') @@ -539,13 +524,13 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): # Pivot to DataFrame where each unique trip has its own column # Index is stop_sequence - pivot = df_for_interpolation.pivot(index='stop_sequence', - columns='unique_trip_id', - values='departure_time_sec') + pivot = df_for_interpolation.pivot( + index='stop_sequence', columns='unique_trip_id', + values='departure_time_sec') # Interpolate on the whole DataFrame at once - interpolator = pivot.interpolate(method='linear', axis=0, - limit_direction='forward') + interpolator = pivot.interpolate( + method='linear', axis=0, limit_direction='forward') # Melt back into stacked format interpolator['stop_sequence_merge'] = interpolator.index @@ -559,10 +544,10 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): lambda col: col.last_valid_index(), axis=0) last_valid_stop_df = last_valid_stop_series.to_frame('last_valid_stop') - df_for_interpolation = (df_for_interpolation - .merge(last_valid_stop_df, - left_on='unique_trip_id', - right_index=True)) + df_for_interpolation = ( + df_for_interpolation.merge( + last_valid_stop_df, left_on='unique_trip_id', + right_index=True)) trailing = (df_for_interpolation.stop_sequence > df_for_interpolation.last_valid_stop) @@ -571,8 +556,8 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): df_for_interpolation['stop_sequence_merge'] = ( df_for_interpolation[~trailing]['stop_sequence']) - # Need to check if existing index in column names and drop if so (else - # a ValueError where Pandas can't insert + # Need to check if existing index is in column names and drop if + # so (else a ValueError where Pandas can't insert # b/c col already exists will occur) drop_bool = False if _check_if_index_name_in_cols(df_for_interpolation): @@ -591,10 +576,9 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): interpolated_times = ( interpolated_df[['departure_time_sec_interpolate']]) - final_stop_times_df = pd.merge(stop_times_df, interpolated_times, - how='left', left_index=True, - right_index=True, sort=False, - copy=False) + final_stop_times_df = pd.merge( + stop_times_df, interpolated_times, how='left', + left_index=True, right_index=True, sort=False, copy=False) else: final_stop_times_df = stop_times_df @@ -631,10 +615,8 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): final_stop_times_df['unique_agency_id'].astype('str'), sep='_')) if missing_stop_times_count > 0: - log( - 'Departure stop time interpolation complete. Took {:,' - '.2f} seconds'.format( - time.time() - start_time)) + log('Departure stop time interpolation complete. ' + 'Took {:,.2f} seconds.'.format(time.time() - start_time)) return final_stop_times_df @@ -659,10 +641,8 @@ def _time_difference(stop_times_df): # calculate difference between consecutive records grouping by trip id. stop_times_df['timediff'] = stop_times_df.groupby('unique_trip_id')[ 'departure_time_sec_interpolate'].diff() - log( - 'Difference between stop times has been successfully calculated. ' - 'Took {:,.2f} seconds'.format( - time.time() - start_time)) + log('Difference between stop times has been successfully calculated. 
' + 'Took {:,.2f} seconds.'.format(time.time() - start_time)) return stop_times_df @@ -706,15 +686,16 @@ def _time_selector(df, starttime, endtime): # create df of stops times that are within the requested range selected_stop_timesdf = df[( - (starttime_sec < df["departure_time_sec_interpolate"]) & ( - df["departure_time_sec_interpolate"] < endtime_sec))] - - log( - 'Stop times from {} to {} successfully selected {:,} records out of ' - '{:,} total records ({:.2f} percent of total). Took {:,' - '.2f} seconds'.format( - starttime, endtime, len(selected_stop_timesdf), len(df), - (len(selected_stop_timesdf) / len(df)) * 100, + (starttime_sec < df["departure_time_sec_interpolate"]) & ( + df["departure_time_sec_interpolate"] < endtime_sec))] + + subset_df_count = len(selected_stop_timesdf) + df_count = len(df) + log('Stop times from {} to {} successfully selected {:,} records out of ' + '{:,} total records ({:.2f} percent of total). ' + 'Took {:,.2f} seconds.'.format( + starttime, endtime, subset_df_count, df_count, + (subset_df_count / df_count) * 100, time.time() - start_time)) return selected_stop_timesdf @@ -741,7 +722,7 @@ def _format_transit_net_edge(stop_times_df): log('Starting transformation process for {:,} ' 'total trips...'.format(len(stop_times_df['unique_trip_id'].unique()))) - # set columns for new df for data needed by pandana for edges + # set columns for new df for data needed by Pandana for edges merged_edge = [] stop_times_df.sort_values(by=['unique_trip_id', 'stop_sequence'], @@ -752,8 +733,8 @@ def _format_transit_net_edge(stop_times_df): "node_id_from": tmp_trip_df['unique_stop_id'].iloc[:-1].values, "node_id_to": tmp_trip_df['unique_stop_id'].iloc[1:].values, "weight": tmp_trip_df['timediff'].iloc[1:].values, - "unique_agency_id": tmp_trip_df['unique_agency_id'].iloc[ - 1:].values, + "unique_agency_id": tmp_trip_df[ + 'unique_agency_id'].iloc[1:].values, # set unique trip id without edge order to join other data later "unique_trip_id": trip }) @@ -766,15 +747,14 @@ def _format_transit_net_edge(stop_times_df): merged_edge.append(edge_df) merged_edge_df = pd.concat(merged_edge, ignore_index=True) - merged_edge_df['sequence'] = merged_edge_df['sequence'].astype(int, - copy=False) + merged_edge_df['sequence'] = merged_edge_df['sequence'].astype( + int, copy=False) merged_edge_df['id'] = ( merged_edge_df['unique_trip_id'].str.cat( merged_edge_df['sequence'].astype('str'), sep='_')) - log('stop time table transformation to ' - 'Pandana format edge table completed. ' - 'Took {:,.2f} seconds'.format(time.time() - start_time)) + log('Stop time table transformation to Pandana format edge table ' + 'completed. 
Took {:,.2f} seconds.'.format(time.time() - start_time)) return merged_edge_df @@ -799,7 +779,7 @@ def _convert_imp_time_units(df, time_col='weight', convert_to='minutes'): """ valid_convert_to = ['seconds', 'minutes'] if convert_to not in valid_convert_to or not isinstance(convert_to, str): - raise ValueError('{} not a valid value or not a string'.format( + raise ValueError('{} is not a valid value or is not a string.'.format( convert_to)) if convert_to == 'seconds': @@ -815,8 +795,7 @@ def _convert_imp_time_units(df, time_col='weight', convert_to='minutes'): return df -def _stops_in_edge_table_selector(input_stops_df, - input_stop_times_df): +def _stops_in_edge_table_selector(input_stops_df, input_stop_times_df): """ Select stops that are active during the day and time period specified @@ -845,11 +824,10 @@ def _stops_in_edge_table_selector(input_stops_df, input_stops_df['unique_stop_id'].isin( input_stop_times_df['unique_stop_id'])] - log( - '{:,} of {:,} records selected from stops. Took {:,' - '.2f} seconds'.format( - len(selected_stops_df), len(input_stops_df), - time.time() - start_time)) + log('{:,} of {:,} records selected from stops. ' + 'Took {:,.2f} seconds.'.format( + len(selected_stops_df), len(input_stops_df), + time.time() - start_time)) return selected_stops_df @@ -894,10 +872,8 @@ def _format_transit_net_nodes(df): # set node index to be unique stop id final_node_df = final_node_df.set_index('node_id') - log( - 'stop time table transformation to Pandana format node table ' - 'completed. Took {:,.2f} seconds'.format( - time.time() - start_time)) + log('Stop time table transformation to Pandana format node table ' + 'completed. Took {:,.2f} seconds.'.format(time.time() - start_time)) return final_node_df @@ -926,24 +902,20 @@ def _route_type_to_edge(transit_edge_df, stop_time_df): stop_time_df['unique_agency_id'].astype('str'), sep='_')) # join route_id to the edge table - merged_df = pd.merge(transit_edge_df, - stop_time_df[['unique_trip_id', 'route_type']], - how='left', on='unique_trip_id', sort=False, - copy=False) - merged_df.drop_duplicates(subset='unique_trip_id', - keep='first', - inplace=True) + merged_df = pd.merge( + transit_edge_df, stop_time_df[['unique_trip_id', 'route_type']], + how='left', on='unique_trip_id', sort=False, copy=False) + merged_df.drop_duplicates( + subset='unique_trip_id', keep='first', inplace=True) # need to get unique records here to have a one to one join - # this serves as the look up table # join the look up table created above to the table of interest - transit_edge_df_w_routetype = pd.merge(transit_edge_df, merged_df[ - ['route_type', 'unique_trip_id']], how='left', on='unique_trip_id', - sort=False, copy=False) + transit_edge_df_w_routetype = pd.merge( + transit_edge_df, merged_df[['route_type', 'unique_trip_id']], + how='left', on='unique_trip_id', sort=False, copy=False) - log( - 'route type successfully joined to transit edges. Took {:,' - '.2f} seconds'.format( - time.time() - start_time)) + log('Route type successfully joined to transit edges. 
' + 'Took {:,.2f} seconds.'.format(time.time() - start_time)) return transit_edge_df_w_routetype @@ -975,16 +947,12 @@ def _route_id_to_edge(transit_edge_df, trips_df): trips_df['route_id'].str.cat( trips_df['unique_agency_id'].astype('str'), sep='_')) - transit_edge_df_with_routes = pd.merge(transit_edge_df, trips_df[ - ['unique_trip_id', 'unique_route_id']], - how='left', - on='unique_trip_id', sort=False, - copy=False) + transit_edge_df_with_routes = pd.merge( + transit_edge_df, trips_df[['unique_trip_id', 'unique_route_id']], + how='left', on='unique_trip_id', sort=False, copy=False) - log( - 'route id successfully joined to transit edges. Took {:,' - '.2f} seconds'.format( - time.time() - start_time)) + log('Route id successfully joined to transit edges. ' + 'Took {:,.2f} seconds.'.format(time.time() - start_time)) return transit_edge_df_with_routes diff --git a/urbanaccess/osm/load.py b/urbanaccess/osm/load.py index 567b1d6..261b92c 100644 --- a/urbanaccess/osm/load.py +++ b/urbanaccess/osm/load.py @@ -82,7 +82,7 @@ def ua_network_from_bbox(lat_min=None, lng_min=None, lat_max=None, # remove low connectivity nodes and return cleaned nodes and edges if remove_lcn: - log('checking for low connectivity nodes...') + log('Checking for low connectivity nodes...') pandana_net = Network(nodes['x'], nodes['y'], edges['from'], edges['to'], edges[['distance']]) lcn = pandana_net.low_connectivity_nodes(impedance=10000, count=10, From c436e5a20201bf9bc8b113841fc6e6e638ac1d4d Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 12:47:50 -0700 Subject: [PATCH 09/38] minor formatting, prints, and docstring updates --- urbanaccess/gtfs/headways.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/urbanaccess/gtfs/headways.py b/urbanaccess/gtfs/headways.py index 0b62a0a..3b9bd6f 100644 --- a/urbanaccess/gtfs/headways.py +++ b/urbanaccess/gtfs/headways.py @@ -68,21 +68,20 @@ def _headway_handler(interpolated_stop_times_df, trips_df, Parameters ---------- interpolated_stop_times_df : pandas.DataFrame - interpolated stop times dataframe for stop times within the time range + interpolated stop times DataFrame for stop times within the time range trips_df : pandas.DataFrame - trips dataframe + trips DataFrame routes_df : pandas.DataFrame - routes dataframe + routes DataFrame headway_timerange : list - time range for which to calculate headways between as a - list of time 1 and time 2 where times are 24 hour clock strings - such as: - ['07:00:00', '10:00:00'] + time range for which to calculate headways between in a list with time + 1 and time 2 as strings. Must follow format of a 24 hour clock for + example: 08:00:00 or 17:00:00 Returns ------- headway_by_routestop_df : pandas.DataFrame - dataframe of statistics of route stop headways in units of minutes + DataFrame of statistics of route stop headways in units of minutes with relevant route and stop information """ start_time = time.time() @@ -138,7 +137,7 @@ def _headway_handler(interpolated_stop_times_df, trips_df, headway_by_routestop_df['unique_stop_id'].str.cat( headway_by_routestop_df['unique_route_id'].astype('str'), sep='_')) - log('headway calculation complete. Took {:,.2f} seconds'.format( + log('Headway calculation complete. 
Took {:,.2f} seconds.'.format( time.time() - start_time)) return headway_by_routestop_df @@ -153,9 +152,9 @@ def headways(gtfsfeeds_df, headway_timerange): gtfsfeeds_df : object gtfsfeeds_dfs object with all processed GTFS data tables headway_timerange : list - time range for which to calculate headways between as a list of - time 1 and time 2 where times are 24 hour clock strings such as: - ['07:00:00', '10:00:00'] + time range for which to calculate headways between in a list with time + 1 and time 2 as strings. Must follow format of a 24 hour clock for + example: 08:00:00 or 17:00:00 Returns ------- @@ -191,12 +190,12 @@ def headways(gtfsfeeds_df, headway_timerange): level=lg.WARNING) if gtfsfeeds_df is None: - raise ValueError('gtfsfeeds_df cannot be None') + raise ValueError('gtfsfeeds_df cannot be None.') if gtfsfeeds_df.stop_times_int.empty or gtfsfeeds_df.trips.empty or \ gtfsfeeds_df.routes.empty: raise ValueError( - 'one of the gtfsfeeds_dfs objects: stop_times_int, trips, ' - 'or routes were found to be empty.') + 'One of the following gtfsfeeds_dfs objects: stop_times_int, ' + 'trips, or routes were found to be empty.') headways_df = _headway_handler( interpolated_stop_times_df=gtfsfeeds_df.stop_times_int, From b87161bf0de0af711552c6791bf8f0d987dc8ef4 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 12:50:50 -0700 Subject: [PATCH 10/38] move time range value check to its own function _check_time_range_format() with unit test --- urbanaccess/gtfs/headways.py | 27 +------------- urbanaccess/gtfs/network.py | 31 +++------------- urbanaccess/gtfs/utils_validation.py | 37 +++++++++++++++++++ .../tests/test_gtfs_utils_validation.py | 32 ++++++++++++++++ 4 files changed, 77 insertions(+), 50 deletions(-) create mode 100644 urbanaccess/tests/test_gtfs_utils_validation.py diff --git a/urbanaccess/gtfs/headways.py b/urbanaccess/gtfs/headways.py index 3b9bd6f..0db8b08 100644 --- a/urbanaccess/gtfs/headways.py +++ b/urbanaccess/gtfs/headways.py @@ -4,6 +4,7 @@ import logging as lg from urbanaccess.utils import log +from urbanaccess.gtfs.utils_validation import _check_time_range_format from urbanaccess.gtfs.network import _time_selector warnings.simplefilter(action="ignore", category=FutureWarning) @@ -163,31 +164,7 @@ def headways(gtfsfeeds_df, headway_timerange): route stop headways in units of minutes with relevant route and stop information """ - - time_error_statement = ( - '{} starttime and endtime are not in the correct format. ' - 'Format should be a 24 hour clock in following format: 08:00:00 ' - 'or 17:00:00'.format(headway_timerange)) - if not isinstance(headway_timerange, list) or len(headway_timerange) != 2: - raise ValueError('timerange must be a list of length 2') - if headway_timerange[0].split(':')[0] > headway_timerange[1].split(':')[0]: - raise ValueError('starttime is greater than endtime') - - for t in headway_timerange: - if not isinstance(t, str): - raise ValueError(time_error_statement) - if len(t) != 8: - raise ValueError(time_error_statement) - if int(headway_timerange[1].split(':')[0]) - int( - headway_timerange[0].split(':')[0]) > 3: - long_time_range_msg = ( - 'WARNING: Time range passed: {} is a {} hour period. 
Long periods ' - 'over 3 hours may take a significant amount of time to process.') - log(long_time_range_msg.format(headway_timerange, - int(str( - headway_timerange[1][0:2])) - int( - str(headway_timerange[0][0:2]))), - level=lg.WARNING) + _check_time_range_format(headway_timerange) if gtfsfeeds_df is None: raise ValueError('gtfsfeeds_df cannot be None.') diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index fdbea57..377a3d4 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -4,6 +4,7 @@ import logging as lg from urbanaccess.utils import log, df_to_hdf5, hdf5_to_df +from urbanaccess.gtfs.utils_validation import _check_time_range_format from urbanaccess.network import ua_network from urbanaccess import config from urbanaccess.gtfs.gtfsfeeds_dataframe import gtfsfeeds_dfs @@ -75,31 +76,11 @@ def create_transit_net( """ start_time = time.time() - time_error_statement = ( - '{} starttime and endtime are not in the correct format. ' - 'Format should be a 24 hour clock in the following format: 08:00:00 ' - 'or 17:00:00'.format( - timerange)) - if not isinstance(timerange, list) or len(timerange) != 2: - raise ValueError(time_error_statement) - if timerange[0] > timerange[1]: - raise ValueError(time_error_statement) - for t in timerange: - if not isinstance(t, str): - raise ValueError(time_error_statement) - if len(t) != 8: - raise ValueError(time_error_statement) - if int(str(timerange[1][0:2])) - int(str(timerange[0][0:2])) > 3: - log( - 'WARNING: Time range passed: {} is a {} hour period. Long ' - 'periods over 3 hours may take a significant amount of time to ' - 'process.'.format( - timerange, - int(str(timerange[1][0:2])) - int(str(timerange[0][0:2]))), - level=lg.WARNING) - if gtfsfeeds_dfs is None: - raise ValueError('gtfsfeeds_dfs is None') - error_msg = ('one of the following gtfsfeeds_dfs objects {} were ' + _check_time_range_format(timerange) + if not isinstance(gtfsfeeds_dfs, urbanaccess_gtfs_df): + raise ValueError('gtfsfeeds_dfs must be an urbanaccess_gtfs_df ' + 'object.') + error_msg = ('One of the following gtfsfeeds_dfs objects: {} were ' 'found to be empty.') if gtfsfeeds_dfs.trips.empty or gtfsfeeds_dfs.stop_times.empty or \ gtfsfeeds_dfs.stops.empty: diff --git a/urbanaccess/gtfs/utils_validation.py b/urbanaccess/gtfs/utils_validation.py index b31c4f1..2d4bf2e 100644 --- a/urbanaccess/gtfs/utils_validation.py +++ b/urbanaccess/gtfs/utils_validation.py @@ -175,3 +175,40 @@ def _validate_gtfs(stops_df, feed_folder, _checkcoordinates(df=stops_df, feed_folder=feed_folder) return stops_df + + +def _check_time_range_format(timerange): + """ + Check time range value format for expected schema + + Parameters + ---------- + timerange : list + time range as a list with time 1 and time 2 as strings. + Must follow format of a 24 hour clock for example: + 08:00:00 or 17:00:00 + + Returns + ------- + None + """ + time_error_statement = ( + '{} starttime and endtime are not in the correct format. 
' 'Format should be a 24 hour clock in the following format: 08:00:00 ' 'or 17:00:00.'.format(timerange)) if not isinstance(timerange, list) or len(timerange) != 2: raise ValueError(time_error_statement) if timerange[0] > timerange[1]: raise ValueError(time_error_statement) for t in timerange: if not isinstance(t, str): raise ValueError(time_error_statement) if len(t) != 8: raise ValueError(time_error_statement) timerange_hr_1 = int(str(timerange[0][0:2])) timerange_hr_2 = int(str(timerange[1][0:2])) if timerange_hr_2 - timerange_hr_1 > 3: log('WARNING: Time range passed: {} is a {} hour period. Long ' 'periods over 3 hours may take a significant amount of time to ' 'process.'.format(timerange, timerange_hr_2 - timerange_hr_1), level=lg.WARNING) diff --git a/urbanaccess/tests/test_gtfs_utils_validation.py b/urbanaccess/tests/test_gtfs_utils_validation.py new file mode 100644 index 0000000..643dd3b --- /dev/null +++ b/urbanaccess/tests/test_gtfs_utils_validation.py @@ -0,0 +1,32 @@ +import pytest + +import urbanaccess.gtfs.utils_validation as utils_validation + +def test_check_time_range_format(): + utils_validation._check_time_range_format(['07:00:00', '10:00:00']) + + +def test_check_time_range_format_invalid_params(): + msg = ('starttime and endtime are not in the correct format. ' + 'Format should be a 24 hour clock in the following format: ' + '08:00:00 or 17:00:00.') + with pytest.raises(ValueError) as excinfo: + utils_validation._check_time_range_format(['7:00:0', '10:00:00']) + expected_error = ("['7:00:0', '10:00:00'] {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + utils_validation._check_time_range_format(['10:00:00']) + expected_error = ("['10:00:00'] {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + utils_validation._check_time_range_format('10:00:00') + expected_error = ("10:00:00 {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + utils_validation._check_time_range_format([100000, 170000]) + expected_error = ("[100000, 170000] {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + utils_validation._check_time_range_format(['10:00:00', '07:00:00']) + expected_error = ("['10:00:00', '07:00:00'] {}".format(msg)) + assert expected_error in str(excinfo.value) From b5ced243de8bd63da29c1c96a916507066ba5df5 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 12:52:24 -0700 Subject: [PATCH 11/38] don't allow overwrite_existing_stop_times_int and use_existing_stop_times_int to both be True for clarity --- urbanaccess/gtfs/network.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index 377a3d4..2fd664c 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -94,7 +94,10 @@ def create_transit_net( if not isinstance(use_existing_stop_times_int, bool): raise ValueError('use_existing_stop_times_int must be bool.') if not isinstance(save_processed_gtfs, bool): - raise ValueError('save_processed_gtfs must be bool') + raise ValueError('save_processed_gtfs must be bool.') + if overwrite_existing_stop_times_int and use_existing_stop_times_int: + raise ValueError('overwrite_existing_stop_times_int and ' + 'use_existing_stop_times_int cannot both be True.') columns = ['route_id', 'direction_id',
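(For context: a minimal sketch, not part of the patch, showing how the consolidated validator added in PATCH 10 behaves. It assumes only the _check_time_range_format() function introduced above.)

```python
# Minimal sketch: exercising the new _check_time_range_format() validator.
from urbanaccess.gtfs.utils_validation import _check_time_range_format

# a valid 24 hour clock range passes silently and returns None
_check_time_range_format(['07:00:00', '10:00:00'])

try:
    # invalid: starttime is later than endtime
    _check_time_range_format(['10:00:00', '07:00:00'])
except ValueError as err:
    print(err)  # "['10:00:00', '07:00:00'] starttime and endtime are not ..."
```

From 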
0f08cc52cd242a6404c3468fc4591d94c90785cf Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 12:54:23 -0700 Subject: [PATCH 12/38] add prints to clarify when overwrite_existing_stop_times_int or use_existing_stop_times_int are used, remove ValueError that can never happen --- urbanaccess/gtfs/network.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index 2fd664c..cacc1d0 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -118,9 +118,13 @@ def create_transit_net( day=day, calendar_dates_lookup=calendar_dates_lookup) + # proceed to calc stop_times_int if stop_times_int is already empty, or + # overwrite existing is True, or use existing is False if gtfsfeeds_dfs.stop_times_int.empty or \ overwrite_existing_stop_times_int or use_existing_stop_times_int \ is False: + if overwrite_existing_stop_times_int: + log(' Overwriting existing stop_times_int DataFrame...') gtfsfeeds_dfs.stop_times_int = _interpolate_stop_times( stop_times_df=gtfsfeeds_dfs.stop_times, calendar_selected_trips_df=calendar_selected_trips_df) @@ -133,10 +137,7 @@ def create_transit_net( dir=save_dir, filename=save_filename) if use_existing_stop_times_int: - if gtfsfeeds_dfs.stop_times_int.empty: - raise ValueError('existing stop_times_int is empty. Set ' - 'use_existing_stop_times_int to False to create ' - 'it.') + log(' Using existing stop_times_int DataFrame...') selected_interpolated_stop_times_df = _time_selector( df=gtfsfeeds_dfs.stop_times_int, From 174ba1e65f5e6016906e15c0a88e376f7a5fe633 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 12:55:51 -0700 Subject: [PATCH 13/38] only print if applicable --- urbanaccess/gtfs/network.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index cacc1d0..89566d7 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -283,13 +283,11 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, print_feed_ids = [' '.join(feed_id.split('_')[:-1]) for feed_id in feeds_wtrips_in_cal] feeds_wotrips_in_cal = trips_notin_calendar['unique_feed_id'].unique() - log( - '{:,} trip(s) {:.2f} percent of {:,} total trip records were ' - 'found in calendar for GTFS feed(s): {}'.format( - len(trips_in_calendar), - pct_trips_in_calendar, - len(input_trips_df), - print_feed_ids)) + if print_feed_ids: + log('{:,} trip(s) {:.2f} percent of {:,} total trip records were ' + 'found in calendar for GTFS feed(s): {}.'.format( + cnt_trips_in_calendar, pct_trips_in_calendar, cnt_input_trips_df, + print_feed_ids)) feed_id_not_in_cal = [x for x in feeds_wotrips_in_cal if x not in feeds_wtrips_in_cal] From b5ced243de8bd63da29c1c96a916507066ba5df5 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 12:58:44 -0700 Subject: [PATCH 14/38] add specific ValueError when interpolator sees duplicate stop_sequence and unique_trip_id records to aid in debugging --- urbanaccess/gtfs/network.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index 89566d7..2e1d2c5 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -504,6 +504,17 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): stop_times_df.unique_trip_id.isin(trips_with_more_than_one_null)] if len(df_for_interpolation) > 0: + # check for duplicate stop_sequence and unique_trip_id 
combinations; + # if dups are found this will throw an error during the pivot() + # operation, so catch it and raise a clear error for the user instead + dup_df = df_for_interpolation[df_for_interpolation.duplicated( + subset=['stop_sequence', 'unique_trip_id'], keep='first')] + if len(dup_df) != 0: + dup_values = list(dup_df['unique_trip_id'].unique()) + raise ValueError('Found duplicate values when values from ' + 'stop_sequence and unique_trip_id are combined. ' + 'Check values in these columns for ' + 'trip_id(s): {}.'.format(dup_values)) # Pivot to DataFrame where each unique trip has its own column # Index is stop_sequence From a97f92b7e82f8129c9b8c1836e5d3469e25ad48c Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 13:00:31 -0700 Subject: [PATCH 15/38] refactor section that uses _check_if_index_name_in_cols() for clarity and fix rare case where index was removed before merge --- urbanaccess/gtfs/network.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index 2e1d2c5..690d0e7 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -555,7 +555,11 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): # b/c col already exists will occur) drop_bool = False if _check_if_index_name_in_cols(df_for_interpolation): - # move the current index to own col named 'index' + # move the current index to its own col named 'index' + log('stop_times index name: {} is also a column name. ' + 'Index will be dropped for interpolation and re-created ' + 'afterwards to continue.'.format( + df_for_interpolation.index.name)) col_name_to_copy = df_for_interpolation.index.name col_to_copy = df_for_interpolation[col_name_to_copy].copy() df_for_interpolation['index'] = col_to_copy @@ -563,10 +567,14 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): df_for_interpolation.reset_index(inplace=True, drop=drop_bool) # Merge back into original index - interpolated_df = pd.merge(df_for_interpolation, melted, 'left', - on=['stop_sequence_merge', - 'unique_trip_id']) - interpolated_df.set_index('index', inplace=True) + interpolated_df = pd.merge( + df_for_interpolation, melted, how='left', + on=['stop_sequence_merge', 'unique_trip_id']) + + # set index back to what it was if it was removed above before merge + if drop_bool is False: + interpolated_df.set_index('index', inplace=True) + interpolated_times = ( interpolated_df[['departure_time_sec_interpolate']]) From b84c46f3652b8dc8b697482014504193ea128af8 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 13:01:00 -0700 Subject: [PATCH 16/38] update docstring --- urbanaccess/gtfs/network.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index 690d0e7..fb3039c 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -1198,16 +1198,17 @@ def load_processed_gtfs_data(filename, dir=config.settings.data_folder): def _check_if_index_name_in_cols(df): """ - Check if existing index is in the passed dataframe list of column names + Check if the specified DataFrame has an index name that is also a column name Parameters ---------- df : pandas.DataFrame - interpolated stop_time dataframe + DataFrame whose index and columns will be checked Returns ------- - iname : tuple + iname : boolean + True if index name is also a column name, else False """ cols = df.columns.values iname = df.index.name 
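(A hypothetical minimal reproduction, not part of the patch, of the failure mode that the duplicate check added in PATCH 14 surfaces early; the column names match those used in _interpolate_stop_times().)

```python
import pandas as pd

# hypothetical stop_times slice with a duplicated
# (stop_sequence, unique_trip_id) pair, the case that previously made
# pivot() fail downstream with an obscure reshape error
df = pd.DataFrame({
    'unique_trip_id': ['t1_agency', 't1_agency', 't1_agency'],
    'stop_sequence': [1, 1, 2],
    'departure_time_sec': [100.0, 110.0, 200.0]})

dup_df = df[df.duplicated(
    subset=['stop_sequence', 'unique_trip_id'], keep='first')]
print(list(dup_df['unique_trip_id'].unique()))  # ['t1_agency']
```

From 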
e22988706e3704d0de51da033af0f5dc8cbc623d Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 13:04:07 -0700 Subject: [PATCH 17/38] refactor edge_impedance_by_route_type(): simplify function, update to accept new GTFS modes 11 and 12, validate inputs, update docstring, add prints --- urbanaccess/gtfs/network.py | 213 +++++++++++++++++------------------- 1 file changed, 103 insertions(+), 110 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index fb3039c..be4a3c0 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -959,24 +959,34 @@ def _route_id_to_edge(transit_edge_df, trips_df): return transit_edge_df_with_routes -def edge_impedance_by_route_type(transit_edge_df, - street_level_rail=None, - underground_rail=None, - intercity_rail=None, - bus=None, - ferry=None, - cable_car=None, - gondola=None, - funicular=None): +def edge_impedance_by_route_type( + transit_edge_df, + travel_time_col_name='weight', + street_level_rail=None, + underground_rail=None, + intercity_rail=None, + bus=None, + ferry=None, + cable_car=None, + gondola=None, + funicular=None, + trolleybus=None, + monorail=None +): """ Penalize transit edge travel time based on transit mode type Parameters ---------- transit_edge_df : pandas.DataFrame - transit edge dataframe + transit edge DataFrame + travel_time_col_name : str, optional + name of travel time column to apply multiplier factor, + default column name is 'weight' street_level_rail : float, optional factor between -1 to 1 to multiply against travel time + underground_rail : float, optional + factor between -1 to 1 to multiply against travel time intercity_rail : float, optional factor between -1 to 1 to multiply against travel time bus : float, optional @@ -989,114 +999,97 @@ def edge_impedance_by_route_type(transit_edge_df, factor between -1 to 1 to multiply against travel time funicular : float, optional factor between -1 to 1 to multiply against travel time + trolleybus : float, optional + factor between -1 to 1 to multiply against travel time + monorail : float, optional + factor between -1 to 1 to multiply against travel time Returns ------- - ua_network : object - ua_network.transit_edges : pandas.DataFrame - + transit_edge_df : pandas.DataFrame + Returns transit_edge_df with travel_time_col_name column weighted by + specified coefficients by route type """ - if 'route_type' not in transit_edge_df.columns: - raise ValueError('No route_type column was found in dataframe') + req_cols = [travel_time_col_name, 'route_type'] + if not isinstance(travel_time_col_name, str): + raise ValueError('travel_time_col_name must be a string.') + for col in req_cols: + if col in transit_edge_df.columns: + if not pd.api.types.is_numeric_dtype(transit_edge_df[col]): + raise ValueError('{} must be a number.'.format(col)) + else: + raise ValueError('Column: {} was not found in transit_edge_df ' + 'DataFrame and is required.'.format(col)) # check count of records for each route type - route_type_desc = {0: 'Street Level Rail: Tram Streetcar Light rail', - 1: 'Underground rail: Subway or Metro', - 2: 'Rail: intercity or long-distance ', 3: 'Bus', - 4: 'Ferry', 5: 'Cable Car', - 6: 'Gondola or Suspended cable car', - 7: 'Steep incline: Funicular'} - log('Route type distribution as percentage of transit mode: {:.2f}'.format( - transit_edge_df['route_type'].map(route_type_desc.get).value_counts( - normalize=True, dropna=False) * 100)) - - var_list = [street_level_rail, underground_rail, intercity_rail, bus, - ferry, 
cable_car, gondola, funicular] - - for var in var_list: - if var is not None: - if not isinstance(var, float): - raise ValueError('One or more variables are not float') - - travel_time_col_name = 'weight' + # route types taken from 'route_type' definition on route.txt GTFS file: + # https://developers.google.com/transit/gtfs/reference#routestxt + route_type_dict = { + 0: {'name': 'Street Level Rail: Tram, Streetcar, or Light rail', + 'multiplier': street_level_rail}, + 1: {'name': 'Underground rail: Subway or Metro', + 'multiplier': underground_rail}, + 2: {'name': 'Rail: intercity or long-distance ', + 'multiplier': intercity_rail}, + 3: {'name': 'Bus', + 'multiplier': bus}, + 4: {'name': 'Ferry', + 'multiplier': ferry}, + 5: {'name': 'Cable tram or car', + 'multiplier': cable_car}, + 6: {'name': 'Aerial lift: Gondola or Suspended cable car', + 'multiplier': gondola}, + 7: {'name': 'Steep incline: Funicular', + 'multiplier': funicular}, + 11: {'name': 'Trolleybus', + 'multiplier': trolleybus}, + 12: {'name': 'Monorail', + 'multiplier': monorail}} + # create the dict to pass to value_counts() + route_type_desc = route_type_dict.copy() + for key, val in route_type_dict.items(): + route_type_desc[key] = val['name'] + + log('Route type distribution as percentage of transit mode:') + summary_stat = transit_edge_df['route_type'].map( + route_type_desc.get).value_counts(normalize=True, dropna=False) * 100 + log(summary_stat) + travel_time_col = transit_edge_df[travel_time_col_name] - if street_level_rail is not None and len( - transit_edge_df[transit_edge_df['route_type'] == 0]) > 0: - transit_edge_df[travel_time_col_name][ - transit_edge_df['route_type'] == 0] = travel_time_col + ( - travel_time_col * street_level_rail) - log( - 'Adjusted Street Level Rail transit edge impedance based on mode' - ' type penalty coefficient: {}'.format( - street_level_rail)) - if underground_rail is not None and len( - transit_edge_df[transit_edge_df['route_type'] == 1]) > 0: - transit_edge_df[travel_time_col_name][ - transit_edge_df['route_type'] == 1] = travel_time_col + ( - travel_time_col * underground_rail) - log( - 'Adjusted Underground rail transit edge impedance based on mode ' - 'type penalty coefficient: {}'.format( - underground_rail)) - if intercity_rail is not None and len( - transit_edge_df[transit_edge_df['route_type'] == 2]) > 0: - transit_edge_df[travel_time_col_name][ - transit_edge_df['route_type'] == 2] = travel_time_col + ( - travel_time_col * intercity_rail) - log( - 'Adjusted Rail transit edge impedance based on mode type penalty ' - 'coefficient: {}'.format( - intercity_rail)) - if bus is not None and len( - transit_edge_df[transit_edge_df['route_type'] == 3]) > 0: - transit_edge_df[travel_time_col_name][ - transit_edge_df['route_type'] == 3] = travel_time_col + ( - travel_time_col * bus) - log( - 'Adjusted Bus transit edge impedance based on mode type penalty ' - 'coefficient: {}'.format( - bus)) - if ferry is not None and len( - transit_edge_df[transit_edge_df['route_type'] == 4]) > 0: - transit_edge_df[travel_time_col_name][ - transit_edge_df['route_type'] == 4] = travel_time_col + ( - travel_time_col * ferry) - log( - 'Adjusted Ferry transit edge impedance based on mode type ' - 'penalty coefficient: {}'.format( - ferry)) - if cable_car is not None and len( - transit_edge_df[transit_edge_df['route_type'] == 5]) > 0: - transit_edge_df[travel_time_col_name][ - transit_edge_df['route_type'] == 5] = travel_time_col + ( - travel_time_col * cable_car) - log( - 'Adjusted Cable Car transit 
edge impedance based on mode type ' - 'penalty coefficient: {}'.format( - cable_car)) - if gondola is not None and len( - transit_edge_df[transit_edge_df['route_type'] == 6]) > 0: - transit_edge_df[travel_time_col_name][ - transit_edge_df['route_type'] == 6] = travel_time_col + ( - travel_time_col * gondola) - log( - 'Adjusted Gondola or Suspended cable car transit edge impedance ' - 'based on mode type penalty coefficient: {}'.format( - gondola)) - if funicular is not None and len( - transit_edge_df[transit_edge_df['route_type'] == 7]) > 0: - transit_edge_df[travel_time_col_name][ - transit_edge_df['route_type'] == 7] = travel_time_col + ( - travel_time_col * funicular) - log( - 'Adjusted Funicular transit edge impedance based on mode type ' - 'penalty coefficient: {}'.format( - funicular)) - - ua_network.transit_edges = transit_edge_df - - log('Transit edge impedance mode type penalty calculation complete') + for route_type, route_vals in route_type_dict.items(): + if route_vals['multiplier'] is not None: + if not isinstance(route_vals['multiplier'], float): + raise ValueError('One or more multiplier variables are not ' + 'float.') + + # warn if multiplier is not within optimal range + if not -1 <= route_vals['multiplier'] <= 1: + log('WARNING: Multiplier value of: {} should be a ' + 'value between -1 and 1.'.format(route_vals['multiplier']), + level=lg.WARNING) + route_type_cnt = len( + transit_edge_df[transit_edge_df['route_type'] == route_type]) + + # warn if route type is not found in DataFrame + if route_type_cnt == 0 and route_vals['multiplier'] is not None: + log('WARNING: Route type: {} with specified multiplier value ' + 'of: {} was not found in the specified edge ' + 'DataFrame.'.format( + route_vals['name'], route_vals['multiplier']), + level=lg.WARNING) + + if route_type_cnt > 0: + transit_edge_df[travel_time_col_name][ + transit_edge_df['route_type'] == route_type] = \ + travel_time_col + ( + travel_time_col * route_vals['multiplier']) + log('Adjusted {} transit edge impedance based on mode ' + 'type penalty coefficient: {}.'.format( + route_vals['name'], route_vals['multiplier'])) + + log('Transit edge impedance mode type penalty calculation complete.') + return transit_edge_df return ua_network From 03227ab379191ae524dad278f5ae1f39555bff67 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 13:07:16 -0700 Subject: [PATCH 18/38] refactor save_processed_gtfs_data(): simplify function, add prints, add calendar and calendar_dates as optional tables --- urbanaccess/gtfs/network.py | 86 +++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 41 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index be4a3c0..64c3aa8 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -1,4 +1,5 @@ from __future__ import division +import os import pandas as pd import time import logging as lg @@ -7,7 +8,8 @@ from urbanaccess.gtfs.utils_validation import _check_time_range_format from urbanaccess.network import ua_network from urbanaccess import config -from urbanaccess.gtfs.gtfsfeeds_dataframe import gtfsfeeds_dfs +from urbanaccess.gtfs.gtfsfeeds_dataframe import gtfsfeeds_dfs, \ + urbanaccess_gtfs_df pd.options.mode.chained_assignment = None @@ -1091,62 +1093,64 @@ def edge_impedance_by_route_type( log('Transit edge impedance mode type penalty calculation complete.') return transit_edge_df - return ua_network - -def save_processed_gtfs_data(gtfsfeeds_dfs, - filename, - dir=config.settings.data_folder): +def 
save_processed_gtfs_data( gtfsfeeds_dfs, filename, dir=config.settings.data_folder): """ - Write dataframes in a gtfsfeeds_dfs object to a hdf5 file + Write dataframes in an urbanaccess_gtfs_df object to an HDF5 file Parameters ---------- gtfsfeeds_dfs : object - gtfsfeeds_dfs object + urbanaccess_gtfs_df object filename : string - name of the hdf5 file to save with .h5 extension + name of the HDF5 file to save with .h5 extension dir : string, optional - directory to save hdf5 file + directory to save HDF5 file Returns ------- None """ - # TODO: refactor check below to use any() for readability - if gtfsfeeds_dfs is None or gtfsfeeds_dfs.stops.empty or \ - gtfsfeeds_dfs.routes.empty or gtfsfeeds_dfs.trips.empty \ - or gtfsfeeds_dfs.stop_times.empty or \ - gtfsfeeds_dfs.calendar.empty or \ - gtfsfeeds_dfs.stop_times_int.empty: - raise ValueError('gtfsfeeds_dfs is missing one of the required ' - 'dataframes.') - - df_to_hdf5(data=gtfsfeeds_dfs.stops, key='stops', overwrite_key=False, - dir=dir, filename=filename, overwrite_hdf5=False) - df_to_hdf5(data=gtfsfeeds_dfs.routes, key='routes', overwrite_key=False, - dir=dir, filename=filename, overwrite_hdf5=False) - df_to_hdf5(data=gtfsfeeds_dfs.trips, key='trips', overwrite_key=False, - dir=dir, filename=filename, overwrite_hdf5=False) - df_to_hdf5(data=gtfsfeeds_dfs.stop_times, key='stop_times', - overwrite_key=False, dir=dir, filename=filename, - overwrite_hdf5=False) - df_to_hdf5(data=gtfsfeeds_dfs.calendar, key='calendar', - overwrite_key=False, dir=dir, filename=filename, - overwrite_hdf5=False) - df_to_hdf5(data=gtfsfeeds_dfs.stop_times_int, key='stop_times_int', - overwrite_key=False, dir=dir, filename=filename, - overwrite_hdf5=False) - - if gtfsfeeds_dfs.headways.empty is False: - df_to_hdf5(data=gtfsfeeds_dfs.headways, key='headways', - overwrite_key=False, dir=dir, filename=filename, - overwrite_hdf5=False) + log('Writing HDF5 store...') + if not isinstance(gtfsfeeds_dfs, urbanaccess_gtfs_df): + raise ValueError('gtfsfeeds_dfs must be an urbanaccess_gtfs_df ' + 'object.') + + req_df_dict = {'stops': gtfsfeeds_dfs.stops, + 'routes': gtfsfeeds_dfs.routes, + 'trips': gtfsfeeds_dfs.trips, + 'stop_times': gtfsfeeds_dfs.stop_times, + 'stop_times_int': gtfsfeeds_dfs.stop_times_int} + # at least one of calendar or calendar_dates is required + optional_df_dict = {'headways': gtfsfeeds_dfs.headways, + 'calendar': gtfsfeeds_dfs.calendar, + 'calendar_dates': gtfsfeeds_dfs.calendar_dates} + + for name, gtfs_df in req_df_dict.items(): + if gtfs_df.empty: + raise ValueError('gtfsfeeds_dfs is missing required ' + 'DataFrame: {}.'.format(name)) + if gtfsfeeds_dfs.calendar.empty and gtfsfeeds_dfs.calendar_dates.empty: + raise ValueError('gtfsfeeds_dfs is missing either the calendar or ' 'calendar_dates DataFrame.') - if gtfsfeeds_dfs.calendar_dates.empty is False: - df_to_hdf5(data=gtfsfeeds_dfs.calendar_dates, key='calendar_dates', + tables_saved = [] + for name, gtfs_df in req_df_dict.items(): + df_to_hdf5(data=gtfs_df, key=name, overwrite_key=False, dir=dir, filename=filename, overwrite_hdf5=False) + tables_saved.extend([name]) + + for name, gtfs_df in optional_df_dict.items(): + if gtfs_df.empty is False: + df_to_hdf5(data=gtfs_df, key=name, + overwrite_key=False, dir=dir, filename=filename, + overwrite_hdf5=False) + tables_saved.extend([name]) + + log('Saved HDF5 store: {} with tables: {}.'.format( + os.path.join(dir, filename), tables_saved)) def load_processed_gtfs_data(filename, dir=config.settings.data_folder): 
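(A short usage sketch, not part of the patch, of the save/load pair as refactored here and in the following patch; the file and directory names are hypothetical.)

```python
from urbanaccess.gtfs.gtfsfeeds_dataframe import gtfsfeeds_dfs
from urbanaccess.gtfs.network import (
    load_processed_gtfs_data, save_processed_gtfs_data)

# assumes gtfsfeeds_dfs already holds the processed tables, including
# stop_times_int and at least one of calendar or calendar_dates
save_processed_gtfs_data(
    gtfsfeeds_dfs, filename='processed_gtfs.h5', dir='data')

# reads the required tables plus any optional keys found in the store
gtfsfeeds_dfs = load_processed_gtfs_data(
    filename='processed_gtfs.h5', dir='data')
```

From 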
832de7a9e1c3646ae488155fb5f90ebf3d7b136e Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 13:08:29 -0700 Subject: [PATCH 19/38] refactor load_processed_gtfs_data(): simplify function, add prints, add calendar and calendar_dates as optional tables --- urbanaccess/gtfs/network.py | 56 ++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index 64c3aa8..9afb3c4 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -1155,40 +1155,50 @@ def save_processed_gtfs_data( def load_processed_gtfs_data(filename, dir=config.settings.data_folder): """ - Read data from a hdf5 file to a gtfsfeeds_dfs object + Read data from a HDF5 file to an urbanaccess_gtfs_df object Parameters ---------- filename : string - name of the hdf5 file to read with .h5 extension + name of the HDF5 file to read with .h5 extension dir : string, optional - directory to read hdf5 file + directory to read HDF5 file Returns ------- gtfsfeeds_dfs : object + urbanaccess_gtfs_df object """ - gtfsfeeds_dfs.stops = hdf5_to_df(dir=dir, filename=filename, key='stops') - gtfsfeeds_dfs.routes = hdf5_to_df(dir=dir, filename=filename, key='routes') - gtfsfeeds_dfs.trips = hdf5_to_df(dir=dir, filename=filename, key='trips') - gtfsfeeds_dfs.stop_times = hdf5_to_df(dir=dir, filename=filename, - key='stop_times') - gtfsfeeds_dfs.calendar = hdf5_to_df(dir=dir, filename=filename, - key='calendar') - gtfsfeeds_dfs.stop_times_int = hdf5_to_df(dir=dir, filename=filename, - key='stop_times_int') - - hdf5_load_path = '{}/{}'.format(dir, filename) - with pd.HDFStore(hdf5_load_path) as store: + log('Loading HDF5 store...') + req_df_dict = {'stops': gtfsfeeds_dfs.stops, + 'routes': gtfsfeeds_dfs.routes, + 'trips': gtfsfeeds_dfs.trips, + 'stop_times': gtfsfeeds_dfs.stop_times, + 'stop_times_int': gtfsfeeds_dfs.stop_times_int} + # calendar or calendar_dates are required but not both + optional_df_dict = {'headways': gtfsfeeds_dfs.headways, + 'calendar': gtfsfeeds_dfs.calendar, + 'calendar_dates': gtfsfeeds_dfs.calendar_dates} - if 'headways' in store.keys(): - gtfsfeeds_dfs.headways = hdf5_to_df(dir=dir, - filename=filename, - key='headways') - if 'calendar_dates' in store.keys(): - gtfsfeeds_dfs.calendar_dates = hdf5_to_df(dir=dir, - filename=filename, - key='calendar_dates') + tables_read = [] + for name, gtfs_df in req_df_dict.items(): + vars(gtfsfeeds_dfs)[name] = hdf5_to_df( + dir=dir, filename=filename, key=name) + tables_read.extend([name]) + + # open HDF5 to read keys + hdf5_load_path = os.path.join(dir, filename) + with pd.HDFStore(hdf5_load_path) as store: + hdf5_keys = store.keys() + hdf5_keys = [item.replace('/', '') for item in hdf5_keys] + for name, gtfs_df in optional_df_dict.items(): + # if optional key exists, read it + if name in hdf5_keys: + vars(gtfsfeeds_dfs)[name] = hdf5_to_df( + dir=dir, filename=filename, key=name) + tables_read.extend([name]) + log('Read HDF5 store: {} tables: {}.'.format( + hdf5_load_path, tables_read)) return gtfsfeeds_dfs From 106c422c39b957b3efff81fa2ba26cad06535e8e Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 15:37:24 -0700 Subject: [PATCH 20/38] improve print, add TODO --- urbanaccess/gtfs/network.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index 9afb3c4..e83f345 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -473,8 
+473,8 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): # if there are stop times missing that need interpolation notify user if missing_stop_times_count > 0: - log('Note: Processing may take a long time depending' - ' on the number of records. ' + log('Note: Processing may take a long time depending ' + 'on the number of records. ' 'Total unique trips to assess: {:,}.'.format( len(stop_times_df['unique_trip_id'].unique())), level=lg.WARNING) @@ -493,6 +493,11 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): 'following the specified schedule. There are no records to ' 'interpolate.') + # TODO: for the rare and unlikely case when there is 1 null record and + # it's not the first or last stop in the stop sequence, that value + # should be interpolated and its trip id should be added to those to be + # interpolated - this additional case would have to be benchmarked + # for speed to ensure it doesn't slow down the existing process # Find trips with more than one missing time # Note: all trip ids have at least 1 null departure time because the # last stop in a trip is always null @@ -599,8 +604,11 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): num_not_interpolated = final_stop_times_df[ 'departure_time_sec_interpolate'].isnull().sum() if num_not_interpolated > 0: - log('WARNING: Number of records unable to interpolate: {:,}. ' - 'These records have been removed.'.format(num_not_interpolated), + log('WARNING: Number of stop_time records unable to interpolate: {:,}.' + ' These records likely had stops in either the start or ' + 'end sequence that did not have time information available to ' + 'interpolate between. These records have been removed.'.format( + num_not_interpolated), level=lg.WARNING) # convert the interpolated times (float) to integer so all times are From 122e71bc02cf871da5e6020193497ac234f7cf0d Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 15:38:31 -0700 Subject: [PATCH 21/38] remove TODO as print is accurate in what it's counting --- urbanaccess/gtfs/network.py | 1 - 1 file changed, 1 deletion(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index e83f345..06cd671 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -600,7 +600,6 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): final_stop_times_df['departure_time_sec_interpolate'].fillna( final_stop_times_df['departure_time_sec'], inplace=True) - # TODO: refine this count so it refers to only the data that matters num_not_interpolated = final_stop_times_df[ 'departure_time_sec_interpolate'].isnull().sum() if num_not_interpolated > 0: From 5ed3813a323a3231b6a63148929dbb7428ae2394 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 15:42:14 -0700 Subject: [PATCH 22/38] add new unit tests to gtfs.network.gtfs_network, update existing, expand coverage --- urbanaccess/tests/conftest.py | 3 +- urbanaccess/tests/test_gtfs_network.py | 1176 +++++++++++++++++++++++- 2 files changed, 1144 insertions(+), 35 deletions(-) diff --git a/urbanaccess/tests/conftest.py b/urbanaccess/tests/conftest.py index 5ca8297..a3e1db7 100644 --- a/urbanaccess/tests/conftest.py +++ b/urbanaccess/tests/conftest.py @@ -373,7 +373,8 @@ def calendar_dates_feed_1(): 'weekday-3', 'weekend-1'], 'date': [20161224, 20170318, 20160424, 20161230], - 'exception_type': [1, 2, 1, 1]} + 'exception_type': [1, 2, 1, 1], + 'schedule_type': ['WD', 'WD', 'WD', 'WE']} index = range(4) 
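(A brief sketch, not part of the patch, of what the new schedule_type column in this fixture enables; it mirrors the _trip_schedule_selector() calls exercised in the tests below, with hypothetical input DataFrames.)

```python
import urbanaccess.gtfs.network as gtfs_network

# select weekend service via the new schedule_type column; trips_df,
# calendar_df, and calendar_dates_df are assumed to be loaded GTFS tables
result = gtfs_network._trip_schedule_selector(
    input_trips_df=trips_df,
    input_calendar_df=calendar_df,
    input_calendar_dates_df=calendar_dates_df,
    day='sunday',
    calendar_dates_lookup={'schedule_type': 'WE'})
```

diff --git 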
a/urbanaccess/tests/test_gtfs_network.py b/urbanaccess/tests/test_gtfs_network.py index 01de1c8..13bfb50 100644 --- a/urbanaccess/tests/test_gtfs_network.py +++ b/urbanaccess/tests/test_gtfs_network.py @@ -1,25 +1,61 @@ import pytest +import os +import glob import pandas as pd import numpy as np import urbanaccess.gtfs.network as gtfs_network import urbanaccess.gtfs.load as gtfs_load from urbanaccess.network import urbanaccess_network +from urbanaccess.gtfs.gtfsfeeds_dataframe import urbanaccess_gtfs_df @pytest.fixture def expected_urbanaccess_network_keys(): expected_keys = ['transit_nodes', 'transit_edges', 'net_connector_edges', 'osm_nodes', 'osm_edges', 'net_nodes', 'net_edges'] - return expected_keys.sort() + return sorted(expected_keys) + + +@pytest.fixture +def expected_gtfsfeeds_dfs_keys(): + expected_keys = ['stops', 'routes', 'trips', 'stop_times', + 'calendar_dates', 'calendar', 'stop_times_int', + 'headways'] + return sorted(expected_keys) @pytest.fixture def gtfs_feed_wo_calendar_dates( - tmpdir, agency_a_feed_on_disk_wo_calendar_dates): - feed_dir = agency_a_feed_on_disk_wo_calendar_dates + agency_a_feed_on_disk_wo_calendar_dates): + loaded_feeds = gtfs_load.gtfsfeed_to_df( + gtfsfeed_path=agency_a_feed_on_disk_wo_calendar_dates, + validation=False, + verbose=True, + bbox=None, + remove_stops_outsidebbox=False, + append_definitions=False) + return loaded_feeds + + +@pytest.fixture +def gtfs_feed_wo_calendar( + agency_a_feed_on_disk_wo_calendar): + loaded_feeds = gtfs_load.gtfsfeed_to_df( + gtfsfeed_path=agency_a_feed_on_disk_wo_calendar, + validation=False, + verbose=True, + bbox=None, + remove_stops_outsidebbox=False, + append_definitions=False) + return loaded_feeds + + +@pytest.fixture +def gtfs_feed_w_calendar_and_calendar_dates( + agency_a_feed_on_disk_w_calendar_and_calendar_dates): loaded_feeds = gtfs_load.gtfsfeed_to_df( - gtfsfeed_path=feed_dir, + gtfsfeed_path=agency_a_feed_on_disk_w_calendar_and_calendar_dates, validation=False, verbose=True, bbox=None, @@ -28,36 +64,79 @@ def gtfs_feed_wo_calendar_dates( return loaded_feeds +@pytest.fixture +def selected_int_stop_times_from_feed_wo_calendar_dates( + gtfs_feed_wo_calendar_dates): + # reproduce what is expected as the 'selected_interpolated_stop_times_df' + stop_times = gtfs_feed_wo_calendar_dates.stop_times.copy() + stop_times = stop_times.loc[stop_times['trip_id'] == 'a3'] + stop_times['unique_stop_id'] = ( + stop_times['stop_id'].str.cat( + stop_times['unique_agency_id'].astype('str'), sep='_')) + stop_times['unique_trip_id'] = ( + stop_times['trip_id'].str.cat( + stop_times['unique_agency_id'].astype('str'), sep='_')) + data = { + 'departure_time_sec_interpolate': [29700, 30000, 30300, + 30600, 30900, 31200], + 'timediff': [np.nan, 300.0, 300.0, 300.0, 300.0, 300.0] + } + index = range(12, 18) + df = pd.DataFrame(data, index) + stop_times = pd.concat([stop_times, df], axis=1) + + return stop_times + + +@pytest.fixture +def selected_stops_from_feed_wo_calendar_dates(gtfs_feed_wo_calendar_dates): + # create 'final_selected_stops' df that is used as input to test function + stops_df = gtfs_feed_wo_calendar_dates.stops.copy() + stops_df = stops_df.iloc[0:6] + stops_df['unique_stop_id'] = ( + stops_df['stop_id'].str.cat( + stops_df['unique_agency_id'].astype('str'), sep='_')) + stops_df.set_index('unique_stop_id', drop=True, inplace=True) + stops_df.index.name = 'node_id' + return stops_df + + @pytest.fixture def stop_times(): data = { - 'unique_agency_id': ['citytrains'] * 25, + 'unique_agency_id': ['citytrains'] * 
35, 'trip_id': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd', 'd', - 'e', 'e', 'e', 'e', 'e'], - 'stop_id': str(range(25)), + 'e', 'e', 'e', 'e', 'e', + 'f', 'f', 'f', 'f', 'f', + 'g', 'g', 'g', 'g', 'g'], + 'stop_id': range(1, 36), 'departure_time_sec': [1, 2, np.nan, np.nan, 5, 1, 2, 3, 4, np.nan, np.nan, np.nan, 3, 4, np.nan, 1, 2, 3, 4, 5, - 1, np.nan, 3, 4, np.nan], - 'stop_sequence': [1, 2, 3, 4, 5] * 5 + 1, np.nan, 3, 4, np.nan, + 1, np.nan, 3, 4, 5, + np.nan, 2, 3, 4, 5], + 'stop_sequence': [1, 2, 3, 4, 5] * 7 } - index = range(25) + index = range(35) df = pd.DataFrame(data, index) + df['stop_id'] = df['stop_id'].astype('str') + return df @pytest.fixture def calendar(): data = { - 'unique_agency_id': ['citytrains'] * 4, - 'trip_id': ['a', 'b', 'c', 'e'] + 'unique_agency_id': ['citytrains'] * 6, + 'trip_id': ['a', 'b', 'c', 'e', 'f', 'g'] } - index = range(4) + index = range(6) df = pd.DataFrame(data, index) return df @@ -105,9 +184,151 @@ def stop_times_interpolated(): return df +@pytest.fixture +def transit_edge_from_feed_wo_calendar_dates(): + data = { + 'node_id_from': ['1_agency_a_city_a', '2_agency_a_city_a', + '3_agency_a_city_a', '4_agency_a_city_a', + '5_agency_a_city_a'], + 'node_id_to': ['2_agency_a_city_a', '3_agency_a_city_a', + '4_agency_a_city_a', '5_agency_a_city_a', + '6_agency_a_city_a'], + 'weight': [300.0] * 5, + 'unique_agency_id': ['agency_a_city_a'] * 5, + 'unique_trip_id': ['a3_agency_a_city_a'] * 5, + 'sequence': range(1, 6), + 'id': ['a3_agency_a_city_a_1', 'a3_agency_a_city_a_2', + 'a3_agency_a_city_a_3', 'a3_agency_a_city_a_4', + 'a3_agency_a_city_a_5'], + } + index = range(5) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_1( + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2): + # represents df prior to being post-processed downstream + df = expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2.copy() + df.drop(columns=['route_type'], inplace=True) + # convert weight from min to sec to represent df prior to post-process step + df['weight'] = 300.0 + return df + + +@pytest.fixture +def expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2(): + # represents df after it has been post-processed downstream + data = { + 'node_id_from': ['1_agency_a_city_a', '2_agency_a_city_a', + '3_agency_a_city_a', '4_agency_a_city_a', + '5_agency_a_city_a'], + 'node_id_to': ['2_agency_a_city_a', '3_agency_a_city_a', + '4_agency_a_city_a', '5_agency_a_city_a', + '6_agency_a_city_a'], + 'weight': [5.0] * 5, + 'unique_agency_id': ['agency_a_city_a'] * 5, + 'unique_trip_id': ['a3_agency_a_city_a'] * 5, + 'sequence': range(1, 6), + 'id': ['a3_agency_a_city_a_1', 'a3_agency_a_city_a_2', + 'a3_agency_a_city_a_3', 'a3_agency_a_city_a_4', + 'a3_agency_a_city_a_5'], + 'route_type': [3] * 5 + } + index = range(5) + df = pd.DataFrame(data, index) + # raw data are read as int32 + df['sequence'] = df['sequence'].astype('int32') + return df + + +@pytest.fixture +def expected_final_transit_edge_from_feed_wo_calendar_dates( + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2): + data = { + 'unique_route_id': ['10-101_agency_a_city_a'] * 5, + 'net_type': ['transit'] * 5 + } + index = range(5) + df = pd.DataFrame(data, index) + df = pd.concat( + [expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2, df], + axis=1) + return df + + +@pytest.fixture +def 
expected_transit_node_from_feed_wo_calendar_dates(): + data = { + 'node_id': ['1_agency_a_city_a', '2_agency_a_city_a', + '3_agency_a_city_a', '4_agency_a_city_a', + '5_agency_a_city_a', '6_agency_a_city_a'], + 'x': [-122.265609, -122.224274, -122.271604, -122.269029, -122.267227, + -122.251793], + 'y': [37.797484, 37.774963, 37.803664, 37.80787, 37.828415, 37.844601], + 'unique_agency_id': ['agency_a_city_a'] * 6, + 'route_type': [3] * 6, + 'stop_id': range(1, 7), + 'stop_name': ['ave a', 'ave b', 'ave c', 'ave d', 'ave e', 'ave f'], + 'wheelchair_boarding': [1, 0, 0, 0, 0, 0], + 'location_type': [1] * 6 + } + index = range(6) + + df = pd.DataFrame(data, index) + df['stop_id'] = df['stop_id'].astype('str') + df.set_index('node_id', drop=True, inplace=True) + return df + + +@pytest.fixture +def edge_route_type_impedance_df(): + data = { + 'weight': [2, 2, 2, 3, 3, 3, 5, 5, 5, 5], + 'route_type': [1, 1, 1, 2, 2, 2, 3, 3, 3, 3] + } + index = range(10) + + df = pd.DataFrame(data, index) + + return df + + +@pytest.fixture() +def hdf5_file_on_disk_gtfsfeeds_dfs( + tmpdir, + gtfs_feed_wo_calendar_dates, + selected_int_stop_times_from_feed_wo_calendar_dates): + hdf5_dict = {'stop_times': gtfs_feed_wo_calendar_dates.stop_times, + 'stops': gtfs_feed_wo_calendar_dates.stops, + 'routes': gtfs_feed_wo_calendar_dates.routes, + 'trips': gtfs_feed_wo_calendar_dates.trips, + 'calendar': gtfs_feed_wo_calendar_dates.calendar, + 'stop_times_int': + selected_int_stop_times_from_feed_wo_calendar_dates} + hdf5_save_path = os.path.join(tmpdir.strpath, 'test_hdf5_load') + hdf5_file = os.path.join(hdf5_save_path, 'test_file.h5') + os.makedirs(hdf5_save_path) + print('writing test HDF5 to: {}'.format(hdf5_file)) + # create the HDF5 + store = pd.HDFStore(hdf5_file) + store.close() + # add keys and DFs to HDF5 + for key, df in hdf5_dict.items(): + store = pd.HDFStore(hdf5_file, mode='r') + store.close() + df.to_hdf(hdf5_file, key=key, mode='a', format='table') + return hdf5_save_path + + def test_create_transit_net_wo_calendar_dates( - tmpdir, gtfs_feed_wo_calendar_dates, - expected_urbanaccess_network_keys): + gtfs_feed_wo_calendar_dates, + expected_urbanaccess_network_keys, + expected_final_transit_edge_from_feed_wo_calendar_dates): + expected_result = \ + expected_final_transit_edge_from_feed_wo_calendar_dates.copy() transit_net = gtfs_network.create_transit_net( gtfs_feed_wo_calendar_dates, day='monday', timerange=['07:00:00', '10:00:00'], @@ -115,22 +336,67 @@ def test_create_transit_net_wo_calendar_dates( overwrite_existing_stop_times_int=False, use_existing_stop_times_int=False, save_processed_gtfs=False, - save_dir=tmpdir, + save_dir=None, save_filename=None) assert isinstance(transit_net, urbanaccess_network) urbanaccess_network_info = vars(transit_net) expected_dfs = ['transit_nodes', 'transit_edges'] - assert expected_urbanaccess_network_keys == list( - urbanaccess_network_info.keys()).sort() + assert expected_urbanaccess_network_keys == sorted(list( + urbanaccess_network_info.keys())) for key, value in urbanaccess_network_info.items(): assert isinstance(value, pd.core.frame.DataFrame) # check that df is not empty if key in expected_dfs: assert value.empty is False + result_edge = transit_net.transit_edges.copy() + # test that output df is identical to expected df + result_edge = result_edge.reindex( + sorted(result_edge.columns), axis=1) + expected_result = expected_result.reindex( + sorted(expected_result.columns), axis=1) + assert result_edge.equals(expected_result) -def 
test_create_transit_net_wo_req_file( - tmpdir, gtfs_feed_wo_calendar_dates): + +def test_create_transit_net_wo_direction_id( + gtfs_feed_wo_calendar_dates, + expected_urbanaccess_network_keys, + expected_final_transit_edge_from_feed_wo_calendar_dates): + expected_result = \ + expected_final_transit_edge_from_feed_wo_calendar_dates.copy() + # remove 'direction_id' col for test + gtfs_feed_wo_calendar_dates.trips.drop( + columns=['direction_id'], inplace=True) + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['07:00:00', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + assert isinstance(transit_net, urbanaccess_network) + urbanaccess_network_info = vars(transit_net) + expected_dfs = ['transit_nodes', 'transit_edges'] + assert expected_urbanaccess_network_keys == sorted(list( + urbanaccess_network_info.keys())) + for key, value in urbanaccess_network_info.items(): + assert isinstance(value, pd.core.frame.DataFrame) + # check that df is not empty + if key in expected_dfs: + assert value.empty is False + + result_edge = transit_net.transit_edges.copy() + # test that output df is identical to expected df + result_edge = result_edge.reindex( + sorted(result_edge.columns), axis=1) + expected_result = expected_result.reindex( + sorted(expected_result.columns), axis=1) + assert result_edge.equals(expected_result) + + +def test_create_transit_net_wo_req_file(gtfs_feed_wo_calendar_dates): # set trips df to blank df for test gtfs_feed_wo_calendar_dates.trips = pd.DataFrame() with pytest.raises(ValueError) as excinfo: @@ -141,35 +407,239 @@ def test_create_transit_net_wo_req_file( overwrite_existing_stop_times_int=False, use_existing_stop_times_int=False, save_processed_gtfs=False, - save_dir=tmpdir, + save_dir=None, save_filename=None) expected_error = ( - "one of the following gtfsfeeds_dfs objects trips, stops, " + "One of the following gtfsfeeds_dfs objects: trips, stops, " "or stop_times were found to be empty.") assert expected_error in str(excinfo.value) def test_create_transit_net_wo_calendar_and_calendar_dates( - tmpdir, gtfs_feed_wo_calendar_dates): + gtfs_feed_wo_calendar_dates): # set calendar_dates and calendar dfs to blank df for test gtfs_feed_wo_calendar_dates.calendar_dates = pd.DataFrame() gtfs_feed_wo_calendar_dates.calendar = pd.DataFrame() with pytest.raises(ValueError) as excinfo: transit_net = gtfs_network.create_transit_net( gtfs_feed_wo_calendar_dates, day='monday', - timerange=['07:00:00', '10:00:00'], + timerange=['07:00:00', '11:00:00'], calendar_dates_lookup=None, overwrite_existing_stop_times_int=False, use_existing_stop_times_int=False, save_processed_gtfs=False, - save_dir=tmpdir, + save_dir=None, save_filename=None) expected_error = ( - "one of the following gtfsfeeds_dfs objects calendar or " + "One of the following gtfsfeeds_dfs objects: calendar or " "calendar_dates were found to be empty.") assert expected_error in str(excinfo.value) +def test_create_transit_net_invalid_params(gtfs_feed_wo_calendar_dates): + msg = ('starttime and endtime are not in the correct format. 
' + 'Format should be a 24 hour clock in the following format: ' + '08:00:00 or 17:00:00.') + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['7:00:0', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + expected_error = ("['7:00:0', '10:00:00'] {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + expected_error = ("['10:00:00'] {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange='10:00:00', + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + expected_error = ("10:00:00 {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=[100000, 170000], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + expected_error = ("[100000, 170000] {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['10:00:00', '07:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + expected_error = ("['10:00:00', '07:00:00'] {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['07:00:00', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=2, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + expected_error = "overwrite_existing_stop_times_int must be bool." + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['07:00:00', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=2, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + expected_error = "use_existing_stop_times_int must be bool." 
+ assert expected_error in str(excinfo.value) with pytest.raises(ValueError) as excinfo: transit_net = gtfs_network.create_transit_net( gtfs_feed_wo_calendar_dates, day='monday', timerange=['07:00:00', '10:00:00'], calendar_dates_lookup=None, overwrite_existing_stop_times_int=False, use_existing_stop_times_int=False, save_processed_gtfs=2, save_dir=None, save_filename=None) expected_error = "save_processed_gtfs must be bool." assert expected_error in str(excinfo.value) with pytest.raises(ValueError) as excinfo: transit_net = gtfs_network.create_transit_net( None, day='monday', timerange=['07:00:00', '10:00:00'], calendar_dates_lookup=None, overwrite_existing_stop_times_int=False, use_existing_stop_times_int=False, save_processed_gtfs=False, save_dir=None, save_filename=None) expected_error = "gtfsfeeds_dfs must be an urbanaccess_gtfs_df object." assert expected_error in str(excinfo.value) with pytest.raises(ValueError) as excinfo: transit_net = gtfs_network.create_transit_net( gtfs_feed_wo_calendar_dates, day='monday', timerange=['07:00:00', '10:00:00'], calendar_dates_lookup=None, overwrite_existing_stop_times_int=True, use_existing_stop_times_int=True, save_processed_gtfs=False, save_dir=None, save_filename=None) expected_error = ('overwrite_existing_stop_times_int and ' 'use_existing_stop_times_int cannot both be True.') assert expected_error in str(excinfo.value) def test_create_transit_net_overwrite_stop_times_int_True( gtfs_feed_wo_calendar_dates, selected_int_stop_times_from_feed_wo_calendar_dates): # populate stop_times_int for test that is different from the one that # would be calculated df = selected_int_stop_times_from_feed_wo_calendar_dates.copy() df['timediff'] = df['timediff'] * 2 gtfs_feed_wo_calendar_dates.stop_times_int = df transit_net = gtfs_network.create_transit_net( gtfs_feed_wo_calendar_dates, day='monday', timerange=['07:00:00', '10:00:00'], calendar_dates_lookup=None, overwrite_existing_stop_times_int=True, use_existing_stop_times_int=False, save_processed_gtfs=False, save_dir=None, save_filename=None) # values should be different given overwrite_existing_stop_times_int = True assert gtfs_feed_wo_calendar_dates.stop_times_int['timediff'].equals( df['timediff']) is False def test_create_transit_net_use_existing_stop_times_int_True( gtfs_feed_wo_calendar_dates, selected_int_stop_times_from_feed_wo_calendar_dates): # populate stop_times_int for test that is different from the one that # would be calculated df = selected_int_stop_times_from_feed_wo_calendar_dates.copy() df['timediff'] = df['timediff'] * 2 gtfs_feed_wo_calendar_dates.stop_times_int = df transit_net = gtfs_network.create_transit_net( gtfs_feed_wo_calendar_dates, day='monday', timerange=['07:00:00', '10:00:00'], calendar_dates_lookup=None, overwrite_existing_stop_times_int=False, use_existing_stop_times_int=True, save_processed_gtfs=False, save_dir=None, save_filename=None) # values should be the same since use_existing_stop_times_int = True assert gtfs_feed_wo_calendar_dates.stop_times_int['timediff'].equals( df['timediff']) def test_create_transit_net_save_processed_gtfs_True( tmpdir, gtfs_feed_wo_calendar_dates): dir_path = os.path.join(tmpdir.strpath, 'test_hdf5_save') os.makedirs(dir_path) print('preparing test dir: {}'.format(dir_path)) transit_net = gtfs_network.create_transit_net( gtfs_feed_wo_calendar_dates, day='monday', timerange=['07:00:00', '10:00:00'], calendar_dates_lookup=None, overwrite_existing_stop_times_int=False, use_existing_stop_times_int=False, save_processed_gtfs=True, save_dir=dir_path, save_filename='test_file.h5') # test that file was written as expected file_list = glob.glob(r"{}/*.h5".format(dir_path)) file_path = file_list[0] file_name = os.path.basename(file_path) assert file_name == 'test_file.h5' # test HDF5 store expected_keys = {'/calendar', '/routes', '/stop_times', '/stop_times_int', '/stops', '/trips'} with pd.HDFStore(file_path) as store: result_keys = set(store.keys()) assert result_keys == expected_keys # check that data exists in each DataFrame for key in expected_keys: df = store[key] assert df.empty is False def test_interpolator(stop_times, calendar): df = gtfs_network._interpolate_stop_times(stop_times, calendar) @@ -180,8 +650,8 @@ def test_interpolator(stop_times, calendar): assert df.loc[df.trip_id == 'a', 'departure_time_sec_interpolate'].tolist() == [1, 2, 3, 4, 5] - # trip 'b' should be skipped because it has only one null value - # but its null value should be removed + # trip 'b' should be skipped because it has only one null value and + # it's in the last position, but its null value should be removed assert df.loc[df.trip_id == 'b', 'departure_time_sec_interpolate'].tolist() == [1, 2, 3, 4] @@ -198,29 +668,330 @@ def test_interpolator(stop_times, calendar): assert df.loc[df.trip_id == 'e', 'departure_time_sec_interpolate'].tolist() == [1, 2, 3, 4] + # TODO: This is a rare and unlikely case that should be supported + # in the future and when addressed we expect [1, 2, 3, 4, 5] for trip 'f' + # in that case trip 'f' would be interpolated fully and + # the one NA in the middle of the sequence would be filled + # trip 'f' should be skipped because it has only one null value and + # it's not a first or last value in the sequence, but its null value + # should be removed + assert df.loc[df.trip_id == 'f', + 'departure_time_sec_interpolate'].tolist() == [1, 3, 4, 5] + + # trip 'g' should be interpolated + # no starting value, so first time removed + # NaN values should be removed from start + assert df.loc[df.trip_id == 'g', + 'departure_time_sec_interpolate'].tolist() == [2, 3, 4, 5] + + +def test_interpolator_w_missing_stop_sequence(stop_times, calendar): + # create nulls in stop_times 'stop_sequence' col + stop_times['stop_sequence'][1:4] = np.nan + stop_times['stop_sequence'][10:12] = np.nan + with pytest.raises(ValueError) as excinfo: + df = gtfs_network._interpolate_stop_times(stop_times, calendar) + expected_error = ("Found duplicate values when values from stop_sequence " + "and unique_trip_id are combined. Check values in " + "these columns for trip_id(s): " + "['a_citytrains', 'c_citytrains'].") + assert expected_error in str(excinfo.value) + + +def test_interpolator_w_mismatch_trip_ids(stop_times, calendar): + # append whitespace to trip_ids so they no longer match + stop_times['trip_id'] = stop_times['trip_id'] + ' ' + + with pytest.raises(ValueError) as excinfo: + df = gtfs_network._interpolate_stop_times(stop_times, calendar) + expected_error = ("No matching trip_ids where found. " "Suggest checking for differences between trip_id " "values in stop_times and trips GTFS files.") + assert expected_error in str(excinfo.value) + + +def test_interpolator_w_index_as_col(stop_times, calendar): + # set name on index that also exists as a col to run test + stop_times.index.rename('unique_agency_id', inplace=True) + df = gtfs_network._interpolate_stop_times(stop_times, calendar) + # no errors should occur so only need to check df is not empty + assert df.empty is False + def test_skip_interpolator(stop_times, calendar): series = pd.Series(data=[1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5], - index=range(25), - name='departure_time_sec') - + index=range(35), + name='departure_time_sec') stop_times['departure_time_sec'] = series - df = gtfs_network._interpolate_stop_times(stop_times, calendar) # everything should be the same, # with one row dropped for calendar day filter assert df.departure_time_sec_interpolate.tolist() == [1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5] def test_trip_schedule_selector_wo_cal_dates(gtfs_feed_wo_calendar_dates): expected_result = gtfs_feed_wo_calendar_dates.trips.copy() # create expected trips result expected_result.reset_index(drop=True, inplace=True) expected_result = expected_result.iloc[0:8] result = gtfs_network._trip_schedule_selector( input_trips_df=gtfs_feed_wo_calendar_dates.trips, input_calendar_df=gtfs_feed_wo_calendar_dates.calendar, input_calendar_dates_df=gtfs_feed_wo_calendar_dates.calendar_dates, day='monday', calendar_dates_lookup=None) assert len(result) == 8 assert result.equals(expected_result) def test_trip_schedule_selector_wo_cal_dates_wo_direction_id( gtfs_feed_wo_calendar_dates): # remove 'direction_id' col for test trips_df = gtfs_feed_wo_calendar_dates.trips.copy() trips_df.drop(columns=['direction_id'], inplace=True) expected_result = gtfs_feed_wo_calendar_dates.trips.copy() # create expected trips result expected_result.reset_index(drop=True, inplace=True) expected_result.drop(columns=['direction_id'], inplace=True) expected_result = expected_result.iloc[0:8] result = gtfs_network._trip_schedule_selector( input_trips_df=trips_df, input_calendar_df=gtfs_feed_wo_calendar_dates.calendar, input_calendar_dates_df=gtfs_feed_wo_calendar_dates.calendar_dates, day='monday', calendar_dates_lookup=None) assert len(result) == 8 assert result.equals(expected_result) def test_trip_schedule_selector_w_cal_dates(gtfs_feed_wo_calendar): expected_result = gtfs_feed_wo_calendar.trips.copy() # create expected trips result expected_result = expected_result.iloc[4:10] expected_result.reset_index(drop=True, inplace=True) result = gtfs_network._trip_schedule_selector( input_trips_df=gtfs_feed_wo_calendar.trips, input_calendar_df=gtfs_feed_wo_calendar.calendar, input_calendar_dates_df=gtfs_feed_wo_calendar.calendar_dates, day='sunday', calendar_dates_lookup={'schedule_type': 'WE', 'service_id': ['weekday-3', 'weekday-2']}) assert len(result) == 6 assert result.equals(expected_result) def test_trip_schedule_selector_w_cal_and_cal_dates( gtfs_feed_w_calendar_and_calendar_dates): trips_df = gtfs_feed_w_calendar_and_calendar_dates.trips.copy() cal_df = gtfs_feed_w_calendar_and_calendar_dates.calendar.copy() cal_dates_df = 
gtfs_feed_w_calendar_and_calendar_dates.calendar_dates \ + .copy() + expected_result = gtfs_feed_w_calendar_and_calendar_dates.trips.copy() + result = gtfs_network._trip_schedule_selector( + input_trips_df=trips_df, + input_calendar_df=cal_df, + input_calendar_dates_df=cal_dates_df, + day='monday', + calendar_dates_lookup={'schedule_type': 'WE'}) + + assert len(result) == 10 + assert result.equals(expected_result) + + +def test_trip_schedule_selector_w_cal_and_cal_dates_wo_lookup( + gtfs_feed_w_calendar_and_calendar_dates): + trips_df_1 = gtfs_feed_w_calendar_and_calendar_dates.trips.copy() + cal_df = gtfs_feed_w_calendar_and_calendar_dates.calendar.copy() + cal_dates_df_1 = gtfs_feed_w_calendar_and_calendar_dates.calendar_dates \ + .copy() + # create extra records in trips and calendar_dates for a different agency + # that do not exist in the calendar table + trips_df_2 = trips_df_1.copy() + trips_df_2['unique_agency_id'] = trips_df_2['unique_agency_id'] + '_x' + trips_df_2['unique_feed_id'] = trips_df_2['unique_feed_id'] + '_x' + trips_df_2 = trips_df_2.iloc[0:8] + trips_df_x2 = pd.concat( + [trips_df_1, trips_df_2], axis=0, + ignore_index=True) + cal_dates_df_2 = cal_dates_df_1.copy() + cal_dates_df_2['unique_agency_id'] = \ + cal_dates_df_2['unique_agency_id'] + '_x' + cal_dates_df_2['unique_feed_id'] = \ + cal_dates_df_2['unique_feed_id'] + '_x' + cal_dates_df_x2 = pd.concat( + [cal_dates_df_1, cal_dates_df_2], axis=0, + ignore_index=True) + # create expected trips result + expected_result = trips_df_1.copy() + expected_result = expected_result.iloc[0:8] + result = gtfs_network._trip_schedule_selector( + input_trips_df=trips_df_x2, + input_calendar_df=cal_df, + input_calendar_dates_df=cal_dates_df_x2, + day='monday', + calendar_dates_lookup=None) + + assert len(result) == 8 + assert result.equals(expected_result) + + +def test_trip_schedule_selector_wo_cal_dates_invalid_params( + gtfs_feed_wo_calendar_dates): + gtfs_feed = gtfs_feed_wo_calendar_dates + # test with invalid 'day' param + with pytest.raises(ValueError) as excinfo: + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed.trips, + input_calendar_df=gtfs_feed.calendar, + input_calendar_dates_df=gtfs_feed.calendar_dates, + day='monday ', + calendar_dates_lookup=None) + expected_error = ( + "Incorrect day specified. Must be one of lowercase strings: 'monday'," + " 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'.") + assert expected_error in str(excinfo.value) + # test with invalid 'calendar_dates_lookup' param + with pytest.raises(ValueError) as excinfo: + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed.trips, + input_calendar_df=gtfs_feed.calendar, + input_calendar_dates_df=gtfs_feed.calendar_dates, + day='monday', + calendar_dates_lookup=['invalid']) + expected_error = "calendar_dates_lookup parameter must be a dictionary." + assert expected_error in str(excinfo.value) + # test with invalid 'calendar_dates_lookup' param + with pytest.raises(ValueError) as excinfo: + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed.trips, + input_calendar_df=gtfs_feed.calendar, + input_calendar_dates_df=gtfs_feed.calendar_dates, + day='monday', + calendar_dates_lookup={1: 'WD'}) + expected_error = "calendar_dates_lookup key: 1 must be a string." 
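# A minimal sketch (not the library implementation) of the service-day
# selection logic these _trip_schedule_selector tests exercise, assuming
# toy GTFS-shaped DataFrames rather than the fixtures above. The composite
# key mirrors the 'unique_service_id' pattern asserted in the tests.
import pandas as pd

toy_calendar = pd.DataFrame({
    'service_id': ['weekday-1', 'saturday-1'],
    'monday': [1, 0],
    'unique_agency_id': ['agency_a_city_a'] * 2})
toy_trips = pd.DataFrame({
    'trip_id': ['a1', 'a2', 'b1'],
    'service_id': ['weekday-1', 'weekday-1', 'saturday-1'],
    'unique_agency_id': ['agency_a_city_a'] * 3})

# build the composite service key: service_id + '_' + unique_agency_id
for toy_df in (toy_calendar, toy_trips):
    toy_df['unique_service_id'] = toy_df['service_id'].str.cat(
        toy_df['unique_agency_id'], sep='_')

# keep service IDs that run on the requested day, then subset trips to them
active = toy_calendar.loc[toy_calendar['monday'] == 1, 'unique_service_id']
monday_trips = toy_trips[toy_trips['unique_service_id'].isin(active)]
assert monday_trips['trip_id'].tolist() == ['a1', 'a2']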
+ assert expected_error in str(excinfo.value) + # test with invalid 'calendar_dates_lookup' param + with pytest.raises(ValueError) as excinfo: + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed.trips, + input_calendar_df=gtfs_feed.calendar, + input_calendar_dates_df=gtfs_feed.calendar_dates, + day='monday', + calendar_dates_lookup={'schedule_type': 1}) + expected_error = ("calendar_dates_lookup value: 1 must be a " + "string or a list of strings.") + assert expected_error in str(excinfo.value) + # test with invalid 'calendar_dates_lookup' param + with pytest.raises(ValueError) as excinfo: + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed.trips, + input_calendar_df=gtfs_feed.calendar, + input_calendar_dates_df=gtfs_feed.calendar_dates, + day='monday', + calendar_dates_lookup={'schedule_type': ['WD', 1]}) + expected_error = ("calendar_dates_lookup value: ['WD', 1] " + "must contain strings.") + assert expected_error in str(excinfo.value) + + +def test_trip_schedule_selector_w_cal_dates_invalid_params_1( + gtfs_feed_wo_calendar_dates): + # test with empty 'calendar_dates'df + with pytest.raises(ValueError) as excinfo: + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed_wo_calendar_dates.trips, + input_calendar_df=gtfs_feed_wo_calendar_dates.calendar, + input_calendar_dates_df=gtfs_feed_wo_calendar_dates.calendar_dates, + day='monday', + calendar_dates_lookup={'schedule_type': 'WD'}) + expected_error = ("calendar_dates_df is empty. Unable to use the " + "calendar_dates_lookup parameter.") + assert expected_error in str(excinfo.value) + + +def test_trip_schedule_selector_w_cal_dates_invalid_params_2( + gtfs_feed_wo_calendar): + # create invalid data in calendar dates file + cal_dates_df = gtfs_feed_wo_calendar.calendar_dates.copy() + series = pd.Series( + data=[1, 1, 2, 2], index=range(4), + name='invalid_dtype') + cal_dates_df['invalid_dtype'] = series + series = pd.Series( + data=[10, 11, 10, 'aa'], index=range(4), + name='day_type') + cal_dates_df['day_type'] = series + + # test with invalid col in 'calendar_dates_lookup' param + with pytest.raises(ValueError) as excinfo: + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed_wo_calendar.trips, + input_calendar_df=gtfs_feed_wo_calendar.calendar, + input_calendar_dates_df=cal_dates_df, + day='monday', + calendar_dates_lookup={'invalid_col': 'WD'}) + expected_error = ("Column: invalid_col not found in calendar_dates " + "dataframe.") + assert expected_error in str(excinfo.value) + # test with invalid col dtype in 'calendar_dates_lookup' param + with pytest.raises(ValueError) as excinfo: + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed_wo_calendar.trips, + input_calendar_df=gtfs_feed_wo_calendar.calendar, + input_calendar_dates_df=cal_dates_df, + day='monday', + calendar_dates_lookup={'invalid_dtype': '1'}) + expected_error = ("Column: invalid_dtype must be object type.") + assert expected_error in str(excinfo.value) + + +def test_time_selector(selected_int_stop_times_from_feed_wo_calendar_dates): + timerange = ['08:20:00', '08:35:00'] + stop_times_int = selected_int_stop_times_from_feed_wo_calendar_dates.copy() + result = gtfs_network._time_selector( + df=stop_times_int, + starttime=timerange[0], + endtime=timerange[1]) + + # create expected subset result + expected_result = stop_times_int.loc[14:15] + assert len(result) == 2 + assert result.equals(expected_result) + + +def 
test_time_difference(selected_int_stop_times_from_feed_wo_calendar_dates): + expected_result = \ + selected_int_stop_times_from_feed_wo_calendar_dates.copy() + stop_times_int = selected_int_stop_times_from_feed_wo_calendar_dates.copy() + # create the 'stop_times_int' df expected + stop_times_int.drop(columns=['timediff'], inplace=True) + result = gtfs_network._time_difference(stop_times_df=stop_times_int) + + assert 'timediff' in result.columns + # all rows in sequence should not be null + assert result['timediff'][1:6].isnull().sum() == 0 + # only the first row in sequence should be null + assert result['timediff'][0:1].isnull().sum() == 1 + assert result.equals(expected_result) + + +def test_format_transit_net_edge_test_1(stop_times_interpolated): df = gtfs_network._format_transit_net_edge(stop_times_interpolated) # length of edge df should be 16 @@ -231,7 +1002,8 @@ def test_edge_reformatter(stop_times_interpolated): # edge df should have these columns and no null values for col in ['node_id_from', 'node_id_to', 'weight']: - assert col in df.columns and df[col].isnull().values.any() == False # noqa + assert col in df.columns and df[ + col].isnull().values.any() == False # noqa # there should be 4 edges per trip id for i, row in df.groupby('unique_trip_id').size().iteritems(): @@ -249,3 +1021,339 @@ def test_edge_reformatter(stop_times_interpolated): 'unique_trip_id'][11] and \ df['unique_agency_id'][8] == stop_times_interpolated[ 'unique_agency_id'][11] # noqa + + +def test_format_transit_net_edge_test_2( + selected_int_stop_times_from_feed_wo_calendar_dates, + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_1): + expected_result = \ + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_1.copy() + + # create the 'selected_interpolated_stop_times_df' that is expected + stop_times_int = selected_int_stop_times_from_feed_wo_calendar_dates.copy() + # there are no missing time values in the test data so just use + # 'departure_time_sec' to generate the timediff col for the test + stop_times_int['timediff'] = stop_times_int.groupby('unique_trip_id')[ + 'departure_time_sec'].diff() + result = gtfs_network._format_transit_net_edge(stop_times_int) + + # test that output df is identical to expected df + result = result.reindex( + sorted(result.columns), axis=1) + expected_result = expected_result.reindex( + sorted(expected_result.columns), axis=1) + assert result.equals(expected_result) + + +def test_convert_imp_time_units( + transit_edge_from_feed_wo_calendar_dates): + # test with minutes + result_min = gtfs_network._convert_imp_time_units( + df=transit_edge_from_feed_wo_calendar_dates, + time_col='weight', convert_to='minutes') + expected_weight_as_min = pd.Series( + data=[5.0] * 5, index=range(5), name='weight') + assert result_min['weight'].equals(expected_weight_as_min) + + # test with seconds + # convert original weight of min to sec + transit_edge_from_feed_wo_calendar_dates['weight'] = expected_weight_as_min + result_sec = gtfs_network._convert_imp_time_units( + df=transit_edge_from_feed_wo_calendar_dates, + time_col='weight', convert_to='seconds') + expected_weight_as_sec = pd.Series( + data=[300.0] * 5, index=range(5), name='weight') + assert result_sec['weight'].equals(expected_weight_as_sec) + + +def test_convert_imp_time_units_invalid_params( + transit_edge_from_feed_wo_calendar_dates): + # test with invalid 'convert_to' param name + with pytest.raises(ValueError) as excinfo: + result_min = gtfs_network._convert_imp_time_units( + 
df=transit_edge_from_feed_wo_calendar_dates, + time_col='weight', convert_to='minutes_invalid') + expected_error = ("minutes_invalid is not a valid value " + "or is not a string.") + assert expected_error in str(excinfo.value) + # test with invalid 'convert_to' dtype + with pytest.raises(ValueError) as excinfo: + result_min = gtfs_network._convert_imp_time_units( + df=transit_edge_from_feed_wo_calendar_dates, + time_col='weight', convert_to=22) + expected_error = "22 is not a valid value or is not a string." + assert expected_error in str(excinfo.value) + + +def test_stops_in_edge_table_selector( + gtfs_feed_wo_calendar_dates, + selected_int_stop_times_from_feed_wo_calendar_dates): + # created expected result + expected_result = gtfs_feed_wo_calendar_dates.stops[0:6] + expected_result['unique_stop_id'] = ( + expected_result['stop_id'].str.cat( + expected_result['unique_agency_id'].astype('str'), sep='_')) + + result = gtfs_network._stops_in_edge_table_selector( + input_stops_df=gtfs_feed_wo_calendar_dates.stops, + input_stop_times_df=selected_int_stop_times_from_feed_wo_calendar_dates + ) + + assert 'unique_stop_id' in result.columns + assert result['unique_stop_id'].isnull().sum() == 0 + assert result.equals(expected_result) + + +def test_format_transit_net_nodes( + selected_stops_from_feed_wo_calendar_dates, + expected_transit_node_from_feed_wo_calendar_dates): + expected_result = expected_transit_node_from_feed_wo_calendar_dates.copy() + expected_cols = ['x', 'y', 'unique_agency_id', 'route_type', 'stop_id', + 'stop_name'] + + result = gtfs_network._format_transit_net_nodes( + df=selected_stops_from_feed_wo_calendar_dates) + + for col in expected_cols: + assert col in result.columns + assert result[col].isnull().sum() == 0 + assert result.index.name == 'node_id' + assert result.index.isnull().sum() == 0 + # round result to ensure decimal place match + result['x'] = result['x'].round(decimals=6) + result['y'] = result['y'].round(decimals=6) + # test that output df is identical to expected df + # re-sort cols so they are in same order for test + expected_result.sort_index(axis=1, inplace=True) + result.sort_index(axis=1, inplace=True) + assert result.equals(expected_result) + + +def test_route_type_to_edge( + gtfs_feed_wo_calendar_dates, + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2): + expected_result = \ + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2.copy() + input_edge_df = \ + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2.copy() + + # 'route_type' is added in this function and is not expected to already + # exist + input_edge_df.drop(columns=['route_type'], inplace=True) + + result = gtfs_network._route_type_to_edge( + transit_edge_df=input_edge_df, + stop_time_df=gtfs_feed_wo_calendar_dates.stop_times) + assert 'route_type' in result.columns + assert result['route_type'].isnull().sum() == 0 + # re-sort cols so they are in same order for test + expected_result.sort_index(axis=1, inplace=True) + result.sort_index(axis=1, inplace=True) + assert result.equals(expected_result) + + +def test_route_id_to_edge( + gtfs_feed_wo_calendar_dates, + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2): + expected_result = \ + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2.copy() + series = pd.Series( + data=['10-101_agency_a_city_a'] * 5, index=range(5), + name='unique_route_id') + expected_result['unique_route_id'] = series + input_edge_df = \ + 
expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2.copy() + + result = gtfs_network._route_id_to_edge( + transit_edge_df=input_edge_df, + trips_df=gtfs_feed_wo_calendar_dates.trips) + assert 'unique_route_id' in result.columns + assert result['unique_route_id'].isnull().sum() == 0 + assert result.equals(expected_result) + + +def test_check_if_index_name_in_cols_False( + selected_stops_from_feed_wo_calendar_dates): + result = gtfs_network._check_if_index_name_in_cols( + selected_stops_from_feed_wo_calendar_dates) + assert isinstance(result, bool) + assert result is False + + +def test_check_if_index_name_in_cols_True( + selected_stops_from_feed_wo_calendar_dates): + selected_stops_from_feed_wo_calendar_dates.reset_index(inplace=True) + selected_stops_from_feed_wo_calendar_dates.set_index( + 'node_id', drop=False, inplace=True) + + result = gtfs_network._check_if_index_name_in_cols( + selected_stops_from_feed_wo_calendar_dates) + assert isinstance(result, bool) + assert result is True + + +def test_edge_impedance_by_route_type(edge_route_type_impedance_df): + df = edge_route_type_impedance_df.copy() + result = gtfs_network.edge_impedance_by_route_type( + edge_route_type_impedance_df, + underground_rail=0.5, + intercity_rail=-0.5) + # route_id 1 weight should increase via multiplier + assert (result.weight.iloc[0:3] == df.weight.iloc[0:3] + ( + df.weight.iloc[0:3] * 0.5)).all() + # route_id 2 weight should decrease via multiplier + assert (result.weight.iloc[3:6] == df.weight.iloc[3:6] + ( + df.weight.iloc[3:6] * -0.5)).all() + # route_id 3 weight should not change + assert (result.weight.iloc[6:9] == df.weight.iloc[6:9]).all() + + +def test_edge_impedance_by_route_type_invalid_params( + edge_route_type_impedance_df): + # test with multiplier outside of optimal range + result = gtfs_network.edge_impedance_by_route_type( + edge_route_type_impedance_df, + underground_rail=2.0, + intercity_rail=-3.0) + # should return a result even if multiplier is not in optimal range + assert result.empty is False + # test with weight param as invalid dtype + with pytest.raises(ValueError) as excinfo: + result = gtfs_network.edge_impedance_by_route_type( + edge_route_type_impedance_df, + travel_time_col_name=2, + underground_rail=0.5, + intercity_rail=-0.5) + expected_error = "travel_time_col_name must be a string." + assert expected_error in str(excinfo.value) + # test with weight param as invalid dtype + # create str weight column + edge_route_type_impedance_df['travel_time'] = '1' + with pytest.raises(ValueError) as excinfo: + result = gtfs_network.edge_impedance_by_route_type( + edge_route_type_impedance_df, + travel_time_col_name='travel_time', + underground_rail=0.5, + intercity_rail=-0.5) + expected_error = "travel_time must be a number." + assert expected_error in str(excinfo.value) + # test with weight column that cant be found in DataFrame + with pytest.raises(ValueError) as excinfo: + result = gtfs_network.edge_impedance_by_route_type( + edge_route_type_impedance_df, + travel_time_col_name='time', + underground_rail=0.5, + intercity_rail=-0.5) + expected_error = ("Column: time was not found in " + "transit_edge_df DataFrame and is required.") + assert expected_error in str(excinfo.value) + # test with multiplier value as str + with pytest.raises(ValueError) as excinfo: + result = gtfs_network.edge_impedance_by_route_type( + edge_route_type_impedance_df, + underground_rail='1', + intercity_rail=-0.5) + expected_error = "One or more multiplier variables are not float." 
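# Worked example: the impedance adjustment asserted in these tests is plain
# proportional scaling, new_weight = weight + weight * multiplier, so a
# 10-minute edge with a 0.5 multiplier becomes 15 minutes and a -0.5
# multiplier halves it to 5. The snippet below is a standalone sketch of
# that arithmetic with a made-up route_type-to-multiplier mapping; it is
# not the signature of edge_impedance_by_route_type itself.
import pandas as pd

toy_edges = pd.DataFrame({'weight': [10.0, 10.0, 10.0],
                          'route_type': [1, 2, 3]})
toy_multipliers = {1: 0.5, 2: -0.5}  # route types without an entry pass through
factor = toy_edges['route_type'].map(toy_multipliers).fillna(0.0)
toy_edges['weight'] = toy_edges['weight'] + toy_edges['weight'] * factor
assert toy_edges['weight'].tolist() == [15.0, 5.0, 10.0]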
+ assert expected_error in str(excinfo.value) + # test with route type that is not found in DataFrame + result = gtfs_network.edge_impedance_by_route_type( + edge_route_type_impedance_df, + underground_rail=0.5, + funicular=-0.5) + # should return a result even if route type is not found in DataFrame + assert result.empty is False + + +def test_save_processed_gtfs_data( + tmpdir, + selected_int_stop_times_from_feed_wo_calendar_dates, + gtfs_feed_wo_calendar_dates): + # add stop_times_int to UA object which is required for saving HDF5 + gtfs_feed_wo_calendar_dates.stop_times_int = \ + selected_int_stop_times_from_feed_wo_calendar_dates + dir_path = os.path.join(tmpdir.strpath, 'test_hdf5_save') + os.makedirs(dir_path) + print('preparing test dir: {}'.format(dir_path)) + + gtfs_network.save_processed_gtfs_data( + gtfs_feed_wo_calendar_dates, + filename='test_file.h5', dir=dir_path) + # test that file was written as expected + file_list = glob.glob(r"{}/*.h5".format(dir_path)) + file_path = file_list[0] + file_name = os.path.basename(file_path) + assert file_name == 'test_file.h5' + # test HDF5 store + expected_keys = {'/calendar', '/routes', '/stop_times', '/stop_times_int', + '/stops', '/trips'} + with pd.HDFStore(file_path) as store: + result_keys = set(store.keys()) + assert result_keys == expected_keys + # check that data exists in each DataFrame + for key in expected_keys: + df = store[key] + assert df.empty is False + + +def test_save_processed_gtfs_data_invalid_params( + tmpdir, + gtfs_feed_wo_calendar_dates, + selected_int_stop_times_from_feed_wo_calendar_dates): + dir_path = os.path.join(tmpdir.strpath, 'test_hdf5_save') + os.makedirs(dir_path) + print('preparing test dir: {}'.format(dir_path)) + # test with missing req DataFrame: stop_times_int + gtfs_feed_wo_calendar_dates.stop_times_int = pd.DataFrame() + with pytest.raises(ValueError) as excinfo: + gtfs_network.save_processed_gtfs_data( + gtfs_feed_wo_calendar_dates, + filename='test_file.h5', dir=dir_path) + expected_error = ('gtfsfeeds_dfs is missing required ' + 'DataFrame: stop_times_int.') + assert expected_error in str(excinfo.value) + + # set stop_times_int df for test + gtfs_feed_wo_calendar_dates.stop_times_int = \ + selected_int_stop_times_from_feed_wo_calendar_dates + # set calendar df to blank df for test + gtfs_feed_wo_calendar_dates.calendar = pd.DataFrame() + with pytest.raises(ValueError) as excinfo: + gtfs_network.save_processed_gtfs_data( + gtfs_feed_wo_calendar_dates, + filename='test_file.h5', dir=dir_path) + expected_error = ('gtfsfeeds_dfs is missing either the calendar or ' + 'calendar_dates DataFrame.') + assert expected_error in str(excinfo.value) + + # test with incorrect dtype as param + with pytest.raises(ValueError) as excinfo: + gtfs_network.save_processed_gtfs_data( + 'invalid_param', + filename='test_file.h5', dir=dir_path) + expected_error = ('gtfsfeeds_dfs must be an urbanaccess_gtfs_df ' + 'object.') + assert expected_error in str(excinfo.value) + + +def test_load_processed_gtfs_data( + hdf5_file_on_disk_gtfsfeeds_dfs, expected_gtfsfeeds_dfs_keys): + gtfsfeeds_dfs = gtfs_network.load_processed_gtfs_data( + filename='test_file.h5', dir=hdf5_file_on_disk_gtfsfeeds_dfs) + assert isinstance(gtfsfeeds_dfs, urbanaccess_gtfs_df) + urbanaccess_gtfs_df_info = vars(gtfsfeeds_dfs) + + assert expected_gtfsfeeds_dfs_keys == sorted( + list(urbanaccess_gtfs_df_info.keys())) + # headways and calendar_dates were not written to HDF5 so we dont + # expect them in this test + expected_dfs = ['stops', 'routes', 
'trips', 'stop_times', 'calendar', + 'stop_times_int'] + expected_dfs_empty = ['calendar_dates', 'headways'] + for key, value in urbanaccess_gtfs_df_info.items(): + assert isinstance(value, pd.core.frame.DataFrame) + # check that df is not empty + if key in expected_dfs: + assert value.empty is False + # check that df is empty + if key in expected_dfs_empty: + assert value.empty From 01ccf7202d8cd1312c0c355f91286a3fafa1c26b Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 17:09:10 -0700 Subject: [PATCH 23/38] fix minor typos --- urbanaccess/config.py | 6 ++-- urbanaccess/gtfs/headways.py | 4 +-- urbanaccess/gtfs/network.py | 42 +++++++++++++------------- urbanaccess/gtfs/utils_format.py | 40 ++++++++++++------------ urbanaccess/gtfsfeeds.py | 4 +-- urbanaccess/network.py | 23 +++++++------- urbanaccess/osm/network.py | 2 +- urbanaccess/plot.py | 12 ++++---- urbanaccess/tests/test_gtfs_network.py | 4 +-- urbanaccess/utils.py | 8 ++--- 10 files changed, 72 insertions(+), 73 deletions(-) diff --git a/urbanaccess/config.py b/urbanaccess/config.py index 273030c..9902ff2 100644 --- a/urbanaccess/config.py +++ b/urbanaccess/config.py @@ -4,7 +4,7 @@ def _format_check(settings): """ - Check the format of a urbanaccess_config object. + Check the format of an urbanaccess_config object. Parameters ---------- @@ -84,7 +84,7 @@ def __init__(self, def from_yaml(cls, configdir='configs', yamlname='urbanaccess_config.yaml'): """ - Create a urbanaccess_config instance from a saved YAML configuration. + Create an urbanaccess_config instance from a saved YAML configuration. Parameters ---------- @@ -143,7 +143,7 @@ def to_dict(self): def to_yaml(self, configdir='configs', yamlname='urbanaccess_config.yaml', overwrite=False): """ - Save a urbanaccess_config representation to a YAML file. + Save an urbanaccess_config representation to a YAML file. 
Parameters ---------- diff --git a/urbanaccess/gtfs/headways.py b/urbanaccess/gtfs/headways.py index 0db8b08..2e4b8ca 100644 --- a/urbanaccess/gtfs/headways.py +++ b/urbanaccess/gtfs/headways.py @@ -87,7 +87,7 @@ def _headway_handler(interpolated_stop_times_df, trips_df, """ start_time = time.time() - # add unique trip and route id + # add unique trip and route ID trips_df['unique_trip_id'] = ( trips_df['trip_id'].str.cat( trips_df['unique_agency_id'].astype('str'), sep='_')) @@ -105,7 +105,7 @@ def _headway_handler(interpolated_stop_times_df, trips_df, trips_df = trips_df[columns] - # add unique route id + # add unique route ID routes_df['unique_route_id'] = ( routes_df['route_id'].str.cat( routes_df['unique_agency_id'].astype('str'), sep='_')) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index 06cd671..ba87c28 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -248,7 +248,7 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, raise ValueError('calendar_dates_lookup value: {} ' 'must contain strings.'.format(value)) - # create unique service ids + # create unique service IDs df_list = [input_trips_df, input_calendar_df] # if input_calendar_dates_df is not empty then add it to processing if input_calendar_dates_df.empty is False: @@ -259,7 +259,7 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, df['unique_agency_id'].astype('str'), sep='_')) df_list[index] = df - # select service ids where day specified has a 1 = service runs on that day + # select service IDs where day specified has a 1 = service runs on that day log('Using calendar to extract service_ids to select trips...') input_calendar_df = input_calendar_df[(input_calendar_df[day] == 1)] input_calendar_df = input_calendar_df[['unique_service_id']] @@ -380,8 +380,8 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, input_calendar_df = input_calendar_df.append(subset_result_df) input_calendar_df.drop_duplicates(inplace=True) - # select and create df of trips that match the service ids for the day of - # the week specified merge calendar df that has service ids for + # select and create df of trips that match the service IDs for the day of + # the week specified merge calendar df that has service IDs for # specified day with trips df calendar_selected_trips_df = input_trips_df.loc[ input_trips_df['unique_service_id'].isin( @@ -430,7 +430,7 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): start_time = time.time() - # create unique trip ids + # create unique trip IDs df_list = [calendar_selected_trips_df, stop_times_df] for index, df in enumerate(df_list): @@ -451,10 +451,10 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): stop_times_df.sort_values(by=['unique_trip_id', 'stop_sequence'], inplace=True) - # make list of unique trip ids from the calendar_selected_trips_df + # make list of unique trip IDs from the calendar_selected_trips_df uniquetriplist = calendar_selected_trips_df[ 'unique_trip_id'].unique().tolist() - # select trip ids that match the trips in the + # select trip IDs that match the trips in the # calendar_selected_trips_df -- resulting df will be stop times # only for trips that run on the service day or dates of interest stop_times_df = stop_times_df[ @@ -495,11 +495,11 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): # TODO: for the rare and unlikely case when there is 1 null record and # its not the first or last stop in the stop sequence, that 
value - # should be interpolated and its trip id should be added to those to be + # should be interpolated and its trip ID should be added to those to be # interpolated - this additional case would have to be benchmarked # for speed to ensure it doesnt slow down existing process # Find trips with more than one missing time - # Note: all trip ids have at least 1 null departure time because the + # Note: all trip IDs have at least 1 null departure time because the # last stop in a trip is always null null_times = stop_times_df[stop_times_df.departure_time_sec.isnull()] trips_with_null = null_times.unique_trip_id.value_counts() @@ -620,7 +620,7 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): final_stop_times_df['departure_time_sec_interpolate'] = \ final_stop_times_df['departure_time_sec_interpolate'].astype(int) - # add unique stop id + # add unique stop ID final_stop_times_df['unique_stop_id'] = ( final_stop_times_df['stop_id'].str.cat( final_stop_times_df['unique_agency_id'].astype('str'), sep='_')) @@ -649,7 +649,7 @@ def _time_difference(stop_times_df): """ start_time = time.time() - # calculate difference between consecutive records grouping by trip id. + # calculate difference between consecutive records grouping by trip ID stop_times_df['timediff'] = stop_times_df.groupby('unique_trip_id')[ 'departure_time_sec_interpolate'].diff() log('Difference between stop times has been successfully calculated. ' @@ -746,11 +746,11 @@ def _format_transit_net_edge(stop_times_df): "weight": tmp_trip_df['timediff'].iloc[1:].values, "unique_agency_id": tmp_trip_df[ 'unique_agency_id'].iloc[1:].values, - # set unique trip id without edge order to join other data later + # set unique trip ID without edge order to join other data later "unique_trip_id": trip }) - # Set current trip id to edge id column adding edge order at + # Set current trip ID to edge ID column adding edge order at # end of string edge_df['sequence'] = (edge_df.index + 1).astype(int) @@ -824,12 +824,12 @@ def _stops_in_edge_table_selector(input_stops_df, input_stop_times_df): """ start_time = time.time() - # add unique stop id + # add unique stop ID input_stops_df['unique_stop_id'] = ( input_stops_df['stop_id'].str.cat( input_stops_df['unique_agency_id'].astype('str'), sep='_')) - # Select stop ids that match stop ids in the subset stop time data that + # Select stop IDs that match stop IDs in the subset stop time data that # match day and time selection selected_stops_df = input_stops_df.loc[ input_stops_df['unique_stop_id'].isin( @@ -859,7 +859,7 @@ def _format_transit_net_nodes(df): """ start_time = time.time() - # add unique stop id + # add unique stop ID if 'unique_stop_id' not in df.columns: df['unique_stop_id'] = ( df['stop_id'].str.cat( @@ -880,7 +880,7 @@ def _format_transit_net_nodes(df): col_list.append(item) final_node_df = pd.concat([final_node_df, df[col_list]], axis=1) - # set node index to be unique stop id + # set node index to be unique stop ID final_node_df = final_node_df.set_index('node_id') log('Stop time table transformation to Pandana format node table ' @@ -907,7 +907,7 @@ def _route_type_to_edge(transit_edge_df, stop_time_df): """ start_time = time.time() - # create unique trip ids + # create unique trip IDs stop_time_df['unique_trip_id'] = ( stop_time_df['trip_id'].str.cat( stop_time_df['unique_agency_id'].astype('str'), sep='_')) @@ -933,7 +933,7 @@ def _route_type_to_edge(transit_edge_df, stop_time_df): def _route_id_to_edge(transit_edge_df, trips_df): """ - Append route ids to 
transit edge table + Append route IDs to transit edge table Parameters ---------- @@ -950,7 +950,7 @@ def _route_id_to_edge(transit_edge_df, trips_df): start_time = time.time() if 'unique_route_id' not in transit_edge_df.columns: - # create unique trip and route ids + # create unique trip and route IDs trips_df['unique_trip_id'] = ( trips_df['trip_id'].str.cat( trips_df['unique_agency_id'].astype('str'), sep='_')) @@ -962,7 +962,7 @@ def _route_id_to_edge(transit_edge_df, trips_df): transit_edge_df, trips_df[['unique_trip_id', 'unique_route_id']], how='left', on='unique_trip_id', sort=False, copy=False) - log('Route id successfully joined to transit edges. ' + log('Route ID successfully joined to transit edges. ' 'Took {:,.2f} seconds.'.format(time.time() - start_time)) return transit_edge_df_with_routes diff --git a/urbanaccess/gtfs/utils_format.py b/urbanaccess/gtfs/utils_format.py index d285d38..638a001 100644 --- a/urbanaccess/gtfs/utils_format.py +++ b/urbanaccess/gtfs/utils_format.py @@ -125,7 +125,7 @@ def _read_gtfs_trips(textfile_path, textfile): 'service_id': object, 'route_id': object, 7: object}, low_memory=False) - # 7 is placeholder for shape id which may not exist in some txt files + # 7 is placeholder for shape ID which may not exist in some txt files if len(df) == 0: raise ValueError('{} has no records'.format(os.path.join( textfile_path, textfile))) @@ -249,7 +249,7 @@ def _read_gtfs_calendar_dates(textfile_path, textfile): def _calendar_dates_agencyid(calendar_dates_df, routes_df, trips_df, agency_df, feed_folder): """ - Assign unique agency id to calendar dates dataframe + Assign unique agency ID to calendar dates dataframe Parameters ---------- @@ -312,7 +312,7 @@ def _calendar_dates_agencyid(calendar_dates_df, routes_df, def _calendar_agencyid(calendar_df, routes_df, trips_df, agency_df, feed_folder): """ - Assign unique agency id to calendar dataframe + Assign unique agency ID to calendar dataframe Parameters ---------- @@ -335,7 +335,7 @@ def _calendar_agencyid(calendar_df, routes_df, trips_df, sort=False, copy=False) tmp2 = pd.merge(trips_df, tmp1, how='left', on='route_id', sort=False, copy=False) - # do another merge to account for service ids that may not be utilized + # do another merge to account for service IDs that may not be utilized # across all GTFS files for accounting purposes so we keep those that # dont show up after merge merged_df = pd.merge(calendar_df[['service_id']], tmp2, how='left', @@ -377,7 +377,7 @@ def _calendar_agencyid(calendar_df, routes_df, trips_df, def _trips_agencyid(trips_df, routes_df, agency_df): """ - Assign unique agency id to trips dataframe + Assign unique agency ID to trips dataframe Parameters ---------- @@ -409,7 +409,7 @@ def _trips_agencyid(trips_df, routes_df, agency_df): def _stops_agencyid(stops_df, trips_df, routes_df, stop_times_df, agency_df, feed_folder): """ - Assign unique agency id to stops dataframe + Assign unique agency ID to stops dataframe Parameters ---------- @@ -475,7 +475,7 @@ def _stops_agencyid(stops_df, trips_df, routes_df, def _routes_agencyid(routes_df, agency_df): """ - Assign unique agency id to routes dataframe + Assign unique agency ID to routes dataframe Parameters ---------- @@ -503,7 +503,7 @@ def _routes_agencyid(routes_df, agency_df): def _stop_times_agencyid(stop_times_df, routes_df, trips_df, agency_df): """ - Assign unique agency id to stop times dataframe + Assign unique agency ID to stop times dataframe Parameters ---------- @@ -541,7 +541,7 @@ def _add_unique_agencyid(agency_df, 
stops_df, routes_df,
                          trips_df, stop_times_df, calendar_df,
                          feed_folder, calendar_dates_df, nulls_as_folder=True):
     """
-    Create a unique agency id for all gtfs feed dataframes to enable unique
+    Create a unique agency ID for all gtfs feed dataframes to enable unique
     relational table keys
 
     Parameters
     ----------
@@ -563,8 +563,8 @@ def _add_unique_agencyid(agency_df, stops_df, routes_df,
     calendar_dates_df : pandas:DataFrame
         calendar dates dataframe
     nulls_as_folder : bool, optional
-        if true, gtfs feeds where the agency id is null, the gtfs folder
-        name will be used as the unique agency id
+        if True, for GTFS feeds where the agency ID is null, the GTFS
+        folder name will be used as the unique agency ID
     Returns
     -------
     stops_df, routes_df, trips_df, stop_times_df, calendar_df,
@@ -589,7 +589,7 @@ def _add_unique_agencyid(agency_df, stops_df, routes_df,
             df_list[index] = df
 
         log('The agency.txt or agency_id column was not found. The unique '
-            'agency id: {} was generated using the name of the folder '
+            'agency ID: {} was generated using the name of the folder '
             'containing the GTFS feed text files.'.format(
                 unique_agency_id))
 
@@ -623,7 +623,7 @@ def _add_unique_agencyid(agency_df, stops_df, routes_df,
                 df['unique_agency_id'] = unique_agency_id
                 df_list[index] = df
             log(
-                'The unique agency id: {} was generated using the name of '
+                'The unique agency ID: {} was generated using the name of '
                 'the agency in the agency.txt file.'.format(
                     unique_agency_id))
 
@@ -693,7 +693,7 @@ def _add_unique_agencyid(agency_df, stops_df, routes_df,
 
             log(
                 'agency.txt agency_name column has more than one agency name '
-                'listed. Unique agency id was assigned using the agency id '
+                'listed. Unique agency ID was assigned using the agency ID '
                 'and associated agency name.')
 
             for index, df in enumerate(df_list):
@@ -706,7 +706,7 @@ def _add_unique_agencyid(agency_df, stops_df, routes_df,
                     inplace=True)
                 log(
                     'There are {:,} null values ({:,.2f}% of {:,} total) without '
-                    'a unique agency id. These records will be labeled as '
+                    'a unique agency ID. These records will be labeled as '
                     'multiple_operators_ with the GTFS file folder '
                     'name'.format(df['unique_agency_id'].isnull().sum(),
                                   len(df),
@@ -718,7 +718,7 @@ def _add_unique_agencyid(agency_df, stops_df, routes_df,
     if calendar_dates_df.empty:
         df_list.extend([calendar_dates_df])
 
-    log('Unique agency id operation complete. Took {:,.2f} seconds'.format(
+    log('Unique agency ID operation complete. Took {:,.2f} seconds'.format(
         time.time() - start_time))
 
     return df_list
@@ -727,7 +727,7 @@ def _add_unique_gtfsfeed_id(stops_df, routes_df, trips_df, stop_times_df,
                             calendar_df, calendar_dates_df, feed_folder,
                             feed_number):
     """
-    Create a unique GTFS feed specific id for all gtfs feed dataframes to
+    Create a unique GTFS feed specific ID for all gtfs feed dataframes to
     enable tracking of specific feeds
 
     Parameters
@@ -776,7 +776,7 @@ def _add_unique_gtfsfeed_id(stops_df, routes_df, trips_df,
     if calendar_dates_df.empty:
         df_list.extend([calendar_dates_df])
 
-    log('Unique GTFS feed id operation complete. Took {:,.2f} seconds'.format(
+    log('Unique GTFS feed ID operation complete. Took {:,.2f} seconds'.format(
         time.time() - start_time))
 
     return df_list
@@ -1069,7 +1069,7 @@ def _append_route_type(stops_df, stop_times_df, routes_df,
 
 def _generate_unique_agency_id(df, col_name):
     """
-    Generate unique agency id
+    Generate unique agency ID
 
     Parameters
     ----------
@@ -1092,7 +1092,7 @@ def _generate_unique_agency_id(df, col_name):
 
 def _generate_unique_feed_id(feed_folder):
     """
-    Generate unique feed id
+    Generate unique feed ID
 
     Parameters
     ----------
diff --git a/urbanaccess/gtfsfeeds.py b/urbanaccess/gtfsfeeds.py
index 92b4062..53a047d 100644
--- a/urbanaccess/gtfsfeeds.py
+++ b/urbanaccess/gtfsfeeds.py
@@ -39,7 +39,7 @@ def from_yaml(cls, gtfsfeeddir=os.path.join(config.settings.data_folder,
                                'gtfsfeeds'), yamlname='gtfsfeeds.yaml'):
         """
-        Create a urbanaccess_gtfsfeeds instance from a saved YAML.
+        Create an urbanaccess_gtfsfeeds instance from a saved YAML.
 
         Parameters
         ----------
@@ -206,7 +206,7 @@ def to_yaml(self, gtfsfeeddir=os.path.join(config.settings.data_folder,
                 yamlname='gtfsfeeds.yaml', overwrite=False):
         """
-        Save a urbanaccess_gtfsfeeds representation to a YAML file.
+        Save an urbanaccess_gtfsfeeds representation to a YAML file.
 
         Parameters
         ----------
diff --git a/urbanaccess/network.py b/urbanaccess/network.py
index b1089bc..1ec6f53 100644
--- a/urbanaccess/network.py
+++ b/urbanaccess/network.py
@@ -18,7 +18,7 @@ class urbanaccess_network(object):
     """
-    A urbanaccess object of Pandas DataFrames representing
+    An urbanaccess object of Pandas DataFrames representing
     the components of a graph network
 
     Parameters
@@ -301,7 +301,7 @@ def _route_id_to_node(stops_df, edges_w_routes):
     stops_df : pandas.DataFrame
         processed gtfs stops DataFrame
     edges_w_routes : pandas.DataFrame
-        transit edge DataFrame that has route id information
+        transit edge DataFrame that has route ID information
 
     Returns
     -------
@@ -310,7 +310,7 @@
     """
     start_time = time.time()
 
-    # create unique stop ids
+    # create unique stop IDs
     stops_df['unique_stop_id'] = (
         stops_df['stop_id'].str.cat(
             stops_df['unique_agency_id'].astype('str'), sep='_'))
@@ -346,7 +346,7 @@
     transit_nodes_wroutes.drop_duplicates(subset='node_id_route',
                                           keep='first',
                                           inplace=True)
 
-    # set node index to be unique stop id
+    # set node index to be unique stop ID
     transit_nodes_wroutes = transit_nodes_wroutes.set_index('node_id_route')
 
     log(
@@ -370,7 +370,7 @@ def _connector_edges(osm_nodes, transit_nodes, travel_speed_mph=3):
         transit nodes DataFrame
     travel_speed_mph : int, optional
         travel speed to use to calculate travel time across a
-        distance on a edge. units are in miles per hour (MPH)
+        distance on an edge. units are in miles per hour (MPH)
         for pedestrian travel this is assumed to be 3 MPH
 
     Returns
@@ -422,9 +422,8 @@ def _connector_edges(osm_nodes, transit_nodes, travel_speed_mph=3):
     """
     Perform final formatting on nodes and edge DataFrames to prepare them
-    for use in Pandana.
-    Formatting mainly consists of creating a unique node id and edge from
-    and to id that is an integer
+    for use in Pandana. Formatting mainly consists of creating a unique
+    node ID and edge from and to ID that is an integer
     per Pandana requirements.
 
     Parameters
     ----------
     edge_df : pandas.DataFrame
         edges dataframe
     node_df : pandas.DataFrame
         nodes dataframe
@@ -441,7 +440,7 @@
     """
     start_time = time.time()
 
-    # pandana requires ids that are integer: for nodes - make it the index,
+    # Pandana requires IDs that are integer: for nodes - make it the index,
     # for edges make it the from and to columns
     node_df['id_int'] = range(1, len(node_df) + 1)
 
@@ -461,7 +460,7 @@
         try:
             edge_df_wnumericid[col] = edge_df_wnumericid[col].astype(str)
         # deal with edge cases where typically the name of a street is not
-        # in a uniform string encoding such as names with accents
+        # in a uniform string encoding, such as names with accents
         except UnicodeEncodeError:
             log('Fixed unicode error in {} column'.format(col))
             edge_df_wnumericid[col] = edge_df_wnumericid[col].str.encode(
@@ -473,7 +472,7 @@
     if 'nearest_osm_node' in node_df.columns:
         node_df.drop(['nearest_osm_node'], axis=1, inplace=True)
 
-    log('Edge and node tables formatted for Pandana with integer node ids: '
+    log('Edge and node tables formatted for Pandana with integer node IDs: '
         'id_int, to_int, and from_int. Took {:,.2f} seconds'.format(
             time.time() - start_time))
     return edge_df_wnumericid, node_df
@@ -483,7 +482,7 @@ def save_network(urbanaccess_network, filename,
                  dir=config.settings.data_folder,
                  overwrite_key=False, overwrite_hdf5=False):
     """
-    Write a urbanaccess_network integrated nodes and edges to a node and edge
+    Write an urbanaccess_network's integrated nodes and edges to a node and edge
     table in a HDF5 file
 
     Parameters
diff --git a/urbanaccess/osm/network.py b/urbanaccess/osm/network.py
index e5f6fd3..3f3779a 100644
--- a/urbanaccess/osm/network.py
+++ b/urbanaccess/osm/network.py
@@ -18,7 +18,7 @@ def create_osm_net(osm_edges, osm_nodes,
         osm node dataframe
     travel_speed_mph : int, optional
         travel speed to use to calculate travel time across a
-        distance on a edge. units are in miles per hour (MPH)
+        distance on an edge. units are in miles per hour (MPH)
         for pedestrian travel this is assumed to be 3 MPH
     network_type : str, optional
         default is 'walk' for the osm pedestrian network.
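# Worked example: the docstrings above describe how 'travel_speed_mph'
# turns a connector-edge distance into a travel-time weight. In minutes
# that is distance_miles * 60 / speed_mph, so at the default walking speed
# of 3 MPH a 0.25-mile connector edge weighs 0.25 * 60 / 3 = 5 minutes.
# The helper below is a hypothetical standalone version of that
# arithmetic, not the library's code.

def connector_weight_minutes(distance_miles, travel_speed_mph=3.0):
    # minutes = miles * (60 minutes per hour) / (miles per hour)
    return distance_miles * 60.0 / travel_speed_mph

assert connector_weight_minutes(0.25) == 5.0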
diff --git a/urbanaccess/plot.py b/urbanaccess/plot.py index ab24594..7288a40 100644 --- a/urbanaccess/plot.py +++ b/urbanaccess/plot.py @@ -27,9 +27,9 @@ def plot_net(nodes, edges, x_col=None, y_col=None, from_col=None, y_col : str, optional y coordinate column in nodes dataframe from_col : str, optional - name of column to use for 'from' node id + name of column to use for 'from' node ID to_col : str, optional - name of column to use for 'to' node id + name of column to use for 'to' node ID bbox : tuple, optional Bounding box formatted as a 4 element tuple: (lng_max, lat_min, lng_min, lat_max) @@ -248,9 +248,9 @@ def _prep_edges(edges, nodes, from_col, to_col, nodes : pandas.DataFrame edges : pandas.DataFrame from_col : string - name of column to use for 'from' node id + name of column to use for 'from' node ID to_col : string - name of column to use for 'to' node id + name of column to use for 'to' node ID x_col : string name of column to use for 'x' node coordinates y_col : string @@ -259,8 +259,8 @@ def _prep_edges(edges, nodes, from_col, to_col, Returns ------- edges_wline : pandas.DataFrame - the edge dataframe with from and to x y coordinates and - ids to build lines + the edge dataframe with from and to x and y coordinates and + IDs to build lines """ if x_col not in nodes.columns or y_col not in nodes.columns: diff --git a/urbanaccess/tests/test_gtfs_network.py b/urbanaccess/tests/test_gtfs_network.py index 13bfb50..c847956 100644 --- a/urbanaccess/tests/test_gtfs_network.py +++ b/urbanaccess/tests/test_gtfs_network.py @@ -997,7 +997,7 @@ def test_format_transit_net_edge_test_1(stop_times_interpolated): # length of edge df should be 16 assert len(df) == 16 - # sequence id should be numeric starting at 1 and end at 4 for each trip + # sequence ID should be numeric starting at 1 and end at 4 for each trip assert df['sequence'][0] == 1 and df['sequence'][3] == 4 # edge df should have these columns and no null values @@ -1005,7 +1005,7 @@ def test_format_transit_net_edge_test_1(stop_times_interpolated): assert col in df.columns and df[ col].isnull().values.any() == False # noqa - # there should be 4 edges per trip id + # there should be 4 edges per trip ID for i, row in df.groupby('unique_trip_id').size().iteritems(): assert row == 4 diff --git a/urbanaccess/utils.py b/urbanaccess/utils.py index 7626ea7..d34197d 100644 --- a/urbanaccess/utils.py +++ b/urbanaccess/utils.py @@ -181,12 +181,12 @@ def create_hdf5(dir=None, filename=None, overwrite_hdf5=False): def df_to_hdf5(data=None, key=None, overwrite_key=False, dir=None, filename=None, overwrite_hdf5=False): """ - Write a pandas dataframe to a table in a HDF5 file + Write a Pandas dataframe to a table in a HDF5 file Parameters ---------- data : pandas.DataFrame - pandas dataframe to save to a HDF5 table + Pandas dataframe to save to a HDF5 table key : string name of table to save dataframe as in the HDF5 file overwrite_key : bool, optional @@ -230,7 +230,7 @@ def df_to_hdf5(data=None, key=None, overwrite_key=False, dir=None, def hdf5_to_df(dir=None, filename=None, key=None): """ - Read data from a HDF5 file to a pandas dataframe + Read data from a HDF5 file to a Pandas dataframe Parameters ---------- @@ -239,7 +239,7 @@ def hdf5_to_df(dir=None, filename=None, key=None): filename : string name of the HDF5 file with .h5 extension to read from key : string - table inside the HDF5 file to return as a pandas dataframe + table inside the HDF5 file to return as a Pandas dataframe Returns ------- From 
b2b506517e6b625ffccb41fdb79a46599b7cda82 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 1 Apr 2021 17:15:50 -0700 Subject: [PATCH 24/38] pycodestyle fixes and unit test update --- urbanaccess/tests/test_gtfs_network.py | 4 ++-- urbanaccess/tests/test_gtfs_utils_validation.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/urbanaccess/tests/test_gtfs_network.py b/urbanaccess/tests/test_gtfs_network.py index c847956..9926f6e 100644 --- a/urbanaccess/tests/test_gtfs_network.py +++ b/urbanaccess/tests/test_gtfs_network.py @@ -726,8 +726,8 @@ def test_skip_interpolator(stop_times, calendar): 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5], - index = range(35), - name = 'departure_time_sec') + index=range(35), + name='departure_time_sec') stop_times['departure_time_sec'] = series df = gtfs_network._interpolate_stop_times(stop_times, calendar) diff --git a/urbanaccess/tests/test_gtfs_utils_validation.py b/urbanaccess/tests/test_gtfs_utils_validation.py index 643dd3b..a43a5ca 100644 --- a/urbanaccess/tests/test_gtfs_utils_validation.py +++ b/urbanaccess/tests/test_gtfs_utils_validation.py @@ -2,6 +2,7 @@ import urbanaccess.gtfs.utils_validation as utils_validation + def test_check_time_range_format(): utils_validation._check_time_range_format(['07:00:00', '10:00:00']) From 92954840dc6136431e8277b2a1073cee569efbd8 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Fri, 2 Apr 2021 10:07:25 -0700 Subject: [PATCH 25/38] debug travis, add run time profile to test --- urbanaccess/tests/test_gtfs_network.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/urbanaccess/tests/test_gtfs_network.py b/urbanaccess/tests/test_gtfs_network.py index 9926f6e..8afc371 100644 --- a/urbanaccess/tests/test_gtfs_network.py +++ b/urbanaccess/tests/test_gtfs_network.py @@ -1,5 +1,6 @@ import pytest import os +import time import glob import pandas as pd import numpy as np @@ -355,6 +356,8 @@ def test_create_transit_net_wo_calendar_dates( sorted(result_edge.columns), axis=1) expected_result = expected_result.reindex( sorted(expected_result.columns), axis=1) + print(result_edge.head()) + print(expected_result.head()) assert result_edge.equals(expected_result) @@ -393,6 +396,8 @@ def test_create_transit_net_wo_direction_id( sorted(result_edge.columns), axis=1) expected_result = expected_result.reindex( sorted(expected_result.columns), axis=1) + print(result_edge.head()) + print(expected_result.head()) assert result_edge.equals(expected_result) @@ -641,7 +646,11 @@ def test_create_transit_net_save_processed_gtfs_True( def test_interpolator(stop_times, calendar): + # profile run times as _interpolate_stop_times() is a + # function that is critical to have fast run times + start_time = time.time() df = gtfs_network._interpolate_stop_times(stop_times, calendar) + print('Run time: {}'.format(time.time() - start_time)) # unique_trip_id should be generated assert df.loc[1, 'unique_trip_id'] == 'a_citytrains' @@ -1042,6 +1051,8 @@ def test_format_transit_net_edge_test_2( sorted(result.columns), axis=1) expected_result = expected_result.reindex( sorted(expected_result.columns), axis=1) + print(result.head()) + print(expected_result.head()) assert result.equals(expected_result) From 566a6bafbb52988f9e34f4df06431d9554f6fb07 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Fri, 2 Apr 2021 10:56:18 -0700 Subject: [PATCH 26/38] debug travis --- urbanaccess/tests/test_gtfs_network.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/urbanaccess/tests/test_gtfs_network.py 
b/urbanaccess/tests/test_gtfs_network.py index 8afc371..4f15a15 100644 --- a/urbanaccess/tests/test_gtfs_network.py +++ b/urbanaccess/tests/test_gtfs_network.py @@ -358,6 +358,10 @@ def test_create_transit_net_wo_calendar_dates( sorted(expected_result.columns), axis=1) print(result_edge.head()) print(expected_result.head()) + print(result_edge.dtypes) + print(expected_result.dtypes) + print(result_edge['weight'][0]) + print(expected_result['weight'][0]) assert result_edge.equals(expected_result) From 0a7ca7d48fe5a6202ad2ed7e2160f9142b6556be Mon Sep 17 00:00:00 2001 From: sablanchard Date: Fri, 2 Apr 2021 11:27:17 -0700 Subject: [PATCH 27/38] fix travis --- urbanaccess/tests/test_gtfs_network.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/urbanaccess/tests/test_gtfs_network.py b/urbanaccess/tests/test_gtfs_network.py index 4f15a15..6e01323 100644 --- a/urbanaccess/tests/test_gtfs_network.py +++ b/urbanaccess/tests/test_gtfs_network.py @@ -356,12 +356,9 @@ def test_create_transit_net_wo_calendar_dates( sorted(result_edge.columns), axis=1) expected_result = expected_result.reindex( sorted(expected_result.columns), axis=1) - print(result_edge.head()) - print(expected_result.head()) - print(result_edge.dtypes) - print(expected_result.dtypes) - print(result_edge['weight'][0]) - print(expected_result['weight'][0]) + # ensure 'sequence' is int32 for test as other OS sometimes reads this as + # int64 and will cause tests to fail when using equals() + result_edge['sequence'] = result_edge['sequence'].astype('int32') assert result_edge.equals(expected_result) @@ -400,8 +397,9 @@ def test_create_transit_net_wo_direction_id( sorted(result_edge.columns), axis=1) expected_result = expected_result.reindex( sorted(expected_result.columns), axis=1) - print(result_edge.head()) - print(expected_result.head()) + # ensure 'sequence' is int32 for test as other OS sometimes reads this as + # int64 and will cause tests to fail when using equals() + result_edge['sequence'] = result_edge['sequence'].astype('int32') assert result_edge.equals(expected_result) @@ -1055,8 +1053,9 @@ def test_format_transit_net_edge_test_2( sorted(result.columns), axis=1) expected_result = expected_result.reindex( sorted(expected_result.columns), axis=1) - print(result.head()) - print(expected_result.head()) + # ensure 'sequence' is int32 for test as other OS sometimes reads this as + # int64 and will cause tests to fail when using equals() + result['sequence'] = result['sequence'].astype('int32') assert result.equals(expected_result) From ef3482fee22a8798af1bc0ae5176b83beabc99d4 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Fri, 2 Apr 2021 15:34:31 -0700 Subject: [PATCH 28/38] debug travis py3.5 issue --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 7f77eae..1e6244e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,6 +21,7 @@ install: - conda create -n test-environment python=$TRAVIS_PYTHON_VERSION pyyaml --file requirements-dev.txt - source activate test-environment - conda info --all + - pip install 'numpy>=1.18' - pip install . 
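  # note (assumption): the 'numpy>=1.18' pin added above is not explained
  # beyond the commit message "debug travis py3.5 issue"; it presumably
  # forces the Python 3.5 build to resolve a newer NumPy than conda would
  # otherwise select, so the exact root cause is an inference here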
- pip list - pip show urbanaccess From f4068f8327a8a0de8974c6d4e881a8c8ea1027c2 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Tue, 6 Apr 2021 11:45:21 -0700 Subject: [PATCH 29/38] add time pad functionality, change time selector > to >= and < to <= --- urbanaccess/gtfs/network.py | 46 ++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index ba87c28..24ca777 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -144,7 +144,8 @@ def create_transit_net( selected_interpolated_stop_times_df = _time_selector( df=gtfsfeeds_dfs.stop_times_int, starttime=timerange[0], - endtime=timerange[1]) + endtime=timerange[1], + timerange_pad=timerange_pad) final_edge_table = _format_transit_net_edge( stop_times_df=selected_interpolated_stop_times_df[ @@ -658,7 +659,7 @@ def _time_difference(stop_times_df): return stop_times_df -def _time_selector(df, starttime, endtime): +def _time_selector(df, starttime, endtime, timerange_pad=None): """ Select stop times that fall within a specified time range @@ -669,7 +670,10 @@ def _time_selector(df, starttime, endtime): starttime : str 24 hour clock formatted time 1 endtime : str - 24 hour clock formatted time 2 + 24 hour clock formatted time 2, + timerange_pad: int, optional + integer indicating the number of hours to pad after the end of the + time interval specified in 'endtime' Returns ------- selected_stop_timesdf : pandas.DataFrame @@ -695,19 +699,39 @@ def _time_selector(df, starttime, endtime): end_s = int(str(endtime[6:8])) endtime_sec = (end_h * 60 * 60) + (end_m * 60) + end_s + # define timepad in seconds to include stops active after specified endtime + if timerange_pad: + end_h_wpad = str(end_h + timerange_pad) + end_h_wpad = end_h_wpad.zfill(2) + pad_str = '{}:{}:{}'.format(end_h_wpad, endtime[3:5], endtime[6:8]) + log(' Additional stop times active between the specified end time: ' + '{} with timerange_pad of: {} hour(s) (padded end time: {}) ' + 'will be selected...'.format( + endtime, timerange_pad, pad_str)) + pad = int(0 if timerange_pad is None else timerange_pad) * 3600 + # create df of stops times that are within the requested range selected_stop_timesdf = df[( - (starttime_sec < df["departure_time_sec_interpolate"]) & ( - df["departure_time_sec_interpolate"] < endtime_sec))] + (starttime_sec <= df["departure_time_sec_interpolate"]) & ( + df["departure_time_sec_interpolate"] <= endtime_sec + pad))] subset_df_count = len(selected_stop_timesdf) df_count = len(df) - log('Stop times from {} to {} successfully selected {:,} records out of ' - '{:,} total records ({:.2f} percent of total). ' - 'Took {:,.2f} seconds.'.format( - starttime, endtime, subset_df_count, df_count, - (subset_df_count / df_count) * 100, - time.time() - start_time)) + if timerange_pad: + log('Stop times from {} to {} (with time_pad end time: {}) ' + 'successfully selected {:,} records out of {:,} total records ' + '({:.2f} percent of total). ' + 'Took {:,.2f} seconds.'.format( + starttime, endtime, pad_str, subset_df_count, df_count, + (subset_df_count / df_count) * 100, + time.time() - start_time)) + else: + log('Stop times from {} to {} successfully selected {:,} records ' + 'out of {:,} total records ({:.2f} percent of total). 
' + 'Took {:,.2f} seconds.'.format( + starttime, endtime, subset_df_count, df_count, + (subset_df_count / df_count) * 100, + time.time() - start_time)) return selected_stop_timesdf From 60c00a92d4961c41263e9e4f604715866e9f1415 Mon Sep 17 00:00:00 2001 From: sablanchard Date: Tue, 6 Apr 2021 11:47:37 -0700 Subject: [PATCH 30/38] add time aware functionality --- urbanaccess/gtfs/network.py | 49 +++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index 24ca777..ec3e4e1 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -736,7 +736,7 @@ def _time_selector(df, starttime, endtime, timerange_pad=None): return selected_stop_timesdf -def _format_transit_net_edge(stop_times_df): +def _format_transit_net_edge(stop_times_df, time_aware=False): """ Format transit network data table to match the format required for edges in Pandana graph networks edges @@ -746,6 +746,10 @@ def _format_transit_net_edge(stop_times_df): stop_times_df : pandas.DataFrame interpolated stop times with travel time between stops for the subset time and day + time_aware: bool, optional + boolean to indicate whether the transit network should include + time information. If True, 'arrival_time' and 'departure_time' columns + from the stop_times table will be included in the transit edge table Returns ------- @@ -764,15 +768,38 @@ def _format_transit_net_edge(stop_times_df): inplace=True) for trip, tmp_trip_df in stop_times_df.groupby(['unique_trip_id']): - edge_df = pd.DataFrame({ - "node_id_from": tmp_trip_df['unique_stop_id'].iloc[:-1].values, - "node_id_to": tmp_trip_df['unique_stop_id'].iloc[1:].values, - "weight": tmp_trip_df['timediff'].iloc[1:].values, - "unique_agency_id": tmp_trip_df[ - 'unique_agency_id'].iloc[1:].values, - # set unique trip ID without edge order to join other data later - "unique_trip_id": trip - }) + # if 'time_aware', also create from and to arrival and departure time + # cols + if time_aware: + log(' time_aware is True, adding arrival and departure ' + 'stop times to edges...') + edge_df = pd.DataFrame({ + "node_id_from": tmp_trip_df['unique_stop_id'].iloc[:-1].values, + "node_id_to": tmp_trip_df['unique_stop_id'].iloc[1:].values, + "weight": tmp_trip_df['timediff'].iloc[1:].values, + "unique_agency_id": + tmp_trip_df['unique_agency_id'].iloc[1:].values, + # set unique trip ID without edge order to join other data + # later + "unique_trip_id": trip, + # create from and to arrival and departure time cols + "arrival_from": tmp_trip_df['arrival_time'].iloc[:-1].values, + "arrival_to": tmp_trip_df['arrival_time'].iloc[1:].values, + "departure_from": + tmp_trip_df['departure_time'].iloc[:-1].values, + "departure_to": tmp_trip_df['departure_time'].iloc[1:].values + }) + else: + edge_df = pd.DataFrame({ + "node_id_from": tmp_trip_df['unique_stop_id'].iloc[:-1].values, + "node_id_to": tmp_trip_df['unique_stop_id'].iloc[1:].values, + "weight": tmp_trip_df['timediff'].iloc[1:].values, + "unique_agency_id": + tmp_trip_df['unique_agency_id'].iloc[1:].values, + # set unique trip ID without edge order to join other data + # later + "unique_trip_id": trip + }) # Set current trip ID to edge ID column adding edge order at # end of string @@ -784,6 +811,8 @@ def _format_transit_net_edge(stop_times_df): merged_edge_df = pd.concat(merged_edge, ignore_index=True) merged_edge_df['sequence'] = merged_edge_df['sequence'].astype( int, copy=False) + # create a unique sequential edge ID + # TODO: 
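    # Note: with the concatenation below, a trip with unique_trip_id
    # 'a_citytrains' and three edges yields sequential edge IDs
    # 'a_citytrains_1', 'a_citytrains_2', 'a_citytrains_3', i.e.
    # unique_trip_id + '_' + sequence, so every edge row in the network
    # gets a distinct identifier.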
From cb401f5fd58a35949f5dfc078afb8bb32bdb3735 Mon Sep 17 00:00:00 2001
From: sablanchard
Date: Tue, 6 Apr 2021 11:48:25 -0700
Subject: [PATCH 31/38] update create_transit_net() with time pad and aware
 params

---
 urbanaccess/gtfs/network.py | 43 ++++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py
index ec3e4e1..c694dc9 100644
--- a/urbanaccess/gtfs/network.py
+++ b/urbanaccess/gtfs/network.py
@@ -23,7 +23,9 @@ def create_transit_net(
         use_existing_stop_times_int=False,
         save_processed_gtfs=False,
         save_dir=config.settings.data_folder,
-        save_filename=None):
+        save_filename=None,
+        timerange_pad=None,
+        time_aware=False):
     """
     Create a travel time weight network graph in units of
     minutes from GTFS data
@@ -69,6 +71,13 @@ def create_transit_net(
         directory to save the HDF5 file
     save_filename : str, optional
         name to save the HDF5 file as
+    timerange_pad: int, optional
+        integer indicating the number of hours to pad after the end of the
+        time interval specified in 'timerange'
+    time_aware: bool, optional
+        boolean to indicate whether the transit network should include
+        time information. If True, 'arrival_time' and 'departure_time' columns
+        from the stop_times table will be included in the transit edge table
 
     Returns
     -------
@@ -97,6 +106,10 @@ def create_transit_net(
         raise ValueError('use_existing_stop_times_int must be bool.')
     if not isinstance(save_processed_gtfs, bool):
         raise ValueError('save_processed_gtfs must be bool.')
+    if timerange_pad and not isinstance(timerange_pad, int):
+        raise ValueError('timerange_pad must be int.')
+    if not isinstance(time_aware, bool):
+        raise ValueError('time_aware must be bool.')
     if overwrite_existing_stop_times_int and use_existing_stop_times_int:
         raise ValueError('overwrite_existing_stop_times_int and '
                          'use_existing_stop_times_int cannot both be True.')
@@ -148,9 +161,8 @@ def create_transit_net(
         timerange_pad=timerange_pad)
 
     final_edge_table = _format_transit_net_edge(
-        stop_times_df=selected_interpolated_stop_times_df[
-            ['unique_trip_id', 'stop_id', 'unique_stop_id', 'timediff',
-             'stop_sequence', 'unique_agency_id', 'trip_id']])
+        stop_times_df=selected_interpolated_stop_times_df,
+        time_aware=time_aware)
 
     transit_edges = _convert_imp_time_units(
         df=final_edge_table, time_col='weight', convert_to='minutes')
@@ -761,18 +773,26 @@ def _format_transit_net_edge(stop_times_df, time_aware=False):
     log('Starting transformation process for {:,} '
         'total trips...'.format(len(stop_times_df['unique_trip_id'].unique())))
 
+    # subset to only columns needed for processing
+    cols_of_interest = ['unique_trip_id', 'stop_id', 'unique_stop_id',
+                        'timediff', 'stop_sequence', 'unique_agency_id',
+                        'trip_id', 'arrival_time', 'departure_time']
+    stop_times_df = stop_times_df[cols_of_interest]
+
     # set columns for new df for data needed by Pandana for edges
     merged_edge = []
 
     stop_times_df.sort_values(by=['unique_trip_id', 'stop_sequence'],
                               inplace=True)
 
+    if time_aware:
+        log('     time_aware is True, also adding arrival and departure '
+            'stop times to edges...')
+
     for trip, tmp_trip_df in stop_times_df.groupby(['unique_trip_id']):
         # if 'time_aware', also create from and to arrival and departure time
         # cols
         if time_aware:
-            log('     time_aware is True, adding arrival and departure '
-                'stop times to edges...')
             edge_df = pd.DataFrame({
                 "node_id_from": tmp_trip_df['unique_stop_id'].iloc[:-1].values,
                 "node_id_to": tmp_trip_df['unique_stop_id'].iloc[1:].values,
                 "weight": tmp_trip_df['timediff'].iloc[1:].values,
@@ -783,11 +803,14 @@ def _format_transit_net_edge(stop_times_df, time_aware=False):
                 # later
                 "unique_trip_id": trip,
                 # create from and to arrival and departure time cols
-                "arrival_from": tmp_trip_df['arrival_time'].iloc[:-1].values,
-                "arrival_to": tmp_trip_df['arrival_time'].iloc[1:].values,
-                "departure_from":
+                "arrival_time_from":
+                    tmp_trip_df['arrival_time'].iloc[:-1].values,
+                "arrival_time_to":
+                    tmp_trip_df['arrival_time'].iloc[1:].values,
+                "departure_time_from":
                     tmp_trip_df['departure_time'].iloc[:-1].values,
-                "departure_to": tmp_trip_df['departure_time'].iloc[1:].values
+                "departure_time_to":
+                    tmp_trip_df['departure_time'].iloc[1:].values
             })
         else:
             edge_df = pd.DataFrame({
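As a usage sketch (not part of the patch), the new parameters might be called as below. `loaded_feeds` is assumed to be a GTFS feed object already loaded with urbanaccess, and note that at this point in the series `timerange_pad` is an integer number of hours; a later patch in this series changes it to an 'HH:MM:SS' string:

```python
import urbanaccess as ua

# Assumes `loaded_feeds` was created earlier, e.g. via
# ua.gtfs.load.gtfsfeed_to_df(...); illustrative only.
transit_net = ua.gtfs.network.create_transit_net(
    loaded_feeds, day='monday',
    timerange=['07:00:00', '10:00:00'],
    timerange_pad=2,   # also select trips departing up to 2 hours after 10:00
    time_aware=True)   # keep arrival/departure times on each edge
```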
From 61c6883172d7e094f8af625553dd85adf910a5fc Mon Sep 17 00:00:00 2001
From: sablanchard
Date: Tue, 6 Apr 2021 13:24:16 -0700
Subject: [PATCH 32/38] add unit tests for new functions

---
 urbanaccess/tests/test_gtfs_network.py | 340 ++++++++++++++++++++++++-
 1 file changed, 334 insertions(+), 6 deletions(-)

diff --git a/urbanaccess/tests/test_gtfs_network.py b/urbanaccess/tests/test_gtfs_network.py
index 6e01323..df69638 100644
--- a/urbanaccess/tests/test_gtfs_network.py
+++ b/urbanaccess/tests/test_gtfs_network.py
@@ -89,6 +89,35 @@ def selected_int_stop_times_from_feed_wo_calendar_dates(
     return stop_times
 
 
+@pytest.fixture
+def selected_int_stop_times_from_feed_wo_calendar_dates_for_timepad(
+        gtfs_feed_wo_calendar_dates):
+    stop_times = gtfs_feed_wo_calendar_dates.stop_times.copy()
+    trip_ids = ['a1', 'a2', 'a3', 'a4', 'b1', 'b2', 'c1', 'c2']
+    stop_times_subset = stop_times.loc[stop_times['trip_id'].isin(trip_ids)]
+    stop_times_subset = stop_times_subset.loc[
+        ((stop_times_subset['departure_time_sec'] >= 22500) & (
+            stop_times_subset['departure_time_sec'] <= 96000)) | (
+            stop_times_subset['departure_time_sec'].isnull())]
+    stop_times_subset['unique_stop_id'] = (
+        stop_times_subset['stop_id'].str.cat(
+            stop_times_subset['unique_agency_id'].astype('str'), sep='_'))
+    stop_times_subset['departure_time_sec_interpolate'] = stop_times_subset[
+        'departure_time_sec']
+    update_dict = {2: 23100, 3: 23400, 26: 23100, 27: 23400, 38: 95100,
+                   39: 95400}
+    for index, value in update_dict.items():
+        stop_times_subset.at[index, 'departure_time_sec_interpolate'] = value
+    update_dict = {0: np.nan, 6: np.nan, 11: 600.0, 12: np.nan, 18: np.nan,
+                   24: np.nan, 30: np.nan, 35: 600.0, 36: np.nan, 42: np.nan,
+                   47: 600.0}
+    stop_times_subset['timediff'] = 300.0
+    for index, value in update_dict.items():
+        stop_times_subset.at[index, 'timediff'] = value
+
+    return stop_times_subset
+
+
 @pytest.fixture
 def selected_stops_from_feed_wo_calendar_dates(gtfs_feed_wo_calendar_dates):
     # create 'final_selected_stops' df that is used as input to test function
@@ -176,7 +205,11 @@ def stop_times_interpolated():
                            '10_citybuses', '11_citybuses', '12_citybuses',
                            '13_citybuses', '14_citybuses'],
         'stop_id': ['10', '11', '12', '13', '14'] * 4,
-        'stop_sequence': [1, 2, 3, 4, 5] * 4
+        'stop_sequence': [1, 2, 3, 4, 5] * 4,
+        'arrival_time': ['08:15:00', '08:20:00', '08:25:00', '08:30:00',
+                         '08:35:00'] * 4,
+        'departure_time': ['08:15:00', '08:20:00', '08:25:00', '08:30:00',
+                           '08:35:00'] * 4
     }
 
     index = range(20)
@@ -245,6 +278,69 @@ def expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2():
     return df
 
 
+@pytest.fixture
+def expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2_timeaware():  # noqa
+    # represents df after it has been post-processed downstream
+    data = {
+        'node_id_from': ['1_agency_a_city_a', '2_agency_a_city_a',
+                         '3_agency_a_city_a', '4_agency_a_city_a',
+                         '5_agency_a_city_a'],
+        'node_id_to': ['2_agency_a_city_a', '3_agency_a_city_a',
+                       '4_agency_a_city_a', '5_agency_a_city_a',
+                       '6_agency_a_city_a'],
+        'weight': [5.0] * 5,
+        'unique_agency_id': ['agency_a_city_a'] * 5,
+        'unique_trip_id': ['a3_agency_a_city_a'] * 5,
+        'sequence': range(1, 6),
+        'id': ['a3_agency_a_city_a_1', 'a3_agency_a_city_a_2',
+               'a3_agency_a_city_a_3', 'a3_agency_a_city_a_4',
+               'a3_agency_a_city_a_5'],
+        'route_type': [3] * 5,
+        'arrival_time_from': ['08:15:00', '08:20:00', '08:25:00', '08:30:00',
+                              '08:35:00'],
+        'arrival_time_to': ['08:20:00', '08:25:00', '08:30:00', '08:35:00',
+                            '08:40:00'],
+        'departure_time_from': ['08:15:00', '08:20:00', '08:25:00',
+                                '08:30:00', '08:35:00'],
+        'departure_time_to': ['08:20:00', '08:25:00', '08:30:00', '08:35:00',
+                              '08:40:00']
+    }
+    index = range(5)
+    df = pd.DataFrame(data, index)
+    # raw data are read as int32
+    df['sequence'] = df['sequence'].astype('int32')
+    return df
+
+
+@pytest.fixture
+def expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_1_timeaware(
+        expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2_timeaware):  # noqa
+    # represents df prior to being post-processed downstream
+    df = expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2_timeaware.copy()  # noqa
+    df.drop(columns=['route_type'], inplace=True)
+    # convert weight from min to sec to represent df prior to
+    # post-process step
+    df['weight'] = 300.0
+
+    return df
+
+
+@pytest.fixture
+def expected_final_transit_edge_from_feed_wo_calendar_dates_timeaware(
+        expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2_timeaware):  # noqa
+    data = {
+        'unique_route_id': ['10-101_agency_a_city_a'] * 5,
+        'net_type': ['transit'] * 5
+    }
+    index = range(5)
+    df = pd.DataFrame(data, index)
+    df = pd.concat(
+        [expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2_timeaware,  # noqa
+         df],
+        axis=1)
+    return df
+
+
 @pytest.fixture
 def expected_final_transit_edge_from_feed_wo_calendar_dates(
         expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2):
@@ -260,6 +356,29 @@ def expected_final_transit_edge_from_feed_wo_calendar_dates(
     return df
 
 
+@pytest.fixture
+def expected_final_transit_edge_from_feed_wo_calendar_dates_timepad(
+        expected_final_transit_edge_from_feed_wo_calendar_dates):
+    # create expected edge table which includes a trip in the reverse
+    # direction from trip a4
+    df = expected_final_transit_edge_from_feed_wo_calendar_dates.copy()
+    df['node_id_from_2'] = df['node_id_to']
+    df['node_id_to_2'] = df['node_id_from']
+    df.drop(columns=['node_id_from', 'node_id_to'], inplace=True)
+    df.rename(columns={'node_id_from_2': 'node_id_from',
+                       'node_id_to_2': 'node_id_to'}, inplace=True)
+    df.sort_values(by=['id'], inplace=True, ascending=False)
+    df['sequence'] = [1, 2, 3, 4, 5]
+    df['unique_trip_id'] = df['unique_trip_id'].str.slice_replace(0, 2, 'a4')
+    df['id'] = (
+        df['unique_trip_id'].str.cat(
+            df['sequence'].astype('str'), sep='_'))
+    df = pd.concat(
+        [expected_final_transit_edge_from_feed_wo_calendar_dates, df],
+        axis=0, ignore_index=True)
+    return df
+
+
 @pytest.fixture
 def expected_transit_node_from_feed_wo_calendar_dates():
     data = {
@@ -362,6 +481,92 @@ def test_create_transit_net_wo_calendar_dates(
     assert result_edge.equals(expected_result)
 
 
+def test_create_transit_net_wo_calendar_dates_timepad(
+        gtfs_feed_wo_calendar_dates,
+        expected_urbanaccess_network_keys,
+        expected_final_transit_edge_from_feed_wo_calendar_dates_timepad):
+    expected_result = \
+        expected_final_transit_edge_from_feed_wo_calendar_dates_timepad.copy()
+    transit_net = gtfs_network.create_transit_net(
+        gtfs_feed_wo_calendar_dates, day='monday',
+        timerange=['07:00:00', '10:00:00'],
+        calendar_dates_lookup=None,
+        overwrite_existing_stop_times_int=False,
+        use_existing_stop_times_int=False,
+        save_processed_gtfs=False,
+        save_dir=None,
+        save_filename=None,
+        timerange_pad=6,
+        time_aware=False)
+    assert isinstance(transit_net, urbanaccess_network)
+    urbanaccess_network_info = vars(transit_net)
+    expected_dfs = ['transit_nodes', 'transit_edges']
+    assert expected_urbanaccess_network_keys == sorted(list(
+        urbanaccess_network_info.keys()))
+    for key, value in urbanaccess_network_info.items():
+        assert isinstance(value, pd.core.frame.DataFrame)
+        # check that df is not empty
+        if key in expected_dfs:
+            assert value.empty is False
+
+    result_edge = transit_net.transit_edges.copy()
+    # test that output df is identical to expected df
+    result_edge = result_edge.reindex(
+        sorted(result_edge.columns), axis=1)
+    expected_result = expected_result.reindex(
+        sorted(expected_result.columns), axis=1)
+    # ensure 'sequence' is int32 for test as other OS sometimes reads this as
+    # int64 and will cause tests to fail when using equals()
+    result_edge['sequence'] = result_edge['sequence'].astype('int32')
+    expected_result['sequence'] = expected_result['sequence'].astype('int32')
+    assert result_edge.equals(expected_result)
+
+
+def test_create_transit_net_wo_calendar_dates_timeaware(
+        gtfs_feed_wo_calendar_dates,
+        expected_urbanaccess_network_keys,
+        expected_final_transit_edge_from_feed_wo_calendar_dates_timeaware):
+    expected_result = \
+        expected_final_transit_edge_from_feed_wo_calendar_dates_timeaware.copy()  # noqa
+    transit_net = gtfs_network.create_transit_net(
+        gtfs_feed_wo_calendar_dates, day='monday',
+        timerange=['07:00:00', '10:00:00'],
+        calendar_dates_lookup=None,
+        overwrite_existing_stop_times_int=False,
+        use_existing_stop_times_int=False,
+        save_processed_gtfs=False,
+        save_dir=None,
+        save_filename=None,
+        timerange_pad=None,
+        time_aware=True)
+    assert isinstance(transit_net, urbanaccess_network)
+    urbanaccess_network_info = vars(transit_net)
+    expected_dfs = ['transit_nodes', 'transit_edges']
+    assert expected_urbanaccess_network_keys == sorted(list(
+        urbanaccess_network_info.keys()))
+    for key, value in urbanaccess_network_info.items():
+        assert isinstance(value, pd.core.frame.DataFrame)
+        # check that df is not empty
+        if key in expected_dfs:
+            assert value.empty is False
+
+    result_edge = transit_net.transit_edges.copy()
+    # check if expected time aware cols are in result
+    expected_timeaware_cols = ['arrival_time_from', 'arrival_time_to',
+                               'departure_time_from', 'departure_time_to']
+    assert all(col in result_edge.columns for col in expected_timeaware_cols)
+    # test that output df is identical to expected df
+    result_edge = result_edge.reindex(
+        sorted(result_edge.columns), axis=1)
+    expected_result = expected_result.reindex(
+        sorted(expected_result.columns), axis=1)
+    # ensure 'sequence' is int32 for test as other OS sometimes reads this as
+    # int64 and will cause tests to fail when using equals()
+    result_edge['sequence'] = result_edge['sequence'].astype('int32')
+    expected_result['sequence'] = expected_result['sequence'].astype('int32')
+    assert result_edge.equals(expected_result)
+
+
 def test_create_transit_net_wo_direction_id(
         gtfs_feed_wo_calendar_dates,
         expected_urbanaccess_network_keys,
@@ -568,6 +773,34 @@ def test_create_transit_net_invalid_params(gtfs_feed_wo_calendar_dates):
         expected_error = ('overwrite_existing_stop_times_int and '
                           'use_existing_stop_times_int cannot both be True.')
         assert expected_error in str(excinfo.value)
+    with pytest.raises(ValueError) as excinfo:
+        transit_net = gtfs_network.create_transit_net(
+            gtfs_feed_wo_calendar_dates, day='monday',
+            timerange=['07:00:00', '10:00:00'],
+            calendar_dates_lookup=None,
+            overwrite_existing_stop_times_int=False,
+            use_existing_stop_times_int=False,
+            save_processed_gtfs=False,
+            save_dir=None,
+            save_filename=None,
+            timerange_pad=None,
+            time_aware=6)
+    expected_error = "time_aware must be bool."
+    assert expected_error in str(excinfo.value)
+    with pytest.raises(ValueError) as excinfo:
+        transit_net = gtfs_network.create_transit_net(
+            gtfs_feed_wo_calendar_dates, day='monday',
+            timerange=['07:00:00', '10:00:00'],
+            calendar_dates_lookup=None,
+            overwrite_existing_stop_times_int=False,
+            use_existing_stop_times_int=False,
+            save_processed_gtfs=False,
+            save_dir=None,
+            save_filename=None,
+            timerange_pad=0.4,
+            time_aware=False)
+    expected_error = "timerange_pad must be int."
+    assert expected_error in str(excinfo.value)
 
 
 def test_create_transit_net_overwrite_stop_times_int_True(
@@ -972,7 +1205,8 @@ def test_trip_schedule_selector_w_cal_dates_invalid_params_2(
     assert expected_error in str(excinfo.value)
 
 
-def test_time_selector(selected_int_stop_times_from_feed_wo_calendar_dates):
+def test_time_selector_wo_timerange_pad(
+        selected_int_stop_times_from_feed_wo_calendar_dates):
     timerange = ['08:20:00', '08:35:00']
     stop_times_int = selected_int_stop_times_from_feed_wo_calendar_dates.copy()
     result = gtfs_network._time_selector(
@@ -981,8 +1215,26 @@ def test_time_selector(selected_int_stop_times_from_feed_wo_calendar_dates):
         endtime=timerange[1])
 
     # create expected subset result
-    expected_result = stop_times_int.loc[14:15]
-    assert len(result) == 2
+    expected_result = stop_times_int.loc[13:16]
+    assert len(result) == 4
+    assert result.equals(expected_result)
+
+
+def test_time_selector_w_timerange_pad(
+        selected_int_stop_times_from_feed_wo_calendar_dates_for_timepad):
+    timerange = ['07:00:00', '10:00:00']
+    stop_times_int = \
+        selected_int_stop_times_from_feed_wo_calendar_dates_for_timepad.copy()
+    result = gtfs_network._time_selector(
+        df=stop_times_int,
+        starttime=timerange[0],
+        endtime=timerange[1],
+        timerange_pad=6)
+    # create expected subset result
+    expected_result = stop_times_int.loc[
+        stop_times_int.index.isin(
+            [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23])]
+    assert len(result) == 12
     assert result.equals(expected_result)
 
 
@@ -1002,7 +1254,8 @@ def test_time_difference(selected_int_stop_times_from_feed_wo_calendar_dates):
     assert result.equals(expected_result)
 
 
-def test_format_transit_net_edge_test_1(stop_times_interpolated):
+def test_format_transit_net_edge_test_1_timeaware_False(
+        stop_times_interpolated):
     df = gtfs_network._format_transit_net_edge(stop_times_interpolated)
 
     # length of edge df should be 16
@@ -1034,7 +1287,51 @@ def test_format_transit_net_edge_test_1(stop_times_interpolated):
         'unique_agency_id'][11]  # noqa
 
 
-def test_format_transit_net_edge_test_2(
+def test_format_transit_net_edge_test_1_timeaware_True(
+        stop_times_interpolated):
+    df = gtfs_network._format_transit_net_edge(stop_times_interpolated,
+                                               time_aware=True)
+
+    # length of edge df should be 16
+    assert len(df) == 16
+
+    # sequence ID should be numeric starting at 1 and end at 4 for each trip
+    assert df['sequence'][0] == 1 and df['sequence'][3] == 4
+
+    # edge df should have these columns and no null values
+    for col in ['node_id_from', 'node_id_to', 'weight']:
+        assert col in df.columns and df[
+            col].isnull().values.any() == False  # noqa
+
+    # there should be 4 edges per trip ID
+    for i, row in df.groupby('unique_trip_id').size().iteritems():
+        assert row == 4
+
+    # check if the values in edge df were obtained from the correct
+    # positions in the original stop times df
+    assert df['node_id_from'][0] == stop_times_interpolated[
+        'unique_stop_id'][0] and \
+        df['node_id_to'][0] == stop_times_interpolated[
+            'unique_stop_id'][1] and \
+        df['weight'][0] == stop_times_interpolated['timediff'][1]  # noqa
+
+    assert df['unique_trip_id'][8] == stop_times_interpolated[
+        'unique_trip_id'][11] and \
+        df['unique_agency_id'][8] == stop_times_interpolated[
+            'unique_agency_id'][11]  # noqa
+
+    assert df['arrival_time_from'][0] == stop_times_interpolated[
+        'arrival_time'][0] and \
+        df['arrival_time_to'][0] == stop_times_interpolated[
+            'arrival_time'][1]  # noqa
+
+    assert df['departure_time_from'][0] == stop_times_interpolated[
+        'departure_time'][0] and \
+        df['departure_time_to'][0] == stop_times_interpolated[
+            'departure_time'][1]  # noqa
+
+
+def test_format_transit_net_edge_test_2_timeaware_False(
         selected_int_stop_times_from_feed_wo_calendar_dates,
         expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_1):
     expected_result = \
@@ -1059,6 +1356,37 @@ def test_format_transit_net_edge_test_2(
     assert result.equals(expected_result)
 
 
+def test_format_transit_net_edge_timeaware_True(
+        selected_int_stop_times_from_feed_wo_calendar_dates,
+        expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_1_timeaware):  # noqa
+    expected_result = \
+        expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_1_timeaware.copy()  # noqa
+
+    # create the 'selected_interpolated_stop_times_df' that is expected
+    stop_times_int = selected_int_stop_times_from_feed_wo_calendar_dates.copy()
+    # there are no missing time values in the test data so just use
+    # 'departure_time_sec' to generate the timediff col for the test
+    stop_times_int['timediff'] = stop_times_int.groupby('unique_trip_id')[
+        'departure_time_sec'].diff()
+    result = gtfs_network._format_transit_net_edge(stop_times_int,
+                                                   time_aware=True)
+
+    # check if expected time aware cols are in result
+    expected_timeaware_cols = ['arrival_time_from', 'arrival_time_to',
+                               'departure_time_from', 'departure_time_to']
+    assert all(col in result.columns for col in expected_timeaware_cols)
+
+    # test that output df is identical to expected df
+    result = result.reindex(
+        sorted(result.columns), axis=1)
+    expected_result = expected_result.reindex(
+        sorted(expected_result.columns), axis=1)
+    # ensure 'sequence' is int32 for test as other OS sometimes reads this as
+    # int64 and will cause tests to fail when using equals()
+    result['sequence'] = result['sequence'].astype('int32')
+    assert result.equals(expected_result)
+
+
 def test_convert_imp_time_units(
         transit_edge_from_feed_wo_calendar_dates):
     # test with minutes
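To exercise only the tests added in this patch, one option — assuming pytest is installed and the repo root is the working directory — is to filter on the new test-name suffixes:

```python
import pytest

# Run only the new timepad/time-aware tests, verbosely; the path and the
# -k expression are illustrative, not prescribed by the patch.
pytest.main(['urbanaccess/tests/test_gtfs_network.py',
             '-k', 'timepad or timeaware', '-v'])
```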
From cc9a6ce2ee0b82913a4e601bbc0a4aae9107a416 Mon Sep 17 00:00:00 2001
From: sablanchard
Date: Fri, 23 Apr 2021 10:16:27 -0700
Subject: [PATCH 33/38] remove numpy travis test fix

---
 .travis.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 1e6244e..7f77eae 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,7 +21,6 @@ install:
   - conda create -n test-environment python=$TRAVIS_PYTHON_VERSION pyyaml --file requirements-dev.txt
   - source activate test-environment
   - conda info --all
-  - pip install 'numpy>=1.18'
   - pip install .
   - pip list
   - pip show urbanaccess

From 01a86a35798c0eab626093dc6a0ec08484310472 Mon Sep 17 00:00:00 2001
From: sablanchard
Date: Fri, 23 Apr 2021 12:22:32 -0700
Subject: [PATCH 34/38] keep only departure_time and arrival_time cols of
 interest

---
 urbanaccess/gtfs/network.py            | 20 ++++++++---------
 urbanaccess/tests/test_gtfs_network.py | 31 +++++++++-----------------
 2 files changed, 20 insertions(+), 31 deletions(-)

diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py
index c694dc9..5bc0d57 100644
--- a/urbanaccess/gtfs/network.py
+++ b/urbanaccess/gtfs/network.py
@@ -78,6 +78,8 @@ def create_transit_net(
         boolean to indicate whether the transit network should include
         time information. If True, 'arrival_time' and 'departure_time' columns
         from the stop_times table will be included in the transit edge table,
+        where 'departure_time' is the departure time at the node_id_from stop
+        and 'arrival_time' is the arrival time at the node_id_to stop
 
     Returns
     -------
@@ -762,6 +764,8 @@ def _format_transit_net_edge(stop_times_df, time_aware=False):
         boolean to indicate whether the transit network should include
         time information. If True, 'arrival_time' and 'departure_time' columns
         from the stop_times table will be included in the transit edge table,
+        where 'departure_time' is the departure time at the node_id_from stop
+        and 'arrival_time' is the arrival time at the node_id_to stop
 
     Returns
     -------
@@ -790,8 +794,7 @@ def _format_transit_net_edge(stop_times_df, time_aware=False):
             'stop times to edges...')
 
     for trip, tmp_trip_df in stop_times_df.groupby(['unique_trip_id']):
-        # if 'time_aware', also create from and to arrival and departure time
-        # cols
+        # if 'time_aware', also create arrival and departure time cols
         if time_aware:
             edge_df = pd.DataFrame({
                 "node_id_from": tmp_trip_df['unique_stop_id'].iloc[:-1].values,
@@ -802,15 +805,12 @@ def _format_transit_net_edge(stop_times_df, time_aware=False):
                 # later
                 "unique_trip_id": trip,
-                # create from and to arrival and departure time cols
-                "arrival_time_from":
-                    tmp_trip_df['arrival_time'].iloc[:-1].values,
-                "arrival_time_to":
-                    tmp_trip_df['arrival_time'].iloc[1:].values,
-                "departure_time_from":
+                # departure_time at node_id_from stop
+                "departure_time":
                     tmp_trip_df['departure_time'].iloc[:-1].values,
-                "departure_time_to":
-                    tmp_trip_df['departure_time'].iloc[1:].values
+                # arrival_time at node_id_to stop
+                "arrival_time":
+                    tmp_trip_df['arrival_time'].iloc[1:].values
             })
         else:
             edge_df = pd.DataFrame({
diff --git a/urbanaccess/tests/test_gtfs_network.py b/urbanaccess/tests/test_gtfs_network.py
index df69638..c4dafca 100644
--- a/urbanaccess/tests/test_gtfs_network.py
+++ b/urbanaccess/tests/test_gtfs_network.py
@@ -296,14 +296,10 @@ def expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2_timeaware():
                'a3_agency_a_city_a_3', 'a3_agency_a_city_a_4',
                'a3_agency_a_city_a_5'],
         'route_type': [3] * 5,
-        'arrival_time_from': ['08:15:00', '08:20:00', '08:25:00', '08:30:00',
-                              '08:35:00'],
-        'arrival_time_to': ['08:20:00', '08:25:00', '08:30:00', '08:35:00',
-                            '08:40:00'],
-        'departure_time_from': ['08:15:00', '08:20:00', '08:25:00',
-                                '08:30:00', '08:35:00'],
-        'departure_time_to': ['08:20:00', '08:25:00', '08:30:00', '08:35:00',
-                              '08:40:00']
+        'departure_time': ['08:15:00', '08:20:00', '08:25:00', '08:30:00',
+                           '08:35:00'],
+        'arrival_time': ['08:20:00', '08:25:00', '08:30:00', '08:35:00',
+                         '08:40:00']
     }
@@ -552,8 +548,7 @@ def test_create_transit_net_wo_calendar_dates_timeaware(
 
     result_edge = transit_net.transit_edges.copy()
     # check if expected time aware cols are in result
-    expected_timeaware_cols = ['arrival_time_from', 'arrival_time_to',
-                               'departure_time_from', 'departure_time_to']
+    expected_timeaware_cols = ['arrival_time', 'departure_time']
     assert all(col in result_edge.columns for col in expected_timeaware_cols)
     # test that output df is identical to expected df
@@ -1320,15 +1315,10 @@ def test_format_transit_net_edge_test_1_timeaware_True(
         df['unique_agency_id'][8] == stop_times_interpolated[
             'unique_agency_id'][11]  # noqa
 
-    assert df['arrival_time_from'][0] == stop_times_interpolated[
-        'arrival_time'][0] and \
-        df['arrival_time_to'][0] == stop_times_interpolated[
-            'arrival_time'][1]  # noqa
-
-    assert df['departure_time_from'][0] == stop_times_interpolated[
-        'departure_time'][0] and \
-        df['departure_time_to'][0] == stop_times_interpolated[
-            'departure_time'][1]  # noqa
+    assert df['departure_time'][0] == stop_times_interpolated[
+        'departure_time'][0]  # noqa
+    assert df['arrival_time'][0] == stop_times_interpolated[
+        'arrival_time'][1]  # noqa
@@ -1372,8 +1362,7 @@ def test_format_transit_net_edge_timeaware_True(
                                                    time_aware=True)
 
     # check if expected time aware cols are in result
-    expected_timeaware_cols = ['arrival_time_from', 'arrival_time_to',
-                               'departure_time_from', 'departure_time_to']
+    expected_timeaware_cols = ['arrival_time', 'departure_time']
    assert all(col in result.columns for col in expected_timeaware_cols)
 
     # test that output df is identical to expected df
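The new column semantics can be made concrete with a small illustrative check (made-up values, not from the test suite): 'departure_time' is when the vehicle leaves node_id_from, 'arrival_time' is when it reaches node_id_to, so their difference should match the edge weight in minutes.

```python
import pandas as pd

# One time-aware edge as produced after this patch (values are made up).
edge = {
    'node_id_from': '1_agency_a_city_a',
    'node_id_to': '2_agency_a_city_a',
    'weight': 5.0,                 # travel time in minutes
    'departure_time': '08:15:00',  # leaves node_id_from
    'arrival_time': '08:20:00',    # reaches node_id_to
}

dep = pd.to_timedelta(edge['departure_time'])
arr = pd.to_timedelta(edge['arrival_time'])
assert (arr - dep).total_seconds() / 60 == edge['weight']
```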
From 766eb72336f0988d78b49d9626eca332c43104a4 Mon Sep 17 00:00:00 2001
From: sablanchard
Date: Tue, 27 Apr 2021 11:58:59 -0700
Subject: [PATCH 35/38] fix fiona issue with travis

---
 requirements-dev.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index d578087..1cddf67 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -6,6 +6,7 @@ pycodestyle
 
 # testing demo notebook
 jupyter
+fiona<=1.8.18 # fixes issue with travis
 cartopy # requires conda
 pyepsg

From 2e530de8c4903754f6c30ec9136759a0280ebb92 Mon Sep 17 00:00:00 2001
From: sablanchard
Date: Tue, 27 Apr 2021 12:00:57 -0700
Subject: [PATCH 36/38] fix formatting

---
 requirements-dev.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index 1cddf67..14654ec 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -6,7 +6,7 @@ pycodestyle
 
 # testing demo notebook
 jupyter
-fiona<=1.8.18 # fixes issue with travis
+fiona <= 1.8.18 # fixes issue with travis
 cartopy # requires conda
 pyepsg

From 07540c0d5e75e27808f32d9ca8d15459b983d0f7 Mon Sep 17 00:00:00 2001
From: sablanchard
Date: Tue, 18 May 2021 14:35:55 -0700
Subject: [PATCH 37/38] change timerange_pad from int for a hr -> str for a 24
 hr clock to support partial hr time pad values

---
 urbanaccess/gtfs/network.py            | 47 +++++++++++++++++---------
 urbanaccess/tests/test_gtfs_network.py |  6 ++--
 2 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py
index 5bc0d57..cada8cc 100644
--- a/urbanaccess/gtfs/network.py
+++ b/urbanaccess/gtfs/network.py
@@ -2,6 +2,7 @@
 import os
 import pandas as pd
 import time
+from datetime import datetime, timedelta
 import logging as lg
 
 from urbanaccess.utils import log, df_to_hdf5, hdf5_to_df
@@ -71,9 +72,11 @@ def create_transit_net(
         directory to save the HDF5 file
     save_filename : str, optional
         name to save the HDF5 file as
-    timerange_pad: int, optional
-        integer indicating the number of hours to pad after the end of the
-        time interval specified in 'timerange'
+    timerange_pad: str, optional
+        string indicating the number of hours, minutes, and seconds to pad
+        after the end of the time interval specified in 'timerange'. Must
+        follow the format of a 24 hour clock, for example: '02:00:00' for a
+        two hour pad or '02:30:00' for a 2 hour and 30 minute pad.
     time_aware: bool, optional
         boolean to indicate whether the transit network should include
         time information. If True, 'arrival_time' and 'departure_time' columns
@@ -108,8 +111,8 @@ def create_transit_net(
         raise ValueError('use_existing_stop_times_int must be bool.')
     if not isinstance(save_processed_gtfs, bool):
         raise ValueError('save_processed_gtfs must be bool.')
-    if timerange_pad and not isinstance(timerange_pad, int):
-        raise ValueError('timerange_pad must be int.')
+    if timerange_pad and not isinstance(timerange_pad, str):
+        raise ValueError('timerange_pad must be string.')
     if not isinstance(time_aware, bool):
         raise ValueError('time_aware must be bool.')
     if overwrite_existing_stop_times_int and use_existing_stop_times_int:
@@ -685,9 +688,11 @@ def _time_selector(df, starttime, endtime, timerange_pad=None):
         24 hour clock formatted time 1
     endtime : str
         24 hour clock formatted time 2,
-    timerange_pad: int, optional
-        integer indicating the number of hours to pad after the end of the
-        time interval specified in 'endtime'
+    timerange_pad: str, optional
+        string indicating the number of hours, minutes, and seconds to pad
+        after the end of the time interval specified in 'endtime'. Must
+        follow the format of a 24 hour clock, for example: '02:00:00' for a
+        two hour pad or '02:30:00' for a 2 hour and 30 minute pad.
 
     Returns
     -------
     selected_stop_timesdf : pandas.DataFrame
@@ -715,14 +720,24 @@ def _time_selector(df, starttime, endtime, timerange_pad=None):
 
     # define timepad in seconds to include stops active after specified
     # endtime
     if timerange_pad:
-        end_h_wpad = str(end_h + timerange_pad)
-        end_h_wpad = end_h_wpad.zfill(2)
-        pad_str = '{}:{}:{}'.format(end_h_wpad, endtime[3:5], endtime[6:8])
+        # convert timerange_pad 24 hour to seconds
+        pad_h = int(str(timerange_pad[0:2]))
+        pad_m = int(str(timerange_pad[3:5]))
+        pad_s = int(str(timerange_pad[6:8]))
+        pad_sec = (pad_h * 60 * 60) + (pad_m * 60) + pad_s
+
+        # add endtime and timerange_pad to get new endtime and convert to
+        # str for informative print
+        dt1 = datetime.strptime(endtime, '%H:%M:%S')
+        dt2 = datetime.strptime(timerange_pad, '%H:%M:%S')
+        dt2_delta = timedelta(hours=dt2.hour, minutes=dt2.minute,
+                              seconds=dt2.second)
+        dt3 = dt1 + dt2_delta
+        str_t3 = datetime.strftime(dt3, '%H:%M:%S')
         log('   Additional stop times active between the specified end time: '
-            '{} with timerange_pad of: {} hour(s) (padded end time: {}) '
-            'will be selected...'.format(
-                endtime, timerange_pad, pad_str))
-        pad = int(0 if timerange_pad is None else timerange_pad) * 3600
+            '{} with timerange_pad of: {} (padded end time: {}) '
+            'will be selected...'.format(endtime, timerange_pad, str_t3))
+        pad = int(0 if timerange_pad is None else pad_sec)
 
     # create df of stops times that are within the requested range
     selected_stop_timesdf = df[(
@@ -736,7 +751,7 @@ def _time_selector(df, starttime, endtime, timerange_pad=None):
         'successfully selected {:,} records out of {:,} total records '
         '({:.2f} percent of total). '
         'Took {:,.2f} seconds.'.format(
-            starttime, endtime, pad_str, subset_df_count, df_count,
+            starttime, endtime, str_t3, subset_df_count, df_count,
             (subset_df_count / df_count) * 100,
             time.time() - start_time))
     else:
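A quick standalone sanity check of the padding arithmetic above — this mirrors the patch's logic rather than importing it, and the input values are examples:

```python
from datetime import datetime, timedelta

endtime, timerange_pad = '10:00:00', '02:30:00'  # example inputs

# 'HH:MM:SS' pad -> seconds, as the patch does
pad_h, pad_m, pad_s = (int(x) for x in timerange_pad.split(':'))
pad_sec = (pad_h * 60 * 60) + (pad_m * 60) + pad_s

# padded end time used in the informative log message
dt = datetime.strptime(endtime, '%H:%M:%S') + timedelta(seconds=pad_sec)
print(pad_sec, dt.strftime('%H:%M:%S'))  # 9000 12:30:00
```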
diff --git a/urbanaccess/tests/test_gtfs_network.py b/urbanaccess/tests/test_gtfs_network.py
index c4dafca..ee00924 100644
--- a/urbanaccess/tests/test_gtfs_network.py
+++ b/urbanaccess/tests/test_gtfs_network.py
@@ -492,7 +492,7 @@ def test_create_transit_net_wo_calendar_dates_timepad(
         save_processed_gtfs=False,
         save_dir=None,
         save_filename=None,
-        timerange_pad=6,
+        timerange_pad='06:00:00',
         time_aware=False)
     assert isinstance(transit_net, urbanaccess_network)
     urbanaccess_network_info = vars(transit_net)
@@ -794,7 +794,7 @@ def test_create_transit_net_invalid_params(gtfs_feed_wo_calendar_dates):
             save_filename=None,
             timerange_pad=0.4,
             time_aware=False)
-    expected_error = "timerange_pad must be int."
+    expected_error = "timerange_pad must be string."
     assert expected_error in str(excinfo.value)
 
 
@@ -1224,7 +1224,7 @@ def test_time_selector_w_timerange_pad(
         df=stop_times_int,
         starttime=timerange[0],
         endtime=timerange[1],
-        timerange_pad=6)
+        timerange_pad='06:00:00')
     # create expected subset result
     expected_result = stop_times_int.loc[
         stop_times_int.index.isin(

From 35a0d0b3ecdd9bde1ed679dc90e7dd603a08795d Mon Sep 17 00:00:00 2001
From: sablanchard
Date: Wed, 19 May 2021 14:18:34 -0700
Subject: [PATCH 38/38] drop py27 and 35 from travis

---
 .travis.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 1e6244e..859429d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,8 +1,6 @@
 language: python
 
 python:
-  - '2.7'
-  - '3.5'
   - '3.6'
   - '3.7'
   - '3.8'