diff --git a/requirements-dev.txt b/requirements-dev.txt index fbaf9c8..d578087 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -7,6 +7,7 @@ pycodestyle # testing demo notebook jupyter cartopy # requires conda +pyepsg # building documentation numpydoc diff --git a/urbanaccess/config.py b/urbanaccess/config.py index 421458c..cba8f25 100644 --- a/urbanaccess/config.py +++ b/urbanaccess/config.py @@ -16,11 +16,12 @@ def _format_check(settings): """ valid_keys = ['data_folder', 'logs_folder', 'log_file', - 'log_console', 'log_name', 'log_filename', 'gtfs_api'] + 'log_console', 'log_name', 'log_filename', + 'txt_encoding', 'gtfs_api'] for key in settings.keys(): if key not in valid_keys: - raise ValueError('{} not found in list of valid configuation ' + raise ValueError('{} not found in list of valid configuration ' 'keys'.format(key)) if not isinstance(key, str): raise ValueError('{} must be a string'.format(key)) @@ -42,13 +43,17 @@ class urbanaccess_config(object): logs_folder : str location to write log files log_file : bool - if true, save log output to a log file in logs_folder + if True, save log output to a log file in logs_folder log_console : bool - if true, print log output to the console + if True, print log output to the console log_name : str name of the logger log_filename : str name of the log file + txt_encoding : str + default text encoding used by the GTFS files, to be passed to + Python's open() function. Must be a valid encoding recognized by + Python codecs. 
gtfs_api : dict dictionary of the name of the GTFS API service as the key and the GTFS API server root URL as the value to pass to the GTFS loader @@ -61,6 +66,7 @@ def __init__(self, log_console=False, log_name='urbanaccess', log_filename='urbanaccess', + txt_encoding='utf-8', gtfs_api={'gtfsdataexch': ( 'http://www.gtfs-data-exchange.com/' 'api/agencies?format=csv')}): @@ -71,6 +77,7 @@ def __init__(self, self.log_console = log_console self.log_name = log_name self.log_filename = log_filename + self.txt_encoding = txt_encoding self.gtfs_api = gtfs_api @classmethod @@ -110,6 +117,7 @@ def from_yaml(cls, configdir='configs', log_name=yaml_config.get('log_name', 'urbanaccess'), log_filename=yaml_config.get('log_filename', 'urbanaccess'), + txt_encoding=yaml_config.get('txt_encoding', 'utf-8'), gtfs_api=yaml_config.get('gtfs_api', { 'gtfsdataexch': ('http://www.gtfs-data-exchange.com/' @@ -128,6 +136,7 @@ def to_dict(self): 'log_console': self.log_console, 'log_name': self.log_name, 'log_filename': self.log_filename, + 'txt_encoding': self.txt_encoding, 'gtfs_api': self.gtfs_api, } diff --git a/urbanaccess/gtfs/load.py b/urbanaccess/gtfs/load.py index 96cc286..43b88af 100644 --- a/urbanaccess/gtfs/load.py +++ b/urbanaccess/gtfs/load.py @@ -4,6 +4,7 @@ import time import pandas as pd import six +import logging as lg from urbanaccess import config from urbanaccess.utils import log @@ -20,7 +21,7 @@ def _standardize_txt(csv_rootpath=os.path.join(config.settings.data_folder, Parameters ---------- csv_rootpath : str, optional - root path where all gtfs feeds that make up a contiguous metropolitan + root path where all GTFS feeds that make up a contiguous metropolitan area are stored Returns @@ -59,6 +60,7 @@ def _txt_encoder_check(gtfsfiles_to_use, """ # UnicodeDecodeError start_time = time.time() + log('Checking GTFS text file for encoding issues...') folderlist = [foldername for foldername in os.listdir(csv_rootpath) if os.path.isdir(os.path.join(csv_rootpath, 
foldername))] @@ -74,14 +76,16 @@ def _txt_encoder_check(gtfsfiles_to_use, for textfile in textfilelist: if textfile in gtfsfiles_to_use: # Read from file - file_open = open(os.path.join(csv_rootpath, folder, textfile)) + file_path = os.path.join(csv_rootpath, folder, textfile) + file_open = open(file_path) raw = file_open.read() file_open.close() if raw.startswith(codecs.BOM_UTF8): + msg = 'Correcting encoding issue in: {}...' + log(msg.format(file_path)) raw = raw.replace(codecs.BOM_UTF8, '', 1) # Write to file - file_open = open( - os.path.join(csv_rootpath, folder, textfile), 'w') + file_open = open(file_path, 'w') file_open.write(raw) file_open.close() @@ -100,9 +104,9 @@ def _txt_header_whitespace_check(gtfsfiles_to_use, Parameters ---------- gtfsfiles_to_use : list - list of gtfs feed txt files to utilize + list of GTFS feed txt files to utilize csv_rootpath : str, optional - root path where all gtfs feeds that make up a contiguous metropolitan + root path where all GTFS feeds that make up a contiguous metropolitan area are stored Returns @@ -111,6 +115,11 @@ def _txt_header_whitespace_check(gtfsfiles_to_use, """ start_time = time.time() + txt_encoding = config.settings.txt_encoding + msg = ('Checking GTFS text file header whitespace... ' + 'Reading files using encoding: {} set in configuration.') + log(msg.format(txt_encoding)) + folderlist = [foldername for foldername in os.listdir(csv_rootpath) if os.path.isdir(os.path.join(csv_rootpath, foldername))] @@ -124,25 +133,41 @@ def _txt_header_whitespace_check(gtfsfiles_to_use, for textfile in textfilelist: if textfile in gtfsfiles_to_use: + file_path = os.path.join(csv_rootpath, folder, textfile) # Read from file - with open(os.path.join(csv_rootpath, folder, textfile)) as f: - lines = f.readlines() - lines[0] = re.sub(r'\s+', '', lines[0]) + '\n' - # Write to file try: - with open(os.path.join(csv_rootpath, folder, textfile), - 'w') as f: - f.writelines(lines) - except Exception: - log('Unable to read {}. 
Check that file is not currently' - 'being read or is not already in memory as this is ' - 'likely the cause of the error.' - ''.format(os.path.join(csv_rootpath, - folder, textfile))) - log( - 'GTFS text file header whitespace check completed. Took {:,' - '.2f} seconds'.format( - time.time() - start_time)) + if six.PY2: + with open(file_path) as f: + lines = f.readlines() + else: + # read with the encoding set in configuration + with open( + file_path, + encoding=txt_encoding) as f: + lines = f.readlines() + line_wo_whitespace = re.sub(r'\s+', '', lines[0]) + '\n' + # only write the file if there are changes to be made + if lines[0] != line_wo_whitespace: + msg = 'Removing whitespace from header(s) in: {}...' + log(msg.format(file_path)) + lines[0] = line_wo_whitespace + # Write to file + if six.PY2: + with open( + file_path, 'w') as f: + f.writelines(lines) + else: + # write with the encoding set in configuration + with open( + file_path, 'w', + encoding=txt_encoding) as f: + f.writelines(lines) + except Exception as e: + msg = 'Unable to process: {}. Exception: {}' + raise Exception(log(msg.format(file_path, e), + level=lg.ERROR)) + log('GTFS text file header whitespace check completed. 
' + 'Took {:,.2f} seconds'.format(time.time() - start_time)) def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True, @@ -156,7 +181,7 @@ def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True, Parameters ---------- gtfsfeed_path : str, optional - root path where all gtfs feeds that make up a contiguous metropolitan + root path where all GTFS feeds that make up a contiguous metropolitan area are stored validation : bool if true, the validation check on stops checking for stops outside diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index c22142e..39b6725 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -58,7 +58,7 @@ def create_transit_net(gtfsfeeds_dfs, day, DataFrame for the same time period stored in the gtfsfeeds_dfs object it will be used instead of re-calculated save_processed_gtfs : bool, optional - if true, all processed gtfs DataFrames will + if true, all processed GTFS DataFrames will be stored to disk in a hdf5 file save_dir : str, optional directory to save the hdf5 file @@ -216,7 +216,7 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, day in the GTFS calendar calendar_dates_lookup : dict, optional dictionary of the lookup column (key) as a string and corresponding - string (value) a s string or list of strings to use to subset trips + string (value) as a string or list of strings to use to subset trips using the calendar_dates DataFrame. Search will be exact. If none, then the calendar_dates DataFrame will not be used to select trips that are not in the calendar DataFrame. 
Note search will select all diff --git a/urbanaccess/tests/test_gtfs_load.py b/urbanaccess/tests/test_gtfs_load.py index e2b8ad9..f813f27 100644 --- a/urbanaccess/tests/test_gtfs_load.py +++ b/urbanaccess/tests/test_gtfs_load.py @@ -1,5 +1,10 @@ +# coding=utf-8 import pytest import pandas as pd +import os +import six +import codecs +import sys import urbanaccess.gtfs.load as gtfs_load from urbanaccess.gtfs.gtfsfeeds_dataframe import urbanaccess_gtfs_df @@ -13,6 +18,92 @@ def expected_urbanaccess_gtfs_df_keys(): return expected_keys.sort() +@pytest.fixture +def test_txt_files(tmpdir): + # test file that does not need to be fixed + do_not_fix_txt = os.path.join(tmpdir.strpath, 'agency.txt') + data = ['name,text\n', ' Circulação , áéíóúüñ¿¡ \n'] + if six.PY2: + with open(do_not_fix_txt, 'w') as f: + f.writelines(data) + else: + with open(do_not_fix_txt, 'w', encoding='utf-8') as f: + f.writelines(data) + + # test file that does need to be fixed + fix_txt = os.path.join(tmpdir.strpath, 'calendar.txt') + data = [' name , text \n', ' Circulação , áéíóúüñ¿¡ \n'] + if six.PY2: + with open(fix_txt, 'w') as f: + f.writelines(data) + else: + with open(fix_txt, 'w', encoding='utf-8') as f: + f.writelines(data) + + fix_txt_wBOM = os.path.join(tmpdir.strpath, 'calendar_dates.txt') + if six.PY2: + data = [codecs.BOM_UTF8, + ' name , text \n', + ' Circulação , áéíóúüñ¿¡ \n'] + with open(fix_txt_wBOM, 'w') as f: + f.writelines(data) + else: + data = [codecs.BOM_UTF8.decode('utf-8'), + ' name , text \n', + ' Circulação , áéíóúüñ¿¡ \n'] + with open(fix_txt_wBOM, 'w', encoding='utf-8') as f: + f.writelines(data) + + return tmpdir.strpath, do_not_fix_txt, fix_txt, fix_txt_wBOM + + +@pytest.fixture +def test_txt_files_to_use(): + gtfsfiles_to_use = ['stops.txt', 'routes.txt', 'trips.txt', + 'stop_times.txt', 'calendar.txt', + 'agency.txt', 'calendar_dates.txt'] + return gtfsfiles_to_use + + +def test_txt_standardization(test_txt_files): + root_dir, do_not_fix_txt, fix_txt, fix_txt_wBOM = 
test_txt_files + + gtfs_load._standardize_txt(csv_rootpath=root_dir) + + df = pd.read_csv(fix_txt) + assert list(df.columns) == list(df.columns.str.strip()) + + df = pd.read_csv(fix_txt_wBOM) + assert list(df.columns) == list(df.columns.str.strip()) + + +def test_txt_header_whitespace_check(test_txt_files, test_txt_files_to_use): + root_dir, do_not_fix_txt, fix_txt, fix_txt_wBOM = test_txt_files + + gtfs_load._txt_header_whitespace_check( + gtfsfiles_to_use=test_txt_files_to_use, + csv_rootpath=root_dir) + + # only check 'fix_txt' as 'fix_txt_wBOM' would need to be + # fixed by _txt_encoder_check first + df = pd.read_csv(fix_txt) + assert list(df.columns) == list(df.columns.str.strip()) + + +@pytest.mark.skipif( + sys.version_info >= (3, 0), reason="requires python < 3.0") +def test_txt_encoder_check(test_txt_files, test_txt_files_to_use): + root_dir, do_not_fix_txt, fix_txt, fix_txt_wBOM = test_txt_files + + gtfs_load._txt_encoder_check( + gtfsfiles_to_use=test_txt_files_to_use, + csv_rootpath=root_dir) + + with open(fix_txt_wBOM, 'r') as f: + raw = f.read() + assert raw.startswith(codecs.BOM_UTF8) is False + + def test_loadgtfsfeed_to_df_wo_calendar( agency_a_feed_on_disk_wo_calendar, expected_urbanaccess_gtfs_df_keys):