From 12e6f04a4828bf0eecd6f6c752475fe3da7b8b34 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Fri, 11 Mar 2022 08:36:19 -0800 Subject: [PATCH] DATAUP-727: Fix an import spec parse bug re trailing separators (#168) * Fix a import spec parse bug re trailing separators If an xSV file is opened and saved by Excel, it'll add trailing separators to the end of the first header line. This fix ignores those separators. * Bump version, add release notes * Fix unused import * Provide more info if a file isn't an expected type * Add csv file type to list of allowed files Not quite sure why tests were passing everywhere up till now, or why tests pass locally - maybe a `magic` update * Remove redundant file type check doh * Fix tests on mac ... maybe? Tests still don't pass on Boris's machine, this might fix it --- RELEASE_NOTES.md | 5 +++ staging_service/app.py | 2 +- .../individual_parsers.py | 43 ++++++------------- .../test_individual_parsers.py | 10 ++++- 4 files changed, 28 insertions(+), 32 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 9eeab3d1..d5895652 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,3 +1,8 @@ +### Version 1.3.4 +- Fixed a bug in the csv/tsv bulk specification parser that would case a failure if the + first header of a file had trailing separators. This occurs if a csv/tsv file is opened and + saved by Excel. + ### Version 1.3.3 - Fixed a bug in the csv/tsv bulk specification parser that would include an empty entry for each empty line in the file. diff --git a/staging_service/app.py b/staging_service/app.py index a4472da5..e082d0f1 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -34,7 +34,7 @@ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) routes = web.RouteTableDef() -VERSION = "1.3.3" +VERSION = "1.3.4" _DATATYPE_MAPPINGS = None diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index bb15d083..47836a6c 100644 --- a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -10,7 +10,7 @@ # TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26 from frozendict.core import frozendict from pathlib import Path -from typing import TextIO, Optional as O, Union, Any +from typing import Optional as O, Union, Any from staging_service.import_specifications.file_parser import ( PRIMITIVE_TYPE, @@ -35,7 +35,7 @@ _HEADER_REGEX = re.compile(f"{_DATA_TYPE} (\\w+){_HEADER_SEP} " + f"{_COLUMN_STR} (\\d+){_HEADER_SEP} {_VERSION_STR} (\\d+)") -_MAGIC_TEXT_FILES = {"text/plain", "inode/x-empty"} +_MAGIC_TEXT_FILES = {"text/plain", "inode/x-empty", "application/csv", "text/csv"} class _ParseException(Exception): @@ -63,26 +63,18 @@ def _parse_header(header: str, spec_source: SpecificationSource, maximum_version return match[1], int(match[2]) -def _required_next( - input_: Union[TextIO, Any], # Any really means a csv reader object - spec_source: SpecificationSource, - error: str -) -> Union[str, list[str]]: - # returns a string for a TextIO input or a list for a Reader input - try: - return next(input_) - except StopIteration: - raise _ParseException(Error(ErrorType.PARSE_FAIL, error, spec_source)) - def _csv_next( - input_: Union[TextIO, Any], # Any really means a csv reader object + input_: Any, # Any really means a csv reader object line_number: int, - expected_line_count: int, + expected_line_count: Union[None, int], # None = skip columns check spec_source: SpecificationSource, error: str ) -> list[str]: - line = _required_next(input_, spec_source, error) - if len(line) != expected_line_count: + try: + line = next(input_) + except StopIteration: + raise _ParseException(Error(ErrorType.PARSE_FAIL, error, spec_source)) + if expected_line_count and len(line) != expected_line_count: raise _ParseException(Error( ErrorType.INCORRECT_COLUMN_COUNT, f"Incorrect number of items in line {line_number}, " @@ -91,15 +83,6 @@ def _csv_next( return line -def _get_datatype(input_: TextIO, spec_source: SpecificationSource, maximum_version: int -) -> tuple[str, int]: - # return is (data type, column count) - return _parse_header( - _required_next(input_, spec_source, "Missing data type / version header").strip(), - spec_source, - maximum_version) - - def _error(error: Error) -> ParseResults: return ParseResults(errors = tuple([error])) @@ -155,11 +138,13 @@ def _normalize_headers( def _parse_xsv(path: Path, sep: str) -> ParseResults: spcsrc = SpecificationSource(path) try: - if magic.from_file(str(path), mime=True) not in _MAGIC_TEXT_FILES: - return _error(Error(ErrorType.PARSE_FAIL, "Not a text file", spcsrc)) + filetype = magic.from_file(str(path), mime=True) + if filetype not in _MAGIC_TEXT_FILES: + return _error(Error(ErrorType.PARSE_FAIL, "Not a text file: " + filetype, spcsrc)) with open(path, newline='') as input_: - datatype, columns = _get_datatype(input_, spcsrc, _VERSION) rdr = csv.reader(input_, delimiter=sep) # let parser handle quoting + dthd = _csv_next(rdr, 1, None, spcsrc, "Missing data type / version header") + datatype, columns = _parse_header(dthd[0], spcsrc, _VERSION) hd1 = _csv_next(rdr, 2, columns, spcsrc, "Missing 2nd header line") param_ids = _normalize_headers(hd1, 2, spcsrc) _csv_next(rdr, 3, columns, spcsrc, "Missing 3rd header line") diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py index eb3d6c1a..f7caace9 100644 --- a/tests/import_specifications/test_individual_parsers.py +++ b/tests/import_specifications/test_individual_parsers.py @@ -39,6 +39,9 @@ def temp_dir() -> Generator[Path, None, None]: def test_xsv_parse_success(temp_dir: Path): + # Tests a special case where if an xSV file is opened by Excel and then resaved, + # Excel will add separators to the end of the 1st header line. This previously caused + # the parse to fail. _xsv_parse_success(temp_dir, ',', parse_csv) _xsv_parse_success(temp_dir, '\t', parse_tsv) @@ -48,7 +51,7 @@ def _xsv_parse_success(temp_dir: Path, sep: str, parser: Callable[[Path], ParseR input_ = temp_dir / str(uuid.uuid4()) with open(input_, "w") as test_file: test_file.writelines([ - "Data type: some_type; Columns: 4; Version: 1\n", + f"Data type: some_type; Columns: 4; Version: 1{s}{s}{s}\n", f"spec1{s} spec2{s} spec3 {s} spec4\n", # test trimming f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", f"val1 {s} val2 {s} 7 {s} 3.2\n", # test trimming @@ -166,7 +169,10 @@ def test_xsv_parse_fail_binary_file(temp_dir: Path): res = parse_csv(test_file) assert res == ParseResults(errors=tuple([ - Error(ErrorType.PARSE_FAIL, "Not a text file", source_1=SpecificationSource(test_file)) + Error( + ErrorType.PARSE_FAIL, + "Not a text file: application/vnd.ms-excel", + source_1=SpecificationSource(test_file)) ]))