Skip to content

Commit

Permalink
DATAUP-727: Fix an import spec parse bug re trailing separators (#168)
Browse files Browse the repository at this point in the history
* Fix a import spec parse bug re trailing separators

If an xSV file is opened and saved by Excel, it'll add trailing separators to
the end of the first header line. This fix ignores those separators.

* Bump version, add release notes

* Fix unused import

* Provide more info if a file isn't an expected type

* Add csv file type to list of allowed files

Not quite sure why tests were passing everywhere up till now, or why
tests pass locally - maybe a `magic` update

* Remove redundant file type check

doh

* Fix tests on mac

... maybe? Tests still don't pass on Boris's machine, this might fix it
  • Loading branch information
MrCreosote authored Mar 11, 2022
1 parent f0777a4 commit 12e6f04
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 32 deletions.
5 changes: 5 additions & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
### Version 1.3.4
- Fixed a bug in the csv/tsv bulk specification parser that would case a failure if the
first header of a file had trailing separators. This occurs if a csv/tsv file is opened and
saved by Excel.

### Version 1.3.3
- Fixed a bug in the csv/tsv bulk specification parser that would include an empty entry for
each empty line in the file.
Expand Down
2 changes: 1 addition & 1 deletion staging_service/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
routes = web.RouteTableDef()
VERSION = "1.3.3"
VERSION = "1.3.4"

_DATATYPE_MAPPINGS = None

Expand Down
43 changes: 14 additions & 29 deletions staging_service/import_specifications/individual_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26
from frozendict.core import frozendict
from pathlib import Path
from typing import TextIO, Optional as O, Union, Any
from typing import Optional as O, Union, Any

from staging_service.import_specifications.file_parser import (
PRIMITIVE_TYPE,
Expand All @@ -35,7 +35,7 @@
_HEADER_REGEX = re.compile(f"{_DATA_TYPE} (\\w+){_HEADER_SEP} "
+ f"{_COLUMN_STR} (\\d+){_HEADER_SEP} {_VERSION_STR} (\\d+)")

_MAGIC_TEXT_FILES = {"text/plain", "inode/x-empty"}
_MAGIC_TEXT_FILES = {"text/plain", "inode/x-empty", "application/csv", "text/csv"}


class _ParseException(Exception):
Expand Down Expand Up @@ -63,26 +63,18 @@ def _parse_header(header: str, spec_source: SpecificationSource, maximum_version
return match[1], int(match[2])


def _required_next(
input_: Union[TextIO, Any], # Any really means a csv reader object
spec_source: SpecificationSource,
error: str
) -> Union[str, list[str]]:
# returns a string for a TextIO input or a list for a Reader input
try:
return next(input_)
except StopIteration:
raise _ParseException(Error(ErrorType.PARSE_FAIL, error, spec_source))

def _csv_next(
input_: Union[TextIO, Any], # Any really means a csv reader object
input_: Any, # Any really means a csv reader object
line_number: int,
expected_line_count: int,
expected_line_count: Union[None, int], # None = skip columns check
spec_source: SpecificationSource,
error: str
) -> list[str]:
line = _required_next(input_, spec_source, error)
if len(line) != expected_line_count:
try:
line = next(input_)
except StopIteration:
raise _ParseException(Error(ErrorType.PARSE_FAIL, error, spec_source))
if expected_line_count and len(line) != expected_line_count:
raise _ParseException(Error(
ErrorType.INCORRECT_COLUMN_COUNT,
f"Incorrect number of items in line {line_number}, "
Expand All @@ -91,15 +83,6 @@ def _csv_next(
return line


def _get_datatype(input_: TextIO, spec_source: SpecificationSource, maximum_version: int
) -> tuple[str, int]:
# return is (data type, column count)
return _parse_header(
_required_next(input_, spec_source, "Missing data type / version header").strip(),
spec_source,
maximum_version)


def _error(error: Error) -> ParseResults:
return ParseResults(errors = tuple([error]))

Expand Down Expand Up @@ -155,11 +138,13 @@ def _normalize_headers(
def _parse_xsv(path: Path, sep: str) -> ParseResults:
spcsrc = SpecificationSource(path)
try:
if magic.from_file(str(path), mime=True) not in _MAGIC_TEXT_FILES:
return _error(Error(ErrorType.PARSE_FAIL, "Not a text file", spcsrc))
filetype = magic.from_file(str(path), mime=True)
if filetype not in _MAGIC_TEXT_FILES:
return _error(Error(ErrorType.PARSE_FAIL, "Not a text file: " + filetype, spcsrc))
with open(path, newline='') as input_:
datatype, columns = _get_datatype(input_, spcsrc, _VERSION)
rdr = csv.reader(input_, delimiter=sep) # let parser handle quoting
dthd = _csv_next(rdr, 1, None, spcsrc, "Missing data type / version header")
datatype, columns = _parse_header(dthd[0], spcsrc, _VERSION)
hd1 = _csv_next(rdr, 2, columns, spcsrc, "Missing 2nd header line")
param_ids = _normalize_headers(hd1, 2, spcsrc)
_csv_next(rdr, 3, columns, spcsrc, "Missing 3rd header line")
Expand Down
10 changes: 8 additions & 2 deletions tests/import_specifications/test_individual_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ def temp_dir() -> Generator[Path, None, None]:


def test_xsv_parse_success(temp_dir: Path):
# Tests a special case where if an xSV file is opened by Excel and then resaved,
# Excel will add separators to the end of the 1st header line. This previously caused
# the parse to fail.
_xsv_parse_success(temp_dir, ',', parse_csv)
_xsv_parse_success(temp_dir, '\t', parse_tsv)

Expand All @@ -48,7 +51,7 @@ def _xsv_parse_success(temp_dir: Path, sep: str, parser: Callable[[Path], ParseR
input_ = temp_dir / str(uuid.uuid4())
with open(input_, "w") as test_file:
test_file.writelines([
"Data type: some_type; Columns: 4; Version: 1\n",
f"Data type: some_type; Columns: 4; Version: 1{s}{s}{s}\n",
f"spec1{s} spec2{s} spec3 {s} spec4\n", # test trimming
f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n",
f"val1 {s} val2 {s} 7 {s} 3.2\n", # test trimming
Expand Down Expand Up @@ -166,7 +169,10 @@ def test_xsv_parse_fail_binary_file(temp_dir: Path):
res = parse_csv(test_file)

assert res == ParseResults(errors=tuple([
Error(ErrorType.PARSE_FAIL, "Not a text file", source_1=SpecificationSource(test_file))
Error(
ErrorType.PARSE_FAIL,
"Not a text file: application/vnd.ms-excel",
source_1=SpecificationSource(test_file))
]))


Expand Down

0 comments on commit 12e6f04

Please sign in to comment.