diff --git a/README.md b/README.md index fbc9b71e..c85dc418 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,13 @@ to run inside docker run /run_in_docker.sh Included configurations for the Visual Studio Code debugger for python that mirror what is in the entrypoint.sh and testing configuration to run locally in the debugger, set breakpoints and if you open the project in VSCode the debugger should be good to go. The provided configurations can run locally and run tests locally +# development + +When releasing a new version: + +* Update the release notes +* Update the version in [staging_service/app.py](staging_service/app.py).VERSION + # expected command line utilities to run locally you will need all of these utils on your system: tar, unzip, zip, gzip, bzip2, md5sum, head, tail, wc diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 19747130..de917173 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,3 +1,9 @@ +### Version 1.3.6 +- Fixed a bug that would cause NaN and Inf values in xSV to be returned as JSON barewords, + which could cause some JSON parsers to fail. They are now returned as strings. +- Changed the Excel parser to not consider NaN and Inf as missing values to maintain consistency + with the xSV parsers + ### Version 1.3.5 - Fixed a bug that under some circumstances could cause incomplete file metadata to be returned. diff --git a/staging_service/app.py b/staging_service/app.py index 25dda8eb..ac04fc16 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -34,7 +34,7 @@ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) routes = web.RouteTableDef() -VERSION = "1.3.5" +VERSION = "1.3.6" _DATATYPE_MAPPINGS = None diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index 47836a6c..61bf84b5 100644 --- a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -4,6 +4,7 @@ import csv import magic +import math import pandas import re @@ -38,6 +39,23 @@ _MAGIC_TEXT_FILES = {"text/plain", "inode/x-empty", "application/csv", "text/csv"} +# by default the excel parser treats nan and inf as missing values, which it probably shouldn't. +# See https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html +# See https://kbase-jira.atlassian.net/browse/PTV-1866 +_EXCEL_MISSING_VALUES = [ + "", + "#N/A", + "#N/A N/A", + "#NA", + "", + "N/A", + "NA", + "NULL", + "n/a", + "null", +] + + class _ParseException(Exception): pass @@ -102,11 +120,15 @@ def _normalize_xsv(val: str) -> PRIMITIVE_TYPE: # Since csv and tsv rows are all parsed as list[str], regardless of the actual type, we # 1) strip any whitespace that might be left around the entries # 2) convert to numbers if the string represents a number - # 3) return None for empty strings, indicating a missing value in the csv + # 3) if the number is inf or nan, which isn't representable in JSON, we turn that right back + # to a string. See https://kbase-jira.atlassian.net/browse/PTV-1866 + # 4) return None for empty strings, indicating a missing value in the csv # If there's a non-numerical string left we return that val = val.strip() try: num = float(val) + if math.isinf(num) or math.isnan(num): + return val return int(num) if num.is_integer() else num except ValueError: return val if val else None @@ -202,7 +224,7 @@ def _process_excel_row( def _process_excel_tab(excel: pandas.ExcelFile, spcsrc: SpecificationSource ) -> (O[str], O[ParseResult]): - df = excel.parse(sheet_name=spcsrc.tab) + df = excel.parse(sheet_name=spcsrc.tab, na_values=_EXCEL_MISSING_VALUES, keep_default_na=False) if df.shape[0] < 3: # might as well not error check headers in sheets with no data return (None, None) # at this point we know that at least 4 lines are present - expecting the data type header, diff --git a/tests/import_specifications/test_data/test_nan_inf.xlsx b/tests/import_specifications/test_data/test_nan_inf.xlsx new file mode 100644 index 00000000..45ee8c63 Binary files /dev/null and b/tests/import_specifications/test_data/test_nan_inf.xlsx differ diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py index f7caace9..fe73a309 100644 --- a/tests/import_specifications/test_individual_parsers.py +++ b/tests/import_specifications/test_individual_parsers.py @@ -74,6 +74,41 @@ def _xsv_parse_success(temp_dir: Path, sep: str, parser: Callable[[Path], ParseR )) +def test_xsv_parse_success_nan_inf(temp_dir: Path): + # Test that NaN and +/-Inf values are converted to strings. If they're not converted, + # they will be returned from the service as barewords in JSON and will cause errors in + # some parsers as the JSON spec does not support NaN and Inf (which it should but...) + # See https://kbase-jira.atlassian.net/browse/PTV-1866 + _xsv_parse_success_nan_inf(temp_dir, ',', parse_csv) + _xsv_parse_success_nan_inf(temp_dir, '\t', parse_tsv) + + +def _xsv_parse_success_nan_inf(temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults]): + s = sep + input_ = temp_dir / str(uuid.uuid4()) + with open(input_, "w") as test_file: + test_file.writelines([ + f"Data type: some_nan_type; Columns: 4; Version: 1{s}{s}{s}\n", + f"spec1{s} -Inf{s} nan {s} inf\n", + f"Spec 1{s} inf{s} Spec 3{s} -inf\n", + f"inf {s} val2 {s} NaN {s} 3.2\n", + f"Inf {s} val4{s} -inf{s} 8.9\n", + f"val5 {s}-Inf{s}{s} nan\n", + ]) + + res = parser(input_) + + assert res == ParseResults(frozendict( + {"some_nan_type": ParseResult(SpecificationSource(input_), + tuple([ + frozendict({"spec1": "inf", "-Inf": "val2", "nan": "NaN", "inf": 3.2}), + frozendict({"spec1": "Inf", "-Inf": "val4", "nan": "-inf", "inf": 8.9}), + frozendict({"spec1": "val5", "-Inf": "-Inf", "nan": None, "inf": "nan"}), + ]) + )} + )) + + def test_xsv_parse_success_with_numeric_headers(temp_dir: Path): """ Not a use case we expect but good to check numeric headers don't cause an unexpected @@ -370,6 +405,43 @@ def test_excel_parse_success(): })) +def test_excel_parse_success_nan_inf(): + """ + Tests file with nan, inf, and missing values. nan and inf should be treated as strings + to maintain consistency with the CSV parser and avoid making JSON parsers choke. + See https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html + See https://kbase-jira.atlassian.net/browse/PTV-1866 + """ + + ex = _get_test_file("test_nan_inf.xlsx") + + res = parse_excel(ex) + + assert res == ParseResults(frozendict({ + "nan_type": ParseResult(SpecificationSource(ex, "tab1"), ( + frozendict({"header1": 1, "header2": None}), + frozendict({"header1": 2, "header2": None}), + frozendict({"header1": 3, "header2": None}), + frozendict({"header1": 4, "header2": "-1.#IND"}), + frozendict({"header1": 5, "header2": "-1.#QNAN"}), + frozendict({"header1": 6, "header2": "-NaN"}), + frozendict({"header1": 7, "header2": "-nan"}), + frozendict({"header1": 8, "header2": "1.#IND"}), + frozendict({"header1": 9, "header2": "1.#QNAN"}), + frozendict({"header1": 10, "header2": None}), + frozendict({"header1": 11, "header2": None}), + frozendict({"header1": 12, "header2": None}), + frozendict({"header1": 13, "header2": None}), + frozendict({"header1": 14, "header2": "NaN"}), + frozendict({"header1": 15, "header2": None}), + frozendict({"header1": 16, "header2": "nan"}), + frozendict({"header1": 17, "header2": None}), + frozendict({"header1": 18, "header2": None}), + frozendict({"header1": 19, "header2": "some stuff"}), + )), + })) + + def _excel_parse_fail( test_file: str, message: str = None, errors: list[Error] = None, print_res=False ):