Merge pull request #184 from kbase/develop

Release 1.3.6
kbase · Mar 3, 2023 · f808a07 · f808a07
2 parents 764e7a8 + d172eb6
commit f808a07
Show file tree

Hide file tree

Showing 6 changed files with 110 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -34,6 +34,13 @@ to run inside docker run /run_in_docker.sh
 
 Included configurations for the Visual Studio Code debugger for python that mirror what is in the entrypoint.sh and testing configuration to run locally in the debugger, set breakpoints and if you open the project in VSCode the debugger should be good to go. The provided configurations can run locally and run tests locally
 
+# development
+
+When releasing a new version:
+
+* Update the release notes
+* Update the version in [staging_service/app.py](staging_service/app.py).VERSION
+
 # expected command line utilities
 to run locally you will need all of these utils on your system: tar, unzip, zip, gzip, bzip2, md5sum, head, tail, wc
 

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
@@ -1,3 +1,9 @@
+### Version 1.3.6
+- Fixed a bug that would cause NaN and Inf values in xSV to be returned as JSON barewords,
+  which could cause some JSON parsers to fail. They are now returned as strings.
+- Changed the Excel parser to not consider NaN and Inf as missing values to maintain consistency
+  with the xSV parsers
+
 ### Version 1.3.5
 - Fixed a bug that under some circumstances could cause incomplete file metadata to be returned.
 

diff --git a/staging_service/app.py b/staging_service/app.py
@@ -34,7 +34,7 @@
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 routes = web.RouteTableDef()
-VERSION = "1.3.5"
+VERSION = "1.3.6"
 
 _DATATYPE_MAPPINGS = None
 

diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py
@@ -4,6 +4,7 @@
 
 import csv
 import magic
+import math
 import pandas
 import re
 
@@ -38,6 +39,23 @@
 _MAGIC_TEXT_FILES = {"text/plain", "inode/x-empty", "application/csv", "text/csv"}
 
 
+# by default the excel parser treats nan and inf as missing values, which it probably shouldn't.
+# See https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
+# See https://kbase-jira.atlassian.net/browse/PTV-1866
+_EXCEL_MISSING_VALUES = [
+    "",
+    "#N/A",
+    "#N/A N/A",
+    "#NA",
+    "<NA>",
+    "N/A",
+    "NA",
+    "NULL",
+    "n/a",
+    "null",
+]
+
+
 class _ParseException(Exception):
     pass
 
@@ -102,11 +120,15 @@ def _normalize_xsv(val: str) -> PRIMITIVE_TYPE:
     # Since csv and tsv rows are all parsed as list[str], regardless of the actual type, we
     # 1) strip any whitespace that might be left around the entries
     # 2) convert to numbers if the string represents a number
-    # 3) return None for empty strings, indicating a missing value in the csv
+    # 3) if the number is inf or nan, which isn't representable in JSON, we turn that right back
+    #    to a string. See https://kbase-jira.atlassian.net/browse/PTV-1866
+    # 4) return None for empty strings, indicating a missing value in the csv
     # If there's a non-numerical string left we return that
     val = val.strip()
     try:
         num = float(val)
+        if math.isinf(num) or math.isnan(num):
+            return val
         return int(num) if num.is_integer() else num
     except ValueError:
         return val if val else None
@@ -202,7 +224,7 @@ def _process_excel_row(
 
 def _process_excel_tab(excel: pandas.ExcelFile, spcsrc: SpecificationSource
 ) -> (O[str], O[ParseResult]):
-    df = excel.parse(sheet_name=spcsrc.tab)
+    df = excel.parse(sheet_name=spcsrc.tab, na_values=_EXCEL_MISSING_VALUES, keep_default_na=False)
     if df.shape[0] < 3:  # might as well not error check headers in sheets with no data
         return (None, None)
     # at this point we know that at least 4 lines are present - expecting the data type header,

diff --git a/tests/import_specifications/test_data/test_nan_inf.xlsx b/tests/import_specifications/test_data/test_nan_inf.xlsx
diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py
@@ -74,6 +74,41 @@ def _xsv_parse_success(temp_dir: Path, sep: str, parser: Callable[[Path], ParseR
     ))
 
 
+def test_xsv_parse_success_nan_inf(temp_dir: Path):
+    # Test that NaN and +/-Inf values are converted to strings. If they're not converted,
+    # they will be returned from the service as barewords in JSON and will cause errors in
+    # some parsers as the JSON spec does not support NaN and Inf (which it should but...)
+    # See https://kbase-jira.atlassian.net/browse/PTV-1866
+    _xsv_parse_success_nan_inf(temp_dir, ',', parse_csv)
+    _xsv_parse_success_nan_inf(temp_dir, '\t', parse_tsv)
+
+
+def _xsv_parse_success_nan_inf(temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults]):
+    s = sep
+    input_ = temp_dir / str(uuid.uuid4())
+    with open(input_, "w") as test_file:
+        test_file.writelines([
+            f"Data type: some_nan_type; Columns: 4; Version: 1{s}{s}{s}\n",
+            f"spec1{s} -Inf{s}   nan   {s} inf\n",
+            f"Spec 1{s} inf{s} Spec 3{s} -inf\n",
+            f"inf {s}   val2   {s}    NaN     {s} 3.2\n",
+            f"Inf {s} val4{s} -inf{s} 8.9\n",
+            f"val5 {s}-Inf{s}{s} nan\n",
+        ])
+
+    res = parser(input_)
+
+    assert res == ParseResults(frozendict(
+        {"some_nan_type": ParseResult(SpecificationSource(input_),
+            tuple([
+                frozendict({"spec1": "inf", "-Inf": "val2", "nan": "NaN", "inf": 3.2}),
+                frozendict({"spec1": "Inf", "-Inf": "val4", "nan": "-inf", "inf": 8.9}),
+                frozendict({"spec1": "val5", "-Inf": "-Inf", "nan": None, "inf": "nan"}),
+            ])
+        )}
+    ))
+
+
 def test_xsv_parse_success_with_numeric_headers(temp_dir: Path):
     """
     Not a use case we expect but good to check numeric headers don't cause an unexpected
@@ -370,6 +405,43 @@ def test_excel_parse_success():
         }))
 
 
+def test_excel_parse_success_nan_inf():
+    """
+    Tests file with nan, inf, and missing values. nan and inf should be treated as strings
+    to maintain consistency with the CSV parser and avoid making JSON parsers choke.
+    See https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
+    See https://kbase-jira.atlassian.net/browse/PTV-1866
+    """
+
+    ex = _get_test_file("test_nan_inf.xlsx")
+
+    res = parse_excel(ex)
+
+    assert res == ParseResults(frozendict({
+        "nan_type": ParseResult(SpecificationSource(ex, "tab1"), (
+            frozendict({"header1": 1, "header2": None}),
+            frozendict({"header1": 2, "header2": None}),
+            frozendict({"header1": 3, "header2": None}),
+            frozendict({"header1": 4, "header2": "-1.#IND"}),
+            frozendict({"header1": 5, "header2": "-1.#QNAN"}),
+            frozendict({"header1": 6, "header2": "-NaN"}),
+            frozendict({"header1": 7, "header2": "-nan"}),
+            frozendict({"header1": 8, "header2": "1.#IND"}),
+            frozendict({"header1": 9, "header2": "1.#QNAN"}),
+            frozendict({"header1": 10, "header2": None}),
+            frozendict({"header1": 11, "header2": None}),
+            frozendict({"header1": 12, "header2": None}),
+            frozendict({"header1": 13, "header2": None}),
+            frozendict({"header1": 14, "header2": "NaN"}),
+            frozendict({"header1": 15, "header2": None}),
+            frozendict({"header1": 16, "header2": "nan"}),
+            frozendict({"header1": 17, "header2": None}),
+            frozendict({"header1": 18, "header2": None}),
+            frozendict({"header1": 19, "header2": "some stuff"}),
+        )),
+    }))
+
+
 def _excel_parse_fail(
     test_file: str, message: str = None, errors: list[Error] = None, print_res=False
 ):