Skip to content

Commit

Permalink
Merge pull request #184 from kbase/develop
Browse files Browse the repository at this point in the history
Release 1.3.6
  • Loading branch information
MrCreosote authored Mar 3, 2023
2 parents 764e7a8 + d172eb6 commit f808a07
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 3 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ to run inside docker run /run_in_docker.sh

This repository includes Visual Studio Code debugger configurations for Python that mirror what is in entrypoint.sh and the testing configuration. If you open the project in VSCode, the debugger should be good to go: you can set breakpoints, run the service locally in the debugger, and run the tests locally.

# development

When releasing a new version:

* Update the release notes
* Update the `VERSION` constant in [staging_service/app.py](staging_service/app.py)

# expected command line utilities
To run locally, you will need all of these utilities installed on your system: tar, unzip, zip, gzip, bzip2, md5sum, head, tail, wc

Expand Down
6 changes: 6 additions & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
### Version 1.3.6
- Fixed a bug that caused NaN and Inf values in xSV files to be returned as JSON barewords,
  which could cause some JSON parsers to fail. They are now returned as strings.
- Changed the Excel parser to no longer treat NaN and Inf as missing values, for consistency
  with the xSV parsers.

### Version 1.3.5
- Fixed a bug that under some circumstances could cause incomplete file metadata to be returned.

Expand Down
2 changes: 1 addition & 1 deletion staging_service/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

# Send log output at DEBUG level and above to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# Route table object for the service (handler registrations are outside this hunk).
routes = web.RouteTableDef()
# Service version string; the README asks for it to be updated on every release.
# NOTE(review): this hunk shows both sides of the version bump. The later
# assignment wins, so the effective value is "1.3.6".
VERSION = "1.3.5"
VERSION = "1.3.6"

# Starts out as None; presumably populated later by code outside this hunk - confirm.
_DATATYPE_MAPPINGS = None

Expand Down
26 changes: 24 additions & 2 deletions staging_service/import_specifications/individual_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import csv
import magic
import math
import pandas
import re

Expand Down Expand Up @@ -38,6 +39,23 @@
_MAGIC_TEXT_FILES = {"text/plain", "inode/x-empty", "application/csv", "text/csv"}


# by default the excel parser treats nan and inf as missing values, which it probably shouldn't.
# See https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
# See https://kbase-jira.atlassian.net/browse/PTV-1866
_EXCEL_MISSING_VALUES = [
"",
"#N/A",
"#N/A N/A",
"#NA",
"<NA>",
"N/A",
"NA",
"NULL",
"n/a",
"null",
]


class _ParseException(Exception):
pass

Expand Down Expand Up @@ -102,11 +120,15 @@ def _normalize_xsv(val: str) -> PRIMITIVE_TYPE:
# Since csv and tsv rows are all parsed as list[str], regardless of the actual type, we
# 1) strip any whitespace that might be left around the entries
# 2) convert to numbers if the string represents a number
# 3) return None for empty strings, indicating a missing value in the csv
# 3) if the number is inf or nan, which isn't representable in JSON, we turn that right back
# to a string. See https://kbase-jira.atlassian.net/browse/PTV-1866
# 4) return None for empty strings, indicating a missing value in the csv
# If there's a non-numerical string left we return that
val = val.strip()
try:
num = float(val)
if math.isinf(num) or math.isnan(num):
return val
return int(num) if num.is_integer() else num
except ValueError:
return val if val else None
Expand Down Expand Up @@ -202,7 +224,7 @@ def _process_excel_row(

def _process_excel_tab(excel: pandas.ExcelFile, spcsrc: SpecificationSource
) -> (O[str], O[ParseResult]):
df = excel.parse(sheet_name=spcsrc.tab)
df = excel.parse(sheet_name=spcsrc.tab, na_values=_EXCEL_MISSING_VALUES, keep_default_na=False)
if df.shape[0] < 3: # might as well not error check headers in sheets with no data
return (None, None)
# at this point we know that at least 4 lines are present - expecting the data type header,
Expand Down
Binary file not shown.
72 changes: 72 additions & 0 deletions tests/import_specifications/test_individual_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,41 @@ def _xsv_parse_success(temp_dir: Path, sep: str, parser: Callable[[Path], ParseR
))


def test_xsv_parse_success_nan_inf(temp_dir: Path):
    """
    Check that the CSV and TSV parsers hand back NaN and +/-Inf cells as strings.

    If they're not converted, they will be returned from the service as barewords in
    JSON and will cause errors in some parsers, as the JSON spec does not support NaN
    and Inf.
    See https://kbase-jira.atlassian.net/browse/PTV-1866
    """
    # Same scenario for both flavours: comma-separated first, then tab-separated.
    for sep, parser in ((',', parse_csv), ('\t', parse_tsv)):
        _xsv_parse_success_nan_inf(temp_dir, sep, parser)


def _xsv_parse_success_nan_inf(temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults]):
    """
    Write an xSV file full of nan / inf spellings and check what ``parser`` returns.

    :param temp_dir: directory in which the uniquely named input file is created.
    :param sep: the column separator to write between cells.
    :param parser: the parser under test; called with the path of the written file.

    The expected result keeps every nan / inf spelling as a whitespace-stripped string,
    keeps ordinary numbers as numbers, and maps the single empty cell to None.
    """
    s = sep  # short alias so the f-string rows below stay readable
    input_ = temp_dir / str(uuid.uuid4())
    rows = [
        # data type header
        f"Data type: some_nan_type; Columns: 4; Version: 1{s}{s}{s}\n",
        # this row supplies the keys of the expected records (after stripping)
        f"spec1{s} -Inf{s} nan {s} inf\n",
        # not present in the expected output - presumably a display-name header row
        f"Spec 1{s} inf{s} Spec 3{s} -inf\n",
        # data rows
        f"inf {s} val2 {s} NaN {s} 3.2\n",
        f"Inf {s} val4{s} -inf{s} 8.9\n",
        f"val5 {s}-Inf{s}{s} nan\n",
    ]
    with open(input_, "w") as test_file:
        test_file.write("".join(rows))

    res = parser(input_)

    expected_records = (
        frozendict({"spec1": "inf", "-Inf": "val2", "nan": "NaN", "inf": 3.2}),
        frozendict({"spec1": "Inf", "-Inf": "val4", "nan": "-inf", "inf": 8.9}),
        frozendict({"spec1": "val5", "-Inf": "-Inf", "nan": None, "inf": "nan"}),
    )
    expected = ParseResults(frozendict(
        {"some_nan_type": ParseResult(SpecificationSource(input_), expected_records)}
    ))
    assert res == expected


def test_xsv_parse_success_with_numeric_headers(temp_dir: Path):
"""
Not a use case we expect but good to check numeric headers don't cause an unexpected
Expand Down Expand Up @@ -370,6 +405,43 @@ def test_excel_parse_success():
}))


def test_excel_parse_success_nan_inf():
    """
    Parse an Excel file containing nan, inf, and missing values.

    nan and inf spellings should come back as strings, to maintain consistency with
    the CSV parser and avoid making JSON parsers choke, while missing values should
    come back as None.
    See https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
    See https://kbase-jira.atlassian.net/browse/PTV-1866
    """

    ex = _get_test_file("test_nan_inf.xlsx")

    res = parse_excel(ex)

    # Expected "header2" value for each data row, in order; "header1" is simply the
    # 1-based row counter, so the records are generated rather than spelled out.
    header2_values = [
        None,
        None,
        None,
        "-1.#IND",
        "-1.#QNAN",
        "-NaN",
        "-nan",
        "1.#IND",
        "1.#QNAN",
        None,
        None,
        None,
        None,
        "NaN",
        None,
        "nan",
        None,
        None,
        "some stuff",
    ]
    expected_records = tuple(
        frozendict({"header1": row_number, "header2": value})
        for row_number, value in enumerate(header2_values, start=1)
    )

    assert res == ParseResults(frozendict({
        "nan_type": ParseResult(SpecificationSource(ex, "tab1"), expected_records),
    }))


def _excel_parse_fail(
test_file: str, message: str = None, errors: list[Error] = None, print_res=False
):
Expand Down

0 comments on commit f808a07

Please sign in to comment.