Implement on_bad_lines in json reader (rapidsai#15834)

Fixes: rapidsai#15559 This PR implements `on_bad_lines` in json reader. When `on_bad_lines="recover"`, bad lines are replaced by `<NA>` values. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: rapidsai#15834
madsbk · May 24, 2024 · d756c37 · d756c37
1 parent 81cadb6
commit d756c37
Show file tree

Hide file tree

Showing 5 changed files with 67 additions and 9 deletions.
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
@@ -24,6 +24,7 @@ from cudf._lib.io.utils cimport (
 from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
 from cudf._lib.pylibcudf.libcudf.io.json cimport (
     json_reader_options,
+    json_recovery_mode_t,
     json_writer_options,
     read_json as libcudf_read_json,
     schema_element,
@@ -42,14 +43,24 @@ from cudf._lib.types cimport dtype_to_data_type
 from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
 
 
+cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines):
+    if on_bad_lines.lower() == "error":
+        return json_recovery_mode_t.FAIL
+    elif on_bad_lines.lower() == "recover":
+        return json_recovery_mode_t.RECOVER_WITH_NULL
+    else:
+        raise TypeError(f"Invalid parameter for {on_bad_lines=}")
+
+
 cpdef read_json(object filepaths_or_buffers,
                 object dtype,
                 bool lines,
                 object compression,
                 object byte_range,
                 bool keep_quotes,
                 bool mixed_types_as_string,
-                bool prune_columns):
+                bool prune_columns,
+                object on_bad_lines):
     """
     Cython function to call into libcudf API, see `read_json`.
 
@@ -118,6 +129,7 @@ cpdef read_json(object filepaths_or_buffers,
         .lines(c_lines)
         .byte_range_offset(c_range_offset)
         .byte_range_size(c_range_size)
+        .recovery_mode(_get_json_recovery_mode(on_bad_lines))
         .build()
     )
     if is_list_like_dtypes:
@@ -128,6 +140,7 @@ cpdef read_json(object filepaths_or_buffers,
     opts.enable_keep_quotes(keep_quotes)
     opts.enable_mixed_types_as_string(mixed_types_as_string)
     opts.enable_prune_columns(prune_columns)
+
     # Read JSON
     cdef cudf_io_types.table_with_metadata c_result
 

diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
@@ -19,6 +19,10 @@ cdef extern from "cudf/io/json.hpp" \
         data_type type
         map[string, schema_element] child_types
 
+    cdef enum json_recovery_mode_t:
+        FAIL "cudf::io::json_recovery_mode_t::FAIL"
+        RECOVER_WITH_NULL "cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL"
+
     cdef cppclass json_reader_options:
         json_reader_options() except +
         cudf_io_types.source_info get_source() except +
@@ -90,6 +94,9 @@ cdef extern from "cudf/io/json.hpp" \
         json_reader_options_builder& keep_quotes(
             bool val
         ) except +
+        json_reader_options_builder& recovery_mode(
+            json_recovery_mode_t val
+        ) except +
 
         json_reader_options build() except +
 

diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
@@ -27,6 +27,7 @@ def read_json(
     storage_options=None,
     mixed_types_as_string=False,
     prune_columns=False,
+    on_bad_lines="error",
     *args,
     **kwargs,
 ):
@@ -94,14 +95,15 @@ def read_json(
                 filepaths_or_buffers.append(tmp_source)
 
         df = libjson.read_json(
-            filepaths_or_buffers,
-            dtype,
-            lines,
-            compression,
-            byte_range,
-            keep_quotes,
-            mixed_types_as_string,
-            prune_columns,
+            filepaths_or_buffers=filepaths_or_buffers,
+            dtype=dtype,
+            lines=lines,
+            compression=compression,
+            byte_range=byte_range,
+            keep_quotes=keep_quotes,
+            mixed_types_as_string=mixed_types_as_string,
+            prune_columns=prune_columns,
+            on_bad_lines=on_bad_lines,
         )
     else:
         warnings.warn(

diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
@@ -1392,3 +1392,34 @@ def test_json_nested_mixed_types_error(jsonl_string):
             orient="records",
             lines=True,
         )
+
+
+@pytest.mark.parametrize("on_bad_lines", ["error", "recover", "abc"])
+def test_json_reader_on_bad_lines(on_bad_lines):
+    json_input = StringIO(
+        '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n'
+    )
+    if on_bad_lines == "error":
+        with pytest.raises(RuntimeError):
+            cudf.read_json(
+                json_input,
+                lines=True,
+                orient="records",
+                on_bad_lines=on_bad_lines,
+            )
+    elif on_bad_lines == "recover":
+        actual = cudf.read_json(
+            json_input, lines=True, orient="records", on_bad_lines=on_bad_lines
+        )
+        expected = cudf.DataFrame(
+            {"a": [1, 2, None, 3], "b": [10, 11, None, 12]}
+        )
+        assert_eq(actual, expected)
+    else:
+        with pytest.raises(TypeError):
+            cudf.read_json(
+                json_input,
+                lines=True,
+                orient="records",
+                on_bad_lines=on_bad_lines,
+            )
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
@@ -739,6 +739,11 @@
 
     If True, only return those columns mentioned in the dtype argument.
     If `False` dtype argument is used a type inference suggestion.
+on_bad_lines : {'error', 'recover'}, default 'error'
+    Specifies what to do upon encountering a bad line. Allowed values are :
+
+    - ``'error'``, raise an Exception when a bad line is encountered.
+    - ``'recover'``, fills the row with <NA> when a bad line is encountered.
 Returns
 -------
 result : Series or DataFrame, depending on the value of `typ`.