diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 242727163ee..a8fef907bad 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -24,6 +24,7 @@ from cudf._lib.io.utils cimport ( from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink from cudf._lib.pylibcudf.libcudf.io.json cimport ( json_reader_options, + json_recovery_mode_t, json_writer_options, read_json as libcudf_read_json, schema_element, @@ -42,6 +43,15 @@ from cudf._lib.types cimport dtype_to_data_type from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines): + if on_bad_lines.lower() == "error": + return json_recovery_mode_t.FAIL + elif on_bad_lines.lower() == "recover": + return json_recovery_mode_t.RECOVER_WITH_NULL + else: + raise TypeError(f"Invalid parameter for {on_bad_lines=}") + + cpdef read_json(object filepaths_or_buffers, object dtype, bool lines, @@ -49,7 +59,8 @@ cpdef read_json(object filepaths_or_buffers, object byte_range, bool keep_quotes, bool mixed_types_as_string, - bool prune_columns): + bool prune_columns, + object on_bad_lines): """ Cython function to call into libcudf API, see `read_json`. @@ -118,6 +129,7 @@ cpdef read_json(object filepaths_or_buffers, .lines(c_lines) .byte_range_offset(c_range_offset) .byte_range_size(c_range_size) + .recovery_mode(_get_json_recovery_mode(on_bad_lines)) .build() ) if is_list_like_dtypes: @@ -128,6 +140,7 @@ cpdef read_json(object filepaths_or_buffers, opts.enable_keep_quotes(keep_quotes) opts.enable_mixed_types_as_string(mixed_types_as_string) opts.enable_prune_columns(prune_columns) + # Read JSON cdef cudf_io_types.table_with_metadata c_result diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd index 10e43467d57..2e50cccd132 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd @@ -19,6 +19,10 @@ cdef extern from "cudf/io/json.hpp" \ data_type type map[string, schema_element] child_types + cdef enum json_recovery_mode_t: + FAIL "cudf::io::json_recovery_mode_t::FAIL" + RECOVER_WITH_NULL "cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL" + cdef cppclass json_reader_options: json_reader_options() except + cudf_io_types.source_info get_source() except + @@ -90,6 +94,9 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& keep_quotes( bool val ) except + + json_reader_options_builder& recovery_mode( + json_recovery_mode_t val + ) except + json_reader_options build() except + diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 7de9705e4cb..dd4a0d9eb07 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -27,6 +27,7 @@ def read_json( storage_options=None, mixed_types_as_string=False, prune_columns=False, + on_bad_lines="error", *args, **kwargs, ): @@ -94,14 +95,15 @@ def read_json( filepaths_or_buffers.append(tmp_source) df = libjson.read_json( - filepaths_or_buffers, - dtype, - lines, - compression, - byte_range, - keep_quotes, - mixed_types_as_string, - prune_columns, + filepaths_or_buffers=filepaths_or_buffers, + dtype=dtype, + lines=lines, + compression=compression, + byte_range=byte_range, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + on_bad_lines=on_bad_lines, ) else: warnings.warn( diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 51287fe26a0..ba6a8f94719 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1392,3 +1392,34 @@ def test_json_nested_mixed_types_error(jsonl_string): orient="records", lines=True, ) + + +@pytest.mark.parametrize("on_bad_lines", ["error", "recover", "abc"]) +def test_json_reader_on_bad_lines(on_bad_lines): + json_input = StringIO( + '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n' + ) + if on_bad_lines == "error": + with pytest.raises(RuntimeError): + cudf.read_json( + json_input, + lines=True, + orient="records", + on_bad_lines=on_bad_lines, + ) + elif on_bad_lines == "recover": + actual = cudf.read_json( + json_input, lines=True, orient="records", on_bad_lines=on_bad_lines + ) + expected = cudf.DataFrame( + {"a": [1, 2, None, 3], "b": [10, 11, None, 12]} + ) + assert_eq(actual, expected) + else: + with pytest.raises(TypeError): + cudf.read_json( + json_input, + lines=True, + orient="records", + on_bad_lines=on_bad_lines, + ) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 1366a0b8e84..0209c692935 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -739,6 +739,11 @@ If True, only return those columns mentioned in the dtype argument. If `False` dtype argument is used a type inference suggestion. +on_bad_lines : {'error', 'recover'}, default 'error' + Specifies what to do upon encountering a bad line. Allowed values are : + + - ``'error'``, raise an Exception when a bad line is encountered. + - ``'recover'``, fills the row with when a bad line is encountered. Returns ------- result : Series or DataFrame, depending on the value of `typ`.