Skip to content

Commit

Permalink
Implement on_bad_lines in json reader (rapidsai#15834)
Browse files Browse the repository at this point in the history
Fixes: rapidsai#15559 

This PR implements `on_bad_lines` in json reader. When `on_bad_lines="recover"`, bad lines are replaced by `<NA>` values.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: rapidsai#15834
  • Loading branch information
galipremsagar authored May 24, 2024
1 parent 81cadb6 commit d756c37
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 9 deletions.
15 changes: 14 additions & 1 deletion python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ from cudf._lib.io.utils cimport (
from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.pylibcudf.libcudf.io.json cimport (
json_reader_options,
json_recovery_mode_t,
json_writer_options,
read_json as libcudf_read_json,
schema_element,
Expand All @@ -42,14 +43,24 @@ from cudf._lib.types cimport dtype_to_data_type
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table


cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines):
if on_bad_lines.lower() == "error":
return json_recovery_mode_t.FAIL
elif on_bad_lines.lower() == "recover":
return json_recovery_mode_t.RECOVER_WITH_NULL
else:
raise TypeError(f"Invalid parameter for {on_bad_lines=}")


cpdef read_json(object filepaths_or_buffers,
object dtype,
bool lines,
object compression,
object byte_range,
bool keep_quotes,
bool mixed_types_as_string,
bool prune_columns):
bool prune_columns,
object on_bad_lines):
"""
Cython function to call into libcudf API, see `read_json`.
Expand Down Expand Up @@ -118,6 +129,7 @@ cpdef read_json(object filepaths_or_buffers,
.lines(c_lines)
.byte_range_offset(c_range_offset)
.byte_range_size(c_range_size)
.recovery_mode(_get_json_recovery_mode(on_bad_lines))
.build()
)
if is_list_like_dtypes:
Expand All @@ -128,6 +140,7 @@ cpdef read_json(object filepaths_or_buffers,
opts.enable_keep_quotes(keep_quotes)
opts.enable_mixed_types_as_string(mixed_types_as_string)
opts.enable_prune_columns(prune_columns)

# Read JSON
cdef cudf_io_types.table_with_metadata c_result

Expand Down
7 changes: 7 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ cdef extern from "cudf/io/json.hpp" \
data_type type
map[string, schema_element] child_types

cdef enum json_recovery_mode_t:
FAIL "cudf::io::json_recovery_mode_t::FAIL"
RECOVER_WITH_NULL "cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL"

cdef cppclass json_reader_options:
json_reader_options() except +
cudf_io_types.source_info get_source() except +
Expand Down Expand Up @@ -90,6 +94,9 @@ cdef extern from "cudf/io/json.hpp" \
json_reader_options_builder& keep_quotes(
bool val
) except +
json_reader_options_builder& recovery_mode(
json_recovery_mode_t val
) except +

json_reader_options build() except +

Expand Down
18 changes: 10 additions & 8 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def read_json(
storage_options=None,
mixed_types_as_string=False,
prune_columns=False,
on_bad_lines="error",
*args,
**kwargs,
):
Expand Down Expand Up @@ -94,14 +95,15 @@ def read_json(
filepaths_or_buffers.append(tmp_source)

df = libjson.read_json(
filepaths_or_buffers,
dtype,
lines,
compression,
byte_range,
keep_quotes,
mixed_types_as_string,
prune_columns,
filepaths_or_buffers=filepaths_or_buffers,
dtype=dtype,
lines=lines,
compression=compression,
byte_range=byte_range,
keep_quotes=keep_quotes,
mixed_types_as_string=mixed_types_as_string,
prune_columns=prune_columns,
on_bad_lines=on_bad_lines,
)
else:
warnings.warn(
Expand Down
31 changes: 31 additions & 0 deletions python/cudf/cudf/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -1392,3 +1392,34 @@ def test_json_nested_mixed_types_error(jsonl_string):
orient="records",
lines=True,
)


@pytest.mark.parametrize("on_bad_lines", ["error", "recover", "abc"])
def test_json_reader_on_bad_lines(on_bad_lines):
json_input = StringIO(
'{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n'
)
if on_bad_lines == "error":
with pytest.raises(RuntimeError):
cudf.read_json(
json_input,
lines=True,
orient="records",
on_bad_lines=on_bad_lines,
)
elif on_bad_lines == "recover":
actual = cudf.read_json(
json_input, lines=True, orient="records", on_bad_lines=on_bad_lines
)
expected = cudf.DataFrame(
{"a": [1, 2, None, 3], "b": [10, 11, None, 12]}
)
assert_eq(actual, expected)
else:
with pytest.raises(TypeError):
cudf.read_json(
json_input,
lines=True,
orient="records",
on_bad_lines=on_bad_lines,
)
5 changes: 5 additions & 0 deletions python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,11 @@
If True, only return those columns mentioned in the dtype argument.
If `False` dtype argument is used a type inference suggestion.
on_bad_lines : {'error', 'recover'}, default 'error'
Specifies what to do upon encountering a bad line. Allowed values are :
- ``'error'``, raise an Exception when a bad line is encountered.
- ``'recover'``, fills the row with <NA> when a bad line is encountered.
Returns
-------
result : Series or DataFrame, depending on the value of `typ`.
Expand Down

0 comments on commit d756c37

Please sign in to comment.