Skip to content

Commit

Permalink
Enable all json reader options in pylibcudf read_json (#17563)
Browse files Browse the repository at this point in the history
This PR exposes all JSON reader options in pylibcudf and enables them via kwargs in `cudf.read_json`.
Since kwargs cannot be used in Cython, they are passed to Cython as a dict.
These options are intentionally hidden from the docs, as they are currently used mostly for testing feature requests from the Spark JSON reader. These options are expected to change.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: #17563
  • Loading branch information
karthikeyann authored Dec 16, 2024
1 parent 76b35ad commit e9744b4
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 17 deletions.
6 changes: 1 addition & 5 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,6 @@ def read_json(
if dtype is None:
dtype = True

if kwargs:
raise ValueError(
"cudf engine doesn't support the "
f"following keyword arguments: {list(kwargs.keys())}"
)
if args:
raise ValueError(
"cudf engine doesn't support the "
Expand Down Expand Up @@ -198,6 +193,7 @@ def read_json(
mixed_types_as_string=mixed_types_as_string,
prune_columns=prune_columns,
recovery_mode=c_on_bad_lines,
extra_parameters=kwargs,
)

df = cudf.DataFrame._from_data(
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/io/json.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ cpdef TableWithMetadata read_json(
bool mixed_types_as_string = *,
bool prune_columns = *,
json_recovery_mode_t recovery_mode = *,
dict extra_parameters = *,
)


Expand Down
41 changes: 40 additions & 1 deletion python/pylibcudf/pylibcudf/io/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,10 @@ cdef json_reader_options _setup_json_reader_options(
bool keep_quotes,
bool mixed_types_as_string,
bool prune_columns,
json_recovery_mode_t recovery_mode):
json_recovery_mode_t recovery_mode,
dict extra_parameters=None):

cdef vector[string] na_vec
cdef vector[data_type] types_vec
cdef json_reader_options opts = (
json_reader_options.builder(source_info.c_obj)
Expand All @@ -81,6 +83,39 @@ cdef json_reader_options _setup_json_reader_options(
opts.enable_keep_quotes(keep_quotes)
opts.enable_mixed_types_as_string(mixed_types_as_string)
opts.enable_prune_columns(prune_columns)

# These hidden options are subject to change without a deprecation cycle.
# They are used to test libcudf JSON reader features and are not used in cuDF.
if extra_parameters is not None:
for key, value in extra_parameters.items():
if key == 'delimiter':
opts.set_delimiter(ord(value))
elif key == 'dayfirst':
opts.enable_dayfirst(value)
elif key == 'experimental':
opts.enable_experimental(value)
elif key == 'normalize_single_quotes':
opts.enable_normalize_single_quotes(value)
elif key == 'normalize_whitespace':
opts.enable_normalize_whitespace(value)
elif key == 'strict_validation':
opts.set_strict_validation(value)
elif key == 'allow_unquoted_control_chars':
opts.allow_unquoted_control_chars(value)
elif key == 'allow_numeric_leading_zeros':
opts.allow_numeric_leading_zeros(value)
elif key == 'allow_nonnumeric_numbers':
opts.allow_nonnumeric_numbers(value)
elif key == 'na_values':
for na_val in value:
if isinstance(na_val, str):
na_vec.push_back(na_val.encode())
opts.set_na_values(na_vec)
else:
raise ValueError(
"cudf engine doesn't support the "
f"'{key}' keyword argument for read_json"
)
return opts


Expand Down Expand Up @@ -196,6 +231,7 @@ cpdef TableWithMetadata read_json(
bool mixed_types_as_string = False,
bool prune_columns = False,
json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL,
dict extra_parameters = None,
):
"""Reads a JSON file into a :py:class:`~.types.TableWithMetadata`.
Expand Down Expand Up @@ -227,6 +263,8 @@ cpdef TableWithMetadata read_json(
recovery_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL
Whether to raise an error or set corresponding values to null
when encountering an invalid JSON line.
extra_parameters : dict, default None
Additional hidden parameters to pass to the JSON reader.
Returns
-------
Expand All @@ -244,6 +282,7 @@ cpdef TableWithMetadata read_json(
mixed_types_as_string=mixed_types_as_string,
prune_columns=prune_columns,
recovery_mode=recovery_mode,
extra_parameters=extra_parameters,
)

# Read JSON
Expand Down
71 changes: 60 additions & 11 deletions python/pylibcudf/pylibcudf/libcudf/io/json.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ from libc.stdint cimport int32_t, uint8_t
from libcpp cimport bool
from libcpp.map cimport map
from libcpp.memory cimport shared_ptr, unique_ptr
from libcpp.optional cimport optional
from libcpp.string cimport string
from libcpp.vector cimport vector
from pylibcudf.exception_handler cimport libcudf_exception_handler
Expand All @@ -17,6 +18,7 @@ cdef extern from "cudf/io/json.hpp" \
cdef struct schema_element:
data_type type
map[string, schema_element] child_types
optional[vector[string]] column_order

cpdef enum class json_recovery_mode_t(int32_t):
FAIL
Expand All @@ -30,30 +32,51 @@ cdef extern from "cudf/io/json.hpp" \
except +libcudf_exception_handler
size_t get_byte_range_offset() except +libcudf_exception_handler
size_t get_byte_range_size() except +libcudf_exception_handler
size_t get_byte_range_size_with_padding() except +libcudf_exception_handler
size_t get_byte_range_padding() except +libcudf_exception_handler
char get_delimiter() except +libcudf_exception_handler
bool is_enabled_lines() except +libcudf_exception_handler
bool is_enabled_mixed_types_as_string() except +libcudf_exception_handler
bool is_enabled_prune_columns() except +libcudf_exception_handler
bool is_enabled_dayfirst() except +libcudf_exception_handler
bool is_enabled_experimental() except +libcudf_exception_handler
bool is_enabled_dayfirst() except +libcudf_exception_handler
bool is_enabled_keep_quotes() except +libcudf_exception_handler
bool is_enabled_normalize_single_quotes() except +libcudf_exception_handler
bool is_enabled_normalize_whitespace() except +libcudf_exception_handler
json_recovery_mode_t recovery_mode() except +libcudf_exception_handler
bool is_strict_validation() except +libcudf_exception_handler
bool is_allowed_numeric_leading_zeros() except +libcudf_exception_handler
bool is_allowed_nonnumeric_numbers() except +libcudf_exception_handler
bool is_allowed_unquoted_control_chars() except +libcudf_exception_handler
vector[string] get_na_values() except +libcudf_exception_handler

# setter
void set_dtypes(
vector[data_type] types
) except +libcudf_exception_handler
void set_dtypes(
map[string, schema_element] types
) except +libcudf_exception_handler
void set_compression(
cudf_io_types.compression_type compression
) except +libcudf_exception_handler
void set_dtypes(vector[data_type] types) except +libcudf_exception_handler
void set_dtypes(map[string, data_type] types) except +libcudf_exception_handler
void set_dtypes(map[string, schema_element] types)\
except +libcudf_exception_handler
void set_dtypes(schema_element types) except +libcudf_exception_handler
void set_compression(cudf_io_types.compression_type comp_type)\
except +libcudf_exception_handler
void set_byte_range_offset(size_t offset) except +libcudf_exception_handler
void set_byte_range_size(size_t size) except +libcudf_exception_handler
void set_delimiter(char delimiter) except +libcudf_exception_handler
void enable_lines(bool val) except +libcudf_exception_handler
void enable_mixed_types_as_string(bool val) except +libcudf_exception_handler
void enable_prune_columns(bool val) except +libcudf_exception_handler
void enable_dayfirst(bool val) except +libcudf_exception_handler
void enable_experimental(bool val) except +libcudf_exception_handler
void enable_dayfirst(bool val) except +libcudf_exception_handler
void enable_keep_quotes(bool val) except +libcudf_exception_handler
void enable_normalize_single_quotes(bool val) except +libcudf_exception_handler

void enable_normalize_whitespace(bool val) except +libcudf_exception_handler
void set_recovery_mode(json_recovery_mode_t val)\
except +libcudf_exception_handler
void set_strict_validation(bool val) except +libcudf_exception_handler
void allow_numeric_leading_zeros(bool val) except +libcudf_exception_handler
void allow_nonnumeric_numbers(bool val) except +libcudf_exception_handler
void allow_unquoted_control_chars(bool val) except +libcudf_exception_handler
void set_na_values(vector[string] vals) except +libcudf_exception_handler

@staticmethod
json_reader_options_builder builder(
Expand All @@ -74,6 +97,9 @@ cdef extern from "cudf/io/json.hpp" \
json_reader_options_builder& dtypes(
map[string, schema_element] types
) except +libcudf_exception_handler
json_reader_options_builder& dtypes(
schema_element types
) except +libcudf_exception_handler
json_reader_options_builder& compression(
cudf_io_types.compression_type compression
) except +libcudf_exception_handler
Expand All @@ -83,6 +109,9 @@ cdef extern from "cudf/io/json.hpp" \
json_reader_options_builder& byte_range_size(
size_t size
) except +libcudf_exception_handler
json_reader_options_builder& delimiter(
char delimiter
) except +libcudf_exception_handler
json_reader_options_builder& lines(
bool val
) except +libcudf_exception_handler
Expand All @@ -92,16 +121,36 @@ cdef extern from "cudf/io/json.hpp" \
json_reader_options_builder& prune_columns(
bool val
) except +libcudf_exception_handler
json_reader_options_builder& experimental(
bool val
) except +libcudf_exception_handler
json_reader_options_builder& dayfirst(
bool val
) except +libcudf_exception_handler
json_reader_options_builder& keep_quotes(
bool val
) except +libcudf_exception_handler
json_reader_options_builder& normalize_single_quotes(
bool val
) except +libcudf_exception_handler
json_reader_options_builder& normalize_whitespace(
bool val
) except +libcudf_exception_handler
json_reader_options_builder& recovery_mode(
json_recovery_mode_t val
) except +libcudf_exception_handler

json_reader_options_builder& strict_validation(bool val)\
except +libcudf_exception_handler
json_reader_options_builder& numeric_leading_zeros(bool val)\
except +libcudf_exception_handler
json_reader_options_builder& nonnumeric_numbers(bool val)\
except +libcudf_exception_handler
json_reader_options_builder& unquoted_control_chars(bool val)\
except +libcudf_exception_handler
json_reader_options_builder& na_values(vector[string] vals)\
except +libcudf_exception_handler

json_reader_options build() except +libcudf_exception_handler

cdef cudf_io_types.table_with_metadata read_json(
Expand Down

0 comments on commit e9744b4

Please sign in to comment.