Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable all json reader options in pylibcudf read_json #17563

Open
wants to merge 6 commits into
base: branch-25.02
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,6 @@ def read_json(
if dtype is None:
dtype = True

if kwargs:
raise ValueError(
"cudf engine doesn't support the "
f"following keyword arguments: {list(kwargs.keys())}"
)
if args:
raise ValueError(
"cudf engine doesn't support the "
Expand Down Expand Up @@ -198,6 +193,7 @@ def read_json(
mixed_types_as_string=mixed_types_as_string,
prune_columns=prune_columns,
recovery_mode=c_on_bad_lines,
extra_parameters=kwargs,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just an FYI: I'm updating this API to use JSON reader options classes to match the other IO functions (e.g. https://github.com/rapidsai/cudf/blob/branch-25.02/python/pylibcudf/pylibcudf/io/parquet.pyx#L309), so it will look like this:

plc.io.json.read_json(
    plc.io.json.JsonReaderOptions.builder(
        plc.io.SourceInfo(file_paths_or_buffers)
    )
    .byte_range_size(...)
    ...
)

Copy link
Contributor Author

@karthikeyann karthikeyann Dec 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These options will change: most of them exist for Spark, and they will keep changing until all Spark JSON feature requests are completed. They are not intended for Python users; they are exposed for quicker testing.

)

df = cudf.DataFrame._from_data(
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/io/json.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ cpdef TableWithMetadata read_json(
bool mixed_types_as_string = *,
bool prune_columns = *,
json_recovery_mode_t recovery_mode = *,
dict extra_parameters = *,
)


Expand Down
38 changes: 37 additions & 1 deletion python/pylibcudf/pylibcudf/io/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,10 @@ cdef json_reader_options _setup_json_reader_options(
bool keep_quotes,
bool mixed_types_as_string,
bool prune_columns,
json_recovery_mode_t recovery_mode):
json_recovery_mode_t recovery_mode,
dict extra_parameters=None):

cdef vector[string] na_vec
cdef vector[data_type] types_vec
cdef json_reader_options opts = (
json_reader_options.builder(source_info.c_obj)
Expand All @@ -81,6 +83,36 @@ cdef json_reader_options _setup_json_reader_options(
opts.enable_keep_quotes(keep_quotes)
opts.enable_mixed_types_as_string(mixed_types_as_string)
opts.enable_prune_columns(prune_columns)
if extra_parameters is not None:
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
for key, value in extra_parameters.items():
if key == 'delimiter':
opts.set_delimiter(ord(value))
elif key == 'dayfirst':
opts.enable_dayfirst(value)
elif key == 'experimental':
opts.enable_experimental(value)
elif key == 'normalize_single_quotes':
opts.enable_normalize_single_quotes(value)
elif key == 'normalize_whitespace':
opts.enable_normalize_whitespace(value)
elif key == 'strict_validation':
opts.set_strict_validation(value)
elif key == 'allow_unquoted_control_chars':
opts.allow_unquoted_control_chars(value)
elif key == 'allow_numeric_leading_zeros':
opts.allow_numeric_leading_zeros(value)
elif key == 'allow_nonnumeric_numbers':
opts.allow_nonnumeric_numbers(value)
elif key == 'na_values':
for na_val in value:
if isinstance(na_val, str):
na_vec.push_back(na_val.encode())
opts.set_na_values(na_vec)
else:
raise ValueError(
"cudf engine doesn't support the "
f"'{key}' keyword argument for read_json"
)
return opts


Expand Down Expand Up @@ -196,6 +228,7 @@ cpdef TableWithMetadata read_json(
bool mixed_types_as_string = False,
bool prune_columns = False,
json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL,
dict extra_parameters = None,
):
"""Reads an JSON file into a :py:class:`~.types.TableWithMetadata`.

Expand Down Expand Up @@ -227,6 +260,8 @@ cpdef TableWithMetadata read_json(
recovery_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL
Whether to raise an error or set corresponding values to null
when encountering an invalid JSON line.
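extra_parameters : dict, default None
Additional hidden parameters to pass to the JSON reader, given as a
mapping of option name to value. Recognized keys are ``delimiter``,
``dayfirst``, ``experimental``, ``normalize_single_quotes``,
``normalize_whitespace``, ``strict_validation``,
``allow_unquoted_control_chars``, ``allow_numeric_leading_zeros``,
``allow_nonnumeric_numbers`` and ``na_values``; any other key raises
``ValueError``.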

Returns
-------
Expand All @@ -244,6 +279,7 @@ cpdef TableWithMetadata read_json(
mixed_types_as_string=mixed_types_as_string,
prune_columns=prune_columns,
recovery_mode=recovery_mode,
extra_parameters=extra_parameters,
)

# Read JSON
Expand Down
71 changes: 60 additions & 11 deletions python/pylibcudf/pylibcudf/libcudf/io/json.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ from libc.stdint cimport int32_t, uint8_t
from libcpp cimport bool
from libcpp.map cimport map
from libcpp.memory cimport shared_ptr, unique_ptr
from libcpp.optional cimport optional
from libcpp.string cimport string
from libcpp.vector cimport vector
from pylibcudf.exception_handler cimport libcudf_exception_handler
Expand All @@ -17,6 +18,7 @@ cdef extern from "cudf/io/json.hpp" \
cdef struct schema_element:
data_type type
map[string, schema_element] child_types
optional[vector[string]] column_order

cpdef enum class json_recovery_mode_t(int32_t):
FAIL
Expand All @@ -30,30 +32,51 @@ cdef extern from "cudf/io/json.hpp" \
except +libcudf_exception_handler
size_t get_byte_range_offset() except +libcudf_exception_handler
size_t get_byte_range_size() except +libcudf_exception_handler
size_t get_byte_range_size_with_padding() except +libcudf_exception_handler
size_t get_byte_range_padding() except +libcudf_exception_handler
char get_delimiter() except +libcudf_exception_handler
bool is_enabled_lines() except +libcudf_exception_handler
bool is_enabled_mixed_types_as_string() except +libcudf_exception_handler
bool is_enabled_prune_columns() except +libcudf_exception_handler
bool is_enabled_dayfirst() except +libcudf_exception_handler
bool is_enabled_experimental() except +libcudf_exception_handler
bool is_enabled_dayfirst() except +libcudf_exception_handler
bool is_enabled_keep_quotes() except +libcudf_exception_handler
bool is_enabled_normalize_single_quotes() except +libcudf_exception_handler
bool is_enabled_normalize_whitespace() except +libcudf_exception_handler
json_recovery_mode_t recovery_mode() except +libcudf_exception_handler
bool is_strict_validation() except +libcudf_exception_handler
bool is_allowed_numeric_leading_zeros() except +libcudf_exception_handler
bool is_allowed_nonnumeric_numbers() except +libcudf_exception_handler
bool is_allowed_unquoted_control_chars() except +libcudf_exception_handler
vector[string] get_na_values() except +libcudf_exception_handler

# setter
void set_dtypes(
vector[data_type] types
) except +libcudf_exception_handler
void set_dtypes(
map[string, schema_element] types
) except +libcudf_exception_handler
void set_compression(
cudf_io_types.compression_type compression
) except +libcudf_exception_handler
void set_dtypes(vector[data_type] types) except +libcudf_exception_handler
void set_dtypes(map[string, data_type] types) except +libcudf_exception_handler
void set_dtypes(map[string, schema_element] types)\
except +libcudf_exception_handler
void set_dtypes(schema_element types) except +libcudf_exception_handler
void set_compression(cudf_io_types.compression_type comp_type)\
except +libcudf_exception_handler
void set_byte_range_offset(size_t offset) except +libcudf_exception_handler
void set_byte_range_size(size_t size) except +libcudf_exception_handler
void set_delimiter(char delimiter) except +libcudf_exception_handler
void enable_lines(bool val) except +libcudf_exception_handler
void enable_mixed_types_as_string(bool val) except +libcudf_exception_handler
void enable_prune_columns(bool val) except +libcudf_exception_handler
void enable_dayfirst(bool val) except +libcudf_exception_handler
void enable_experimental(bool val) except +libcudf_exception_handler
void enable_dayfirst(bool val) except +libcudf_exception_handler
void enable_keep_quotes(bool val) except +libcudf_exception_handler
void enable_normalize_single_quotes(bool val) except +libcudf_exception_handler

void enable_normalize_whitespace(bool val) except +libcudf_exception_handler
void set_recovery_mode(json_recovery_mode_t val)\
except +libcudf_exception_handler
void set_strict_validation(bool val) except +libcudf_exception_handler
void allow_numeric_leading_zeros(bool val) except +libcudf_exception_handler
void allow_nonnumeric_numbers(bool val) except +libcudf_exception_handler
void allow_unquoted_control_chars(bool val) except +libcudf_exception_handler
void set_na_values(vector[string] vals) except +libcudf_exception_handler

@staticmethod
json_reader_options_builder builder(
Expand All @@ -74,6 +97,9 @@ cdef extern from "cudf/io/json.hpp" \
json_reader_options_builder& dtypes(
map[string, schema_element] types
) except +libcudf_exception_handler
json_reader_options_builder& dtypes(
schema_element types
) except +libcudf_exception_handler
json_reader_options_builder& compression(
cudf_io_types.compression_type compression
) except +libcudf_exception_handler
Expand All @@ -83,6 +109,9 @@ cdef extern from "cudf/io/json.hpp" \
json_reader_options_builder& byte_range_size(
size_t size
) except +libcudf_exception_handler
json_reader_options_builder& delimiter(
char delimiter
) except +libcudf_exception_handler
json_reader_options_builder& lines(
bool val
) except +libcudf_exception_handler
Expand All @@ -92,16 +121,36 @@ cdef extern from "cudf/io/json.hpp" \
json_reader_options_builder& prune_columns(
bool val
) except +libcudf_exception_handler
json_reader_options_builder& experimental(
bool val
) except +libcudf_exception_handler
json_reader_options_builder& dayfirst(
bool val
) except +libcudf_exception_handler
json_reader_options_builder& keep_quotes(
bool val
) except +libcudf_exception_handler
json_reader_options_builder& normalize_single_quotes(
bool val
) except +libcudf_exception_handler
json_reader_options_builder& normalize_whitespace(
bool val
) except +libcudf_exception_handler
json_reader_options_builder& recovery_mode(
json_recovery_mode_t val
) except +libcudf_exception_handler

json_reader_options_builder& strict_validation(bool val)\
except +libcudf_exception_handler
json_reader_options_builder& numeric_leading_zeros(bool val)\
except +libcudf_exception_handler
json_reader_options_builder& nonnumeric_numbers(bool val)\
except +libcudf_exception_handler
json_reader_options_builder& unquoted_control_chars(bool val)\
except +libcudf_exception_handler
json_reader_options_builder& na_values(vector[string] vals)\
except +libcudf_exception_handler

json_reader_options build() except +libcudf_exception_handler

cdef cudf_io_types.table_with_metadata read_json(
Expand Down
Loading