diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 89af00c713d..4f0709ec985 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -91,11 +91,6 @@ def read_json( if dtype is None: dtype = True - if kwargs: - raise ValueError( - "cudf engine doesn't support the " - f"following keyword arguments: {list(kwargs.keys())}" - ) if args: raise ValueError( "cudf engine doesn't support the " @@ -198,6 +193,7 @@ def read_json( mixed_types_as_string=mixed_types_as_string, prune_columns=prune_columns, recovery_mode=c_on_bad_lines, + extra_parameters=kwargs, ) df = cudf.DataFrame._from_data( diff --git a/python/pylibcudf/pylibcudf/io/json.pxd b/python/pylibcudf/pylibcudf/io/json.pxd index f65c1034598..d7726971351 100644 --- a/python/pylibcudf/pylibcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/io/json.pxd @@ -21,6 +21,7 @@ cpdef TableWithMetadata read_json( bool mixed_types_as_string = *, bool prune_columns = *, json_recovery_mode_t recovery_mode = *, + dict extra_parameters = *, ) diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index ad2989925c9..32f737fbff4 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -57,8 +57,10 @@ cdef json_reader_options _setup_json_reader_options( bool keep_quotes, bool mixed_types_as_string, bool prune_columns, - json_recovery_mode_t recovery_mode): + json_recovery_mode_t recovery_mode, + dict extra_parameters=None): + cdef vector[string] na_vec cdef vector[data_type] types_vec cdef json_reader_options opts = ( json_reader_options.builder(source_info.c_obj) @@ -81,6 +83,39 @@ cdef json_reader_options _setup_json_reader_options( opts.enable_keep_quotes(keep_quotes) opts.enable_mixed_types_as_string(mixed_types_as_string) opts.enable_prune_columns(prune_columns) + + # These hidden options are subject to change without a deprecation cycle. 
+ # These are used to test libcudf JSON reader features, not used in cuDF. + if extra_parameters is not None: + for key, value in extra_parameters.items(): + if key == 'delimiter': + opts.set_delimiter(ord(value)) + elif key == 'dayfirst': + opts.enable_dayfirst(value) + elif key == 'experimental': + opts.enable_experimental(value) + elif key == 'normalize_single_quotes': + opts.enable_normalize_single_quotes(value) + elif key == 'normalize_whitespace': + opts.enable_normalize_whitespace(value) + elif key == 'strict_validation': + opts.set_strict_validation(value) + elif key == 'allow_unquoted_control_chars': + opts.allow_unquoted_control_chars(value) + elif key == 'allow_numeric_leading_zeros': + opts.allow_numeric_leading_zeros(value) + elif key == 'allow_nonnumeric_numbers': + opts.allow_nonnumeric_numbers(value) + elif key == 'na_values': + for na_val in value: + if isinstance(na_val, str): + na_vec.push_back(na_val.encode()) + opts.set_na_values(na_vec) + else: + raise ValueError( + "cudf engine doesn't support the " + f"'{key}' keyword argument for read_json" + ) return opts @@ -196,6 +231,7 @@ cpdef TableWithMetadata read_json( bool mixed_types_as_string = False, bool prune_columns = False, json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL, + dict extra_parameters = None, ): """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`. @@ -227,6 +263,8 @@ cpdef TableWithMetadata read_json( recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL Whether to raise an error or set corresponding values to null when encountering an invalid JSON line. + extra_parameters : dict, default None + Additional hidden parameters to pass to the JSON reader. 
Returns ------- @@ -244,6 +282,7 @@ cpdef TableWithMetadata read_json( mixed_types_as_string=mixed_types_as_string, prune_columns=prune_columns, recovery_mode=recovery_mode, + extra_parameters=extra_parameters, ) # Read JSON diff --git a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd index a7ca6978621..c241c478f25 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd @@ -5,6 +5,7 @@ from libc.stdint cimport int32_t, uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -17,6 +18,7 @@ cdef extern from "cudf/io/json.hpp" \ cdef struct schema_element: data_type type map[string, schema_element] child_types + optional[vector[string]] column_order cpdef enum class json_recovery_mode_t(int32_t): FAIL @@ -30,30 +32,51 @@ cdef extern from "cudf/io/json.hpp" \ except +libcudf_exception_handler size_t get_byte_range_offset() except +libcudf_exception_handler size_t get_byte_range_size() except +libcudf_exception_handler + size_t get_byte_range_size_with_padding() except +libcudf_exception_handler + size_t get_byte_range_padding() except +libcudf_exception_handler + char get_delimiter() except +libcudf_exception_handler bool is_enabled_lines() except +libcudf_exception_handler bool is_enabled_mixed_types_as_string() except +libcudf_exception_handler bool is_enabled_prune_columns() except +libcudf_exception_handler - bool is_enabled_dayfirst() except +libcudf_exception_handler bool is_enabled_experimental() except +libcudf_exception_handler + bool is_enabled_dayfirst() except +libcudf_exception_handler + bool is_enabled_keep_quotes() except +libcudf_exception_handler + bool is_enabled_normalize_single_quotes() except 
+libcudf_exception_handler + bool is_enabled_normalize_whitespace() except +libcudf_exception_handler + json_recovery_mode_t recovery_mode() except +libcudf_exception_handler + bool is_strict_validation() except +libcudf_exception_handler + bool is_allowed_numeric_leading_zeros() except +libcudf_exception_handler + bool is_allowed_nonnumeric_numbers() except +libcudf_exception_handler + bool is_allowed_unquoted_control_chars() except +libcudf_exception_handler + vector[string] get_na_values() except +libcudf_exception_handler # setter - void set_dtypes( - vector[data_type] types - ) except +libcudf_exception_handler - void set_dtypes( - map[string, schema_element] types - ) except +libcudf_exception_handler - void set_compression( - cudf_io_types.compression_type compression - ) except +libcudf_exception_handler + void set_dtypes(vector[data_type] types) except +libcudf_exception_handler + void set_dtypes(map[string, data_type] types) except +libcudf_exception_handler + void set_dtypes(map[string, schema_element] types)\ + except +libcudf_exception_handler + void set_dtypes(schema_element types) except +libcudf_exception_handler + void set_compression(cudf_io_types.compression_type comp_type)\ + except +libcudf_exception_handler void set_byte_range_offset(size_t offset) except +libcudf_exception_handler void set_byte_range_size(size_t size) except +libcudf_exception_handler + void set_delimiter(char delimiter) except +libcudf_exception_handler void enable_lines(bool val) except +libcudf_exception_handler void enable_mixed_types_as_string(bool val) except +libcudf_exception_handler void enable_prune_columns(bool val) except +libcudf_exception_handler - void enable_dayfirst(bool val) except +libcudf_exception_handler void enable_experimental(bool val) except +libcudf_exception_handler + void enable_dayfirst(bool val) except +libcudf_exception_handler void enable_keep_quotes(bool val) except +libcudf_exception_handler + void enable_normalize_single_quotes(bool val) 
except +libcudf_exception_handler + + void enable_normalize_whitespace(bool val) except +libcudf_exception_handler + void set_recovery_mode(json_recovery_mode_t val)\ + except +libcudf_exception_handler + void set_strict_validation(bool val) except +libcudf_exception_handler + void allow_numeric_leading_zeros(bool val) except +libcudf_exception_handler + void allow_nonnumeric_numbers(bool val) except +libcudf_exception_handler + void allow_unquoted_control_chars(bool val) except +libcudf_exception_handler + void set_na_values(vector[string] vals) except +libcudf_exception_handler @staticmethod json_reader_options_builder builder( @@ -74,6 +97,9 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& dtypes( map[string, schema_element] types ) except +libcudf_exception_handler + json_reader_options_builder& dtypes( + schema_element types + ) except +libcudf_exception_handler json_reader_options_builder& compression( cudf_io_types.compression_type compression ) except +libcudf_exception_handler @@ -83,6 +109,9 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& byte_range_size( size_t size ) except +libcudf_exception_handler + json_reader_options_builder& delimiter( + char delimiter + ) except +libcudf_exception_handler json_reader_options_builder& lines( bool val ) except +libcudf_exception_handler @@ -92,16 +121,36 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& prune_columns( bool val ) except +libcudf_exception_handler + json_reader_options_builder& experimental( + bool val + ) except +libcudf_exception_handler json_reader_options_builder& dayfirst( bool val ) except +libcudf_exception_handler json_reader_options_builder& keep_quotes( bool val ) except +libcudf_exception_handler + json_reader_options_builder& normalize_single_quotes( + bool val + ) except +libcudf_exception_handler + json_reader_options_builder& normalize_whitespace( + bool val + ) except +libcudf_exception_handler json_reader_options_builder& 
recovery_mode( json_recovery_mode_t val ) except +libcudf_exception_handler + json_reader_options_builder& strict_validation(bool val)\ + except +libcudf_exception_handler + json_reader_options_builder& numeric_leading_zeros(bool val)\ + except +libcudf_exception_handler + json_reader_options_builder& nonnumeric_numbers(bool val)\ + except +libcudf_exception_handler + json_reader_options_builder& unquoted_control_chars(bool val)\ + except +libcudf_exception_handler + json_reader_options_builder& na_values(vector[string] vals)\ + except +libcudf_exception_handler + json_reader_options build() except +libcudf_exception_handler cdef cudf_io_types.table_with_metadata read_json(