diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1040fcb7b91..7bc01e64441 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -384,6 +384,7 @@ add_library( src/io/json/nested_json_gpu.cu src/io/json/read_json.cu src/io/json/parser_features.cpp + src/io/json/process_tokens.cu src/io/json/write_json.cu src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index a3d6533705e..ff25a5bacae 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -128,6 +129,19 @@ class json_reader_options { // Whether to recover after an invalid JSON line json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; + // Validation checks for spark + // Should the json validation be strict or not + // Note: strict validation enforces the JSON specification https://www.json.org/json-en.html + bool _strict_validation = false; + // Allow leading zeros for numeric values. + bool _allow_numeric_leading_zeros = true; + // Allow non-numeric numbers: NaN, +INF, -INF, +Infinity, Infinity, -Infinity + bool _allow_nonnumeric_numbers = true; + // Allow unquoted control characters + bool _allow_unquoted_control_chars = true; + // Additional values to recognize as null values + std::vector _na_values; + /** * @brief Constructor from source info. * @@ -298,6 +312,55 @@ class json_reader_options { */ [[nodiscard]] json_recovery_mode_t recovery_mode() const { return _recovery_mode; } + /** + * @brief Whether json validation should be enforced strictly or not. + * + * @return true if it should be. + */ + [[nodiscard]] bool is_strict_validation() const { return _strict_validation; } + + /** + * @brief Whether leading zeros are allowed in numeric values. + * + * @note: This validation is enforced only if strict validation is enabled. 
+ * + * @return true if leading zeros are allowed in numeric values + */ + [[nodiscard]] bool is_allowed_numeric_leading_zeros() const + { + return _allow_numeric_leading_zeros; + } + + /** + * @brief Whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, Infinity, + * and -Infinity. + * + * @note: This validation is enforced only if strict validation is enabled. + * + * @return true if unquoted nonnumeric number values are allowed + */ + [[nodiscard]] bool is_allowed_nonnumeric_numbers() const { return _allow_nonnumeric_numbers; } + + /** + * @brief Whether in a quoted string should characters greater than or equal to 0 and less than 32 + * be allowed without some form of escaping. + * + * @note: This validation is enforced only if strict validation is enabled. + * + * @return true if unquoted control chars are allowed. + */ + [[nodiscard]] bool is_allowed_unquoted_control_chars() const + { + return _allow_unquoted_control_chars; + } + + /** + * @brief Returns additional values to recognize as null values. + * + * @return Additional values to recognize as null values + */ + [[nodiscard]] std::vector const& get_na_values() const { return _na_values; } + /** * @brief Set data types for columns to be read. * @@ -427,6 +490,63 @@ class json_reader_options { * @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines. */ void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; } + + /** + * @brief Set whether strict validation is enabled or not. + * + * @param val Boolean value to indicate whether strict validation is enabled. + */ + void set_strict_validation(bool val) { _strict_validation = val; } + + /** + * @brief Set whether leading zeros are allowed in numeric values. Strict validation + * must be enabled for this to work. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. 
+ * + * @param val Boolean value to indicate whether leading zeros are allowed in numeric values + */ + void allow_numeric_leading_zeros(bool val) + { + CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work."); + _allow_numeric_leading_zeros = val; + } + + /** + * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, + * Infinity, and -Infinity. Strict validation must be enabled for this to work. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. + * + * @param val Boolean value to indicate whether unquoted nonnumeric number values are allowed + */ + void allow_nonnumeric_numbers(bool val) + { + CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work."); + _allow_nonnumeric_numbers = val; + } + + /** + * @brief Set whether in a quoted string should characters greater than or equal to 0 + * and less than 32 be allowed without some form of escaping. Strict validation must + * be enabled for this to work. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. + * + * @param val true to indicate whether unquoted control chars are allowed. + */ + void allow_unquoted_control_chars(bool val) + { + CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work."); + _allow_unquoted_control_chars = val; + } + + /** + * @brief Sets additional values to recognize as null values. + * + * @param vals Vector of values to be considered to be null + */ + void set_na_values(std::vector vals) { _na_values = std::move(vals); } }; /** @@ -638,6 +758,76 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether json validation should be strict or not. + * + * @param val Boolean value to indicate whether json validation should be strict or not. 
+ * @return this for chaining + */ + json_reader_options_builder& strict_validation(bool val) + { + options.set_strict_validation(val); + return *this; + } + + /** + * @brief Set whether leading zeros are allowed in numeric values. Strict validation must + * be enabled for this to have any effect. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. + * + * @param val Boolean value to indicate whether leading zeros are allowed in numeric values + * @return this for chaining + */ + json_reader_options_builder& numeric_leading_zeros(bool val) + { + options.allow_numeric_leading_zeros(val); + return *this; + } + + /** + * @brief Set whether specific unquoted number values are valid JSON. The values are NaN, + * +INF, -INF, +Infinity, Infinity, and -Infinity. + * Strict validation must be enabled for this to have any effect. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. + * + * @param val Boolean value to indicate if unquoted nonnumeric values are valid json or not. + * @return this for chaining + */ + json_reader_options_builder& nonnumeric_numbers(bool val) + { + options.allow_nonnumeric_numbers(val); + return *this; + } + + /** + * @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without + * some form of escaping. Strict validation must be enabled for this to have any effect. + * + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. + * + * @param val Boolean value to indicate if unquoted control chars are allowed or not. + * @return this for chaining + */ + json_reader_options_builder& unquoted_control_chars(bool val) + { + options.allow_unquoted_control_chars(val); + return *this; + } + + /** + * @brief Sets additional values to recognize as null values. 
+ * + * @param vals Vector of values to be considered to be null + * @return this for chaining + */ + json_reader_options_builder& na_values(std::vector vals) + { + options.set_na_values(std::move(vals)); + return *this; + } + /** * @brief move json_reader_options member once it's built. */ diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 7899ea7bac4..97d5884fef1 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -16,6 +16,7 @@ #include "io/fst/lookup_tables.cuh" +#include #include #include #include @@ -302,6 +303,7 @@ void normalize_single_quotes(datasource::owning_buffer& inda rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_FUNC_RANGE(); static constexpr std::int32_t min_out = 0; static constexpr std::int32_t max_out = 2; auto parser = @@ -330,6 +332,7 @@ void normalize_whitespace(datasource::owning_buffer& indata, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_FUNC_RANGE(); static constexpr std::int32_t min_out = 0; static constexpr std::int32_t max_out = 2; auto parser = diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index b06458e1a8e..75639a0438f 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -225,6 +225,21 @@ std::pair, rmm::device_uvector> pr device_span token_indices, rmm::cuda_stream_view stream); +/** + * @brief Validate the tokens conforming to behavior given in options. 
+ * + * @param d_input The string of input characters + * @param tokens The tokens to be post-processed + * @param token_indices The tokens' corresponding indices that are post-processed + * @param options Parsing options specifying the parsing behaviour + * @param stream The cuda stream to dispatch GPU kernels to + */ +void validate_token_stream(device_span d_input, + device_span tokens, + device_span token_indices, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream); + /** * @brief Parses the given JSON string and generates a tree representation of the given input. * diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index d76e5447c30..4e513d3495c 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1660,6 +1660,7 @@ std::pair, rmm::device_uvector> ge if (delimiter_offset == 1) { tokens.set_element(0, token_t::LineEnd, stream); + validate_token_stream(json_in, tokens, tokens_indices, options, stream); auto [filtered_tokens, filtered_tokens_indices] = process_token_stream(tokens, tokens_indices, stream); tokens = std::move(filtered_tokens); @@ -2082,7 +2083,9 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt parse_opts.keepquotes = options.is_enabled_keep_quotes(); parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); - parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + std::vector na_values{"", "null"}; + na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end()); + parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream); return parse_opts; } diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu new file mode 100644 index 00000000000..83c7b663980 --- /dev/null +++ b/cpp/src/io/json/process_tokens.cu @@ -0,0 
+1,310 @@ + +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io/utilities/trie.cuh" +#include "nested_json.hpp" +#include "tabulate_output_iterator.cuh" + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::io::json { +namespace detail { + +struct write_if { + using token_t = cudf::io::json::token_t; + using scan_type = thrust::pair; + PdaTokenT* tokens; + size_t n; + // Index, value + __device__ void operator()(size_type i, scan_type x) + { + if (i == n - 1 or tokens[i + 1] == token_t::LineEnd) { + if (x.first == token_t::ErrorBegin and tokens[i] != token_t::ErrorBegin) { + tokens[i] = token_t::ErrorBegin; + } + } + } +}; + +enum class number_state { + START = 0, + SAW_NEG, // not a complete state + LEADING_ZERO, + WHOLE, + SAW_RADIX, // not a complete state + FRACTION, + START_EXPONENT, // not a complete state + AFTER_SIGN_EXPONENT, // not a complete state + EXPONENT +}; + +enum class string_state { + NORMAL = 0, + ESCAPED, // not a complete state + ESCAPED_U // not a complete state +}; + +__device__ inline bool substr_eq(const char* data, + SymbolOffsetT const start, + SymbolOffsetT const end, + SymbolOffsetT const expected_len, + const char* expected) +{ + if (end - start != expected_len) { return false; } + for (auto idx = 0; idx < expected_len; idx++) { + if (data[start + idx] != expected[idx]) { return false; } + } + return true; +} + +void 
validate_token_stream(device_span d_input, + device_span tokens, + device_span token_indices, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + if (!options.is_strict_validation()) { return; } + using token_t = cudf::io::json::token_t; + cudf::detail::optional_trie trie_na = + cudf::detail::create_serialized_trie(options.get_na_values(), stream); + auto trie_na_view = cudf::detail::make_trie_view(trie_na); + auto validate_values = cuda::proclaim_return_type( + [data = d_input.data(), + trie_na = trie_na_view, + allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), + allow_nonnumeric = + options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, + SymbolOffsetT end) -> bool { + // This validates an unquoted value. A value must match https://www.json.org/json-en.html + // but the leading and trailing whitespace should already have been removed, and is not + // a string + auto c = data[start]; + auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start}); + if (is_null_literal) { + return true; + } else if ('n' == c) { + return substr_eq(data, start, end, 4, "null"); + } else if ('t' == c) { + return substr_eq(data, start, end, 4, "true"); + } else if ('f' == c) { + return substr_eq(data, start, end, 5, "false"); + } else if (allow_nonnumeric && c == 'N') { + return substr_eq(data, start, end, 3, "NaN"); + } else if (allow_nonnumeric && c == 'I') { + return substr_eq(data, start, end, 8, "Infinity"); + } else if (allow_nonnumeric && c == '+') { + return substr_eq(data, start, end, 4, "+INF") || + substr_eq(data, start, end, 9, "+Infinity"); + } else if ('-' == c || (c >= '0' && c <= '9')) { + // number: starts with '-' or an ASCII digit; the state machine below checks the rest + auto num_state = number_state::START; + for (auto at = start; at < end; at++) { + c = data[at]; + switch (num_state) { + case number_state::START: + if ('-' == c) { + num_state = number_state::SAW_NEG; + } else if ('0' == c) { + num_state = 
number_state::LEADING_ZERO; + } else if (c >= '1' && c <= '9') { + num_state = number_state::WHOLE; + } else { + return false; + } + break; + case number_state::SAW_NEG: + if ('0' == c) { + num_state = number_state::LEADING_ZERO; + } else if (c >= '1' && c <= '9') { + num_state = number_state::WHOLE; + } else if (allow_nonnumeric && 'I' == c) { + return substr_eq(data, start, end, 4, "-INF") || + substr_eq(data, start, end, 9, "-Infinity"); + } else { + return false; + } + break; + case number_state::LEADING_ZERO: + if (allow_numeric_leading_zeros && c >= '0' && c <= '9') { + num_state = number_state::WHOLE; + } else if ('.' == c) { + num_state = number_state::SAW_RADIX; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::WHOLE: + if (c >= '0' && c <= '9') { + num_state = number_state::WHOLE; + } else if ('.' == c) { + num_state = number_state::SAW_RADIX; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::SAW_RADIX: + if (c >= '0' && c <= '9') { + num_state = number_state::FRACTION; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::FRACTION: + if (c >= '0' && c <= '9') { + num_state = number_state::FRACTION; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::START_EXPONENT: + if ('+' == c || '-' == c) { + num_state = number_state::AFTER_SIGN_EXPONENT; + } else if (c >= '0' && c <= '9') { + num_state = number_state::EXPONENT; + } else { + return false; + } + break; + case number_state::AFTER_SIGN_EXPONENT: + if (c >= '0' && c <= '9') { + num_state = number_state::EXPONENT; + } else { + return false; + } + break; + case number_state::EXPONENT: + if (c >= '0' && c <= '9') { + num_state = number_state::EXPONENT; 
+ } else { + return false; + } + break; + } + } + return num_state != number_state::AFTER_SIGN_EXPONENT && + num_state != number_state::START_EXPONENT && num_state != number_state::SAW_NEG && + num_state != number_state::SAW_RADIX; + } else { + return false; + } + }); + + auto validate_strings = cuda::proclaim_return_type( + [data = d_input.data(), + allow_unquoted_control_chars = + options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start, + SymbolOffsetT end) -> bool { + // This validates a quoted string. A string must match https://www.json.org/json-en.html + // but we already know that it has a starting and ending " and all white space has been + // stripped out. Also the base CUDF validation makes sure escaped chars are correct + // so we only need to worry about unquoted control chars + + auto state = string_state::NORMAL; + auto u_count = 0; + for (SymbolOffsetT idx = start + 1; idx < end; idx++) { + auto c = data[idx]; + if (!allow_unquoted_control_chars && static_cast(c) >= 0 && static_cast(c) < 32) { + return false; + } + + switch (state) { + case string_state::NORMAL: + if (c == '\\') { state = string_state::ESCAPED; } + break; + case string_state::ESCAPED: + // in Spark you can allow any char to be escaped, but CUDF + // validates it in some cases so we need to also validate it. 
+ if (c == 'u') { + state = string_state::ESCAPED_U; + u_count = 0; + } else if (c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || + c == 'r' || c == 't') { + state = string_state::NORMAL; + } else { + return false; + } + break; + case string_state::ESCAPED_U: + if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { + u_count++; + if (u_count == 4) { + state = string_state::NORMAL; + u_count = 0; + } + } else { + return false; + } + break; + } + } + return string_state::NORMAL == state; + }); + + auto num_tokens = tokens.size(); + auto count_it = thrust::make_counting_iterator(0); + auto predicate = [tokens = tokens.begin(), + token_indices = token_indices.begin(), + validate_values, + validate_strings] __device__(auto i) -> bool { + if (tokens[i] == token_t::ValueEnd) { + return !validate_values(token_indices[i - 1], token_indices[i]); + } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) { + return !validate_strings(token_indices[i - 1], token_indices[i]); + } + return false; + }; + + using scan_type = write_if::scan_type; + auto conditional_write = write_if{tokens.begin(), num_tokens}; + auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write); + auto transform_op = cuda::proclaim_return_type( + [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type { + if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; + return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; + }); + auto binary_op = cuda::proclaim_return_type( + [] __device__(scan_type prev, scan_type curr) -> scan_type { + auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first); + return scan_type((curr.second ? 
curr.first : op_result), prev.second | curr.second); + }); + + thrust::transform_inclusive_scan(rmm::exec_policy(stream), + count_it, + count_it + num_tokens, + conditional_output_it, + transform_op, + binary_op); // in-place scan +} +} // namespace detail +} // namespace cudf::io::json diff --git a/cpp/src/io/json/tabulate_output_iterator.cuh b/cpp/src/io/json/tabulate_output_iterator.cuh new file mode 100644 index 00000000000..7cf3655e259 --- /dev/null +++ b/cpp/src/io/json/tabulate_output_iterator.cuh @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cudf { +namespace detail { + +// Proxy reference that calls BinaryFunction with index value and the rhs of assignment operator +template +class tabulate_output_iterator_proxy { + public: + __host__ __device__ tabulate_output_iterator_proxy(const IndexT index, BinaryFunction fun) + : index(index), fun(fun) + { + } + template + __host__ __device__ tabulate_output_iterator_proxy operator=(const T& rhs_value) + { + fun(index, rhs_value); + return *this; + } + + private: + IndexT index; + BinaryFunction fun; +}; + +/** + * @brief Tabulate output iterator with custom binary function which takes index and value. 
+ * + * @code {.cpp} + * #include "tabulate_output_iterator.cuh" + * #include + * #include + * #include + * + * struct set_bits_field { + * int* bitfield; + * __device__ inline void set_bit(size_t bit_index) + * { + * atomicOr(&bitfield[bit_index/32], (int{1} << (bit_index % 32))); + * } + * __device__ inline void clear_bit(size_t bit_index) + * { + * atomicAnd(&bitfield[bit_index / 32], ~(int{1} << (bit_index % 32))); + * } + * // Index, value + * __device__ void operator()(size_t i, bool x) + * { + * if (x) + * set_bit(i); + * else + * clear_bit(i); + * } + * }; + * + * thrust::device_vector v(1, 0x00000000); + * auto result_begin = cudf::detail::make_tabulate_output_iterator(set_bits_field{v.data().get()}); + * auto value = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + * [] __device__ (int x) { return x%2; }); + * thrust::copy(thrust::device, value, value+32, result_begin); + * assert(v[0] == 0xaaaaaaaa); + * @endcode + * + * + * @tparam BinaryFunction Binary function to be called with the Iterator value and the rhs of + * assignment operator. + * @tparam Iterator iterator type that acts as index of the output. + */ +template +class tabulate_output_iterator + : public thrust::iterator_adaptor, + thrust::counting_iterator, + thrust::use_default, + thrust::use_default, + thrust::use_default, + tabulate_output_iterator_proxy> { + public: + // parent class. 
+ using super_t = thrust::iterator_adaptor, + thrust::counting_iterator, + thrust::use_default, + thrust::use_default, + thrust::use_default, + tabulate_output_iterator_proxy>; + // friend thrust::iterator_core_access to allow it access to the private interface dereference() + friend class thrust::iterator_core_access; + __host__ __device__ tabulate_output_iterator(BinaryFunction fun) : fun(fun) {} + + private: + BinaryFunction fun; + + // thrust::iterator_core_access accesses this function + __host__ __device__ typename super_t::reference dereference() const + { + return tabulate_output_iterator_proxy(*this->base(), fun); + } +}; + +template +tabulate_output_iterator __host__ __device__ +make_tabulate_output_iterator(BinaryFunction fun) +{ + return tabulate_output_iterator(fun); +} // end make_tabulate_output_iterator + +} // namespace detail +} // namespace cudf + +// Register tabulate_output_iterator_proxy with 'is_proxy_reference' from +// type_traits to enable its use with algorithms. 
+template +struct thrust::detail::is_proxy_reference< + cudf::detail::tabulate_output_iterator_proxy> + : public thrust::detail::true_type {}; diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index c26e5ca3edb..960c19fce2e 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2180,6 +2180,86 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringSync) cudf::set_pinned_memory_resource(last_mr); } +// Validation +TEST_F(JsonReaderTest, ValueValidation) +{ + // parsing error as null rows + std::string data = + // 0 -> a: -2 (valid) + R"({"a":-2 }{})" + "\n" + // 1 -> (invalid) + R"({"b":{}should_be_invalid})" + "\n" + // 2 -> b (valid) + R"({"b":{"a":3} })" + "\n" + // 3 -> c: (valid/null based on option) + R"({"a": 1, "c":nan, "d": "null" } )" + "\n" + "\n" + // 4 -> (valid/null based on option) + R"({"a":04, "c": 1.23, "d": "abc"} 123)" + "\n" + // 5 -> (valid) + R"({"a":5}//Comment after record)" + "\n" + // 6 -> ((valid/null based on option) + R"({"a":06} //Comment after whitespace)" + "\n" + // 7 -> (invalid) + R"({"a":5 //Invalid Comment within record})"; + + // leadingZeros allowed + // na_values, + { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .strict_validation(true); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 4); + EXPECT_EQ(result.tbl->num_rows(), 8); + auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); + auto a_column = int64_wrapper{{-2, 0, 0, 0, 4, 5, 6, 0}, + {true, false, false, false, true, true, true, false}}; + auto b_column = cudf::test::structs_column_wrapper( + {b_a_col}, {false, false, true, false, false, false, false, false}); + auto c_column = float64_wrapper({0.0, 0.0, 0.0, 0.0, 1.23, 0.0, 0.0, 0.0}, + {false, false, false, false, true, 
false, false, false}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), a_column); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), b_column); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), c_column); + } + // leadingZeros not allowed, NaN allowed + { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .strict_validation(true) + .numeric_leading_zeros(false) + .na_values({"nan"}); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 4); + EXPECT_EQ(result.tbl->num_rows(), 8); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::INT8); // empty column + auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); + auto a_column = int64_wrapper{{-2, 0, 0, 1, 4, 5, 6, 0}, + {true, false, false, true, false, true, false, false}}; + auto b_column = cudf::test::structs_column_wrapper( + {b_a_col}, {false, false, true, false, false, false, false, false}); + auto c_column = int8_wrapper({0, 0, 0, 0, 0, 0, 0, 0}, + {false, false, false, false, false, false, false, false}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), a_column); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), b_column); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), c_column); + } +} + TEST_F(JsonReaderTest, MixedTypes) { using LCWS = cudf::test::lists_column_wrapper; diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index b37d0d88ec9..c8308ca17ec 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -34,6 +34,10 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean normalizeWhitespace; private final boolean mixedTypesAsStrings; 
private final boolean keepStringQuotes; + private final boolean strictValidation; + private final boolean allowLeadingZeros; + private final boolean allowNonNumericNumbers; + private final boolean allowUnquotedControlChars; private JSONOptions(Builder builder) { super(builder); @@ -44,6 +48,10 @@ private JSONOptions(Builder builder) { normalizeWhitespace = builder.normalizeWhitespace; mixedTypesAsStrings = builder.mixedTypesAsStrings; keepStringQuotes = builder.keepQuotes; + strictValidation = builder.strictValidation; + allowLeadingZeros = builder.allowLeadingZeros; + allowNonNumericNumbers = builder.allowNonNumericNumbers; + allowUnquotedControlChars = builder.allowUnquotedControlChars; } public boolean isDayFirst() { @@ -75,6 +83,22 @@ public boolean keepStringQuotes() { return keepStringQuotes; } + public boolean strictValidation() { + return strictValidation; + } + + public boolean leadingZerosAllowed() { + return allowLeadingZeros; + } + + public boolean nonNumericNumbersAllowed() { + return allowNonNumericNumbers; + } + + public boolean unquotedControlChars() { + return allowUnquotedControlChars; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -85,6 +109,10 @@ public static Builder builder() { } public static final class Builder extends ColumnFilterOptions.Builder { + private boolean strictValidation = false; + private boolean allowUnquotedControlChars = true; + private boolean allowNonNumericNumbers = false; + private boolean allowLeadingZeros = false; private boolean dayFirst = false; private boolean lines = true; @@ -95,10 +123,45 @@ public static final class Builder extends ColumnFilterOptions.Builder(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) .keep_quotes(keep_quotes); - + if (strict_validation) { + 
opts.numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control); + } auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1652,17 +1661,22 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, - jclass, - jlong buffer, - jlong buffer_length, - jboolean day_first, - jboolean lines, - jboolean recover_with_null, - jboolean normalize_single_quotes, - jboolean normalize_whitespace, - jboolean mixed_types_as_string, - jboolean keep_quotes) +JNIEXPORT jlong JNICALL +Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, + jclass, + jlong buffer, + jlong buffer_length, + jboolean day_first, + jboolean lines, + jboolean recover_with_null, + jboolean normalize_single_quotes, + jboolean normalize_whitespace, + jboolean mixed_types_as_string, + jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1684,8 +1698,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) + .strict_validation(strict_validation) .mixed_types_as_string(mixed_types_as_string) .keep_quotes(keep_quotes); + if (strict_validation) { + opts.numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control); + } auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1790,6 +1810,10 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean normalize_whitespace, jboolean mixed_types_as_string, jboolean keep_quotes, + 
jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1824,7 +1848,13 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) .keep_quotes(keep_quotes); + if (strict_validation) { + opts.numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control); + } if (!n_types.is_null()) { if (n_types.size() != n_scales.size()) { @@ -1874,7 +1904,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string, - jboolean keep_quotes) + jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control) { bool read_buffer = true; if (buffer == 0) { @@ -1923,7 +1957,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) .keep_quotes(keep_quotes); + if (strict_validation) { + opts.numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control); + } if (!n_types.is_null()) { if (n_types.size() != n_scales.size()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 050bcbb268f..56fe63598d9 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ 
b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -437,6 +437,7 @@ void testReadWhitespacesJSONFile() throws IOException { } } + @Test void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { Schema schema = Schema.builder() .column(DType.STRING, "A") @@ -455,6 +456,206 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { } } + private static final byte[] JSON_VALIDATION_BUFFER = ( + "{\"a\":true}\n" + + "{\"a\":false}\n" + + "{\"a\":null}\n" + + "{\"a\":true, \"b\":truee}\n" + + "{\"a\":true, \"b\":\"nulll\"}\n" + + "{\"a\": 1}\n" + + "{\"a\": 0}\n" + + "{\"a\": -}\n" + + "{\"a\": -0}\n" + + "{\"a\": -01}\n" + + + "{\"a\": 01}\n" + + "{\"a\": -0.1}\n" + + "{\"a\": -00.1}\n" + + "{\"a\": NaN}\n" + + "{\"a\": INF}\n" + + "{\"a\": +INF}\n" + + "{\"a\": -INF}\n" + + "{\"a\": +Infinity}\n" + + "{\"a\": Infinity}\n" + + "{\"a\": -Infinity}\n" + + + "{\"a\": INFinity}\n" + + "{\"a\":\"3710-11-10T02:46:58.732Z\"}\n" + + "{\"a\":12.}\n" + + "{\"a\": -3.4e+38}\n" + + "{\"a\": -3.4e-38}\n" + + "{\"a\": 1.4e38}\n" + + "{\"a\": -3.4E+38}\n" + + "{\"a\": -3.4E-38}\n" + + "{\"a\": 1.4E38}\n" + + "{\"a\": -3.4E+}\n" + + + "{\"a\": -3.4E-}\n" + + "{\"a\": \"A\u0000B\"}\n" + + "{\"a\": \"A\\u0000B\"}\n" + + "{\"a\": \"A\u0001B\"}\n" + + "{\"a\": \"A\\u0001B\"}\n" + + "{\"a\": \"A\u001FB\"}\n" + + "{\"a\": \"A\\u001FB\"}\n" + + "{\"a\": \"A\u0020B\"}\n" + + "{\"a\": \"A\\u0020B\"}\n" + + "{\"a\": \"\\u12\"}\n" + + + "{\"a\": \"\\z\"}\n" + + "{\"a\": \"\\r\"}\n" + + "{\"a\": \"something\", \"b\": \"\\z\"}\n" + ).getBytes(StandardCharsets.UTF_8); + + @Test + void testJSONValidationNoStrict() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(false) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + 
.withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, "true", "true", "1", "0", "-", "-0", "-01", + "01", "-0.1", "-00.1", "NaN", "INF", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", + "INFinity", "\"3710-11-10T02:46:58.732Z\"", "12.", "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", "-3.4E+", + "-3.4E-", "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", "\"something\"") + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidation() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, null, null, null, null, null, null, null, + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationLeadingZeros() { + Schema schema = Schema.builder() + 
.column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(true) + .withNonNumericNumbers(false) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", "-01", + "01", "-0.1", "-00.1", null, null, null, null, null, null, null, + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationNonNumeric() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(false) + .withNonNumericNumbers(true) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, "NaN", null, "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", 
null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationUnquotedControl() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + .withUnquotedControlChars(false) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, null, null, null, null, null, null, null, + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, null, "\"A\u0000B\"", null, "\"A\u0001B\"", null, "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" + "{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" + "{\"d\":[1,2,3]}\n" +