Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JSON reader validation of values #15968

Merged
Merged
Show file tree
Hide file tree
Changes from 40 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
bb991ef
validation of tokens code
karthikeyann Jun 11, 2024
4e707cb
fix pre-commit check failures
karthikeyann Jun 18, 2024
35a8268
Merge branch 'branch-24.08' into fea-json_spark_validation
karthikeyann Jun 18, 2024
cd6a30f
Merge branch 'branch-24.08' into fea-json_spark_validation
karthikeyann Jun 27, 2024
0c2e4da
Add Spark Compatible JSON validation (#10)
revans2 Aug 2, 2024
6a38578
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into fea-json…
karthikeyann Aug 2, 2024
0d6cb12
Merge branch 'branch-24.10' of github.com:rapidsai/cudf into fea-json…
karthikeyann Aug 2, 2024
dfa6b18
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Aug 9, 2024
e944937
style fixes
karthikeyann Aug 9, 2024
23072c0
Update json normalization to take device_buffer
karthikeyann Aug 9, 2024
a885340
fix char comparison error
karthikeyann Aug 9, 2024
3867c61
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Aug 9, 2024
ab1385d
update char comparison
karthikeyann Aug 15, 2024
80c7c3a
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Aug 26, 2024
f2e2b44
rename to tabulate_output_iterator.cuh
karthikeyann Aug 26, 2024
0963218
absorb counting_iterator to tabulate_output_iterator
karthikeyann Aug 26, 2024
be7402c
update documentation
karthikeyann Aug 26, 2024
b114401
add na_values to validation
karthikeyann Aug 26, 2024
a1e9afc
add strict validation to test
karthikeyann Aug 26, 2024
ec78ef9
rename tabulate_output_iterator namespace
karthikeyann Aug 26, 2024
a225ce0
remove comments and notes
karthikeyann Aug 26, 2024
7a2a451
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Aug 26, 2024
875a72b
fix unsigned/signed issue with ARM systems
karthikeyann Sep 3, 2024
ef6f298
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Sep 3, 2024
be7f17e
remove comments
karthikeyann Sep 3, 2024
fb62877
fix condition
karthikeyann Sep 4, 2024
e4f7d04
fix char issue with typecast
karthikeyann Sep 5, 2024
851fe3e
Update cpp/include/cudf/io/json.hpp
karthikeyann Sep 5, 2024
35e4b89
Update cpp/include/cudf/io/json.hpp
karthikeyann Sep 5, 2024
3681823
address review comments
karthikeyann Sep 5, 2024
1d897f7
fix doc
karthikeyann Sep 5, 2024
e1435ce
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Sep 5, 2024
6bf4d3f
address review comments
karthikeyann Sep 5, 2024
e9ebb91
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Sep 6, 2024
e093d64
address review comments
karthikeyann Sep 9, 2024
00ef690
rename lambda name
karthikeyann Sep 9, 2024
86bbeab
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Sep 9, 2024
cecb42f
Apply suggestions from code review
karthikeyann Sep 10, 2024
9cd3098
Apply suggestions from code review
karthikeyann Sep 10, 2024
c816c73
update docs
karthikeyann Sep 10, 2024
53db703
Update cpp/include/cudf/io/json.hpp
ttnghia Sep 10, 2024
c3832b6
Update cpp/include/cudf/io/json.hpp
ttnghia Sep 10, 2024
070263e
Update cpp/include/cudf/io/json.hpp
ttnghia Sep 10, 2024
fb0e85f
fix strict_validation dependent options with if
karthikeyann Sep 10, 2024
e7fce07
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Sep 10, 2024
252c38b
fix typo
karthikeyann Sep 10, 2024
5ab337b
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Sep 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,7 @@ add_library(
src/io/json/nested_json_gpu.cu
src/io/json/read_json.cu
src/io/json/parser_features.cpp
src/io/json/process_tokens.cu
src/io/json/write_json.cu
src/io/orc/aggregate_orc_metadata.cpp
src/io/orc/dict_enc.cu
Expand Down
190 changes: 190 additions & 0 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>
Expand Down Expand Up @@ -130,6 +131,19 @@ class json_reader_options {
// Whether to recover after an invalid JSON line
json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;

// Validation checks for Spark
// Whether the JSON validation should be strict or not.
// Note: strict validation enforces the JSON specification https://www.json.org/json-en.html
// The `_allow_*` flags below are enforced only when strict validation is enabled.
bool _strict_validation = false;
// Allow leading zeros for numeric values.
bool _allow_numeric_leading_zeros = true;
// Allow non-numeric numbers: NaN, +INF, -INF, +Infinity, Infinity, -Infinity
bool _allow_nonnumeric_numbers = true;
// Allow control characters (>= 0 and < 32) in a quoted string without some form of escaping
bool _allow_unquoted_control_chars = true;
// Additional values to recognize as null values
std::vector<std::string> _na_values;

/**
* @brief Constructor from source info.
*
Expand Down Expand Up @@ -300,6 +314,55 @@ class json_reader_options {
*/
[[nodiscard]] json_recovery_mode_t recovery_mode() const { return _recovery_mode; }

/**
 * @brief Reports whether strict JSON validation is enabled.
 *
 * @return true if strict validation is enabled, false otherwise
 */
[[nodiscard]] bool is_strict_validation() const
{
  return _strict_validation;
}

/**
 * @brief Reports whether numeric values are allowed to have leading zeros.
 *
 * @note This setting is enforced only if strict validation is enabled.
 *
 * @return true if leading zeros are allowed in numeric values
 */
[[nodiscard]] bool is_allowed_numeric_leading_zeros() const
{
  return _allow_numeric_leading_zeros;
}

/**
 * @brief Reports whether the unquoted non-numeric number values NaN, +INF, -INF, +Infinity,
 * Infinity, and -Infinity are allowed.
 *
 * @note This setting is enforced only if strict validation is enabled.
 *
 * @return true if these unquoted non-numeric number values are allowed
 */
[[nodiscard]] bool is_allowed_nonnumeric_numbers() const
{
  return _allow_nonnumeric_numbers;
}

/**
 * @brief Reports whether characters greater than or equal to 0 and less than 32 are allowed
 * inside a quoted string without some form of escaping.
 *
 * @note This setting is enforced only if strict validation is enabled.
 *
 * @return true if unquoted control chars are allowed
 */
[[nodiscard]] bool is_allowed_unquoted_control_chars() const
{
  return _allow_unquoted_control_chars;
}

/**
 * @brief Gives access to the additional values that are recognized as null values.
 *
 * @return Const reference to the additional values to recognize as null values
 */
[[nodiscard]] std::vector<std::string> const& get_na_values() const
{
  return _na_values;
}

/**
* @brief Set data types for columns to be read.
*
Expand Down Expand Up @@ -429,6 +492,63 @@ class json_reader_options {
* @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines.
*/
void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; }

/**
 * @brief Enables or disables strict validation.
 *
 * @param val Boolean value to indicate whether strict validation is enabled
 */
void set_strict_validation(bool val)
{
  _strict_validation = val;
}

/**
 * @brief Sets whether leading zeros are allowed in numeric values.
 *
 * Strict validation must already be enabled when this is called.
 *
 * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
 *
 * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
 */
void allow_numeric_leading_zeros(bool val)
{
  // The precondition is checked first so the flag is left untouched when it fails.
  CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work.");
  _allow_numeric_leading_zeros = val;
}

/**
 * @brief Sets whether the unquoted non-numeric number values NaN, +INF, -INF, +Infinity,
 * Infinity, and -Infinity are allowed.
 *
 * Strict validation must already be enabled when this is called.
 *
 * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
 *
 * @param val Boolean value to indicate whether unquoted non-numeric number values are allowed
 */
void allow_nonnumeric_numbers(bool val)
{
  // The precondition is checked first so the flag is left untouched when it fails.
  CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work.");
  _allow_nonnumeric_numbers = val;
}

/**
 * @brief Sets whether characters greater than or equal to 0 and less than 32 are allowed
 * inside a quoted string without some form of escaping.
 *
 * Strict validation must already be enabled when this is called.
 *
 * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
 *
 * @param val true to indicate that unquoted control chars are allowed
 */
void allow_unquoted_control_chars(bool val)
{
  // The precondition is checked first so the flag is left untouched when it fails.
  CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work.");
  _allow_unquoted_control_chars = val;
}

/**
 * @brief Replaces the additional values that are recognized as null values.
 *
 * @param vals Vector of values to be considered to be null
 */
void set_na_values(std::vector<std::string> vals)
{
  // `vals` is taken by value, so it can be moved into the member.
  _na_values = std::move(vals);
}
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
};

/**
Expand Down Expand Up @@ -640,6 +760,76 @@ class json_reader_options_builder {
return *this;
}

/**
 * @brief Sets whether JSON validation should be strict or not.
 *
 * @param val Boolean value to indicate whether JSON validation should be strict or not
 * @return this for chaining
 */
json_reader_options_builder& strict_validation(bool val)
{
  options.set_strict_validation(val);
  return *this;
}

/**
 * @brief Sets whether leading zeros are allowed in numeric values.
 *
 * Strict validation must already be enabled on the options being built.
 *
 * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
 *
 * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
 * @return this for chaining
 */
json_reader_options_builder& numeric_leading_zeros(bool val)
{
  options.allow_numeric_leading_zeros(val);
  return *this;
}

/**
 * @brief Sets whether specific unquoted number values are valid JSON. The values are NaN,
 * +INF, -INF, +Infinity, Infinity, and -Infinity.
 *
 * Strict validation must already be enabled on the options being built.
 *
 * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
 *
 * @param val Boolean value to indicate if unquoted nonnumeric values are valid JSON or not
 * @return this for chaining
 */
json_reader_options_builder& nonnumeric_numbers(bool val)
{
  options.allow_nonnumeric_numbers(val);
  return *this;
}

/**
 * @brief Sets whether chars >= 0 and < 32 are allowed in a quoted string without
 * some form of escaping.
 *
 * Strict validation must already be enabled on the options being built.
 *
 * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
 *
 * @param val Boolean value to indicate if unquoted control chars are allowed or not
 * @return this for chaining
 */
json_reader_options_builder& unquoted_control_chars(bool val)
{
  options.allow_unquoted_control_chars(val);
  return *this;
}

/**
 * @brief Sets the additional values to recognize as null values.
 *
 * @param vals Vector of values to be considered to be null
 * @return this for chaining
 */
json_reader_options_builder& na_values(std::vector<std::string> vals)
{
  // `vals` is taken by value, so it can be moved on to the options' setter.
  options.set_na_values(std::move(vals));
  return *this;
}

/**
* @brief move json_reader_options member once it's built.
*/
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/io/json/json_normalization.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include "io/fst/lookup_tables.cuh"

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/io/detail/json.hpp>
#include <cudf/types.hpp>

Expand Down Expand Up @@ -302,6 +303,7 @@ void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& inda
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
static constexpr std::int32_t min_out = 0;
static constexpr std::int32_t max_out = 2;
auto parser =
Expand Down Expand Up @@ -330,6 +332,7 @@ void normalize_whitespace(datasource::owning_buffer<rmm::device_buffer>& indata,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
static constexpr std::int32_t min_out = 0;
static constexpr std::int32_t max_out = 2;
auto parser =
Expand Down
15 changes: 15 additions & 0 deletions cpp/src/io/json/nested_json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,21 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> pr
device_span<SymbolOffsetT const> token_indices,
rmm::cuda_stream_view stream);

/**
* @brief Validate the tokens conforming to behavior given in options.
*
* @param d_input The string of input characters
* @param tokens The tokens to be validated (passed as a mutable span)
* @param token_indices The tokens' corresponding indices (passed as a mutable span)
* @param options Reader options specifying the validation behaviour
* @param stream The cuda stream to dispatch GPU kernels to
*/
void validate_token_stream(device_span<char const> d_input,
device_span<PdaTokenT> tokens,
device_span<SymbolOffsetT> token_indices,
cudf::io::json_reader_options const& options,
rmm::cuda_stream_view stream);

/**
* @brief Parses the given JSON string and generates a tree representation of the given input.
*
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1660,6 +1660,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge

if (delimiter_offset == 1) {
tokens.set_element(0, token_t::LineEnd, stream);
validate_token_stream(json_in, tokens, tokens_indices, options, stream);
auto [filtered_tokens, filtered_tokens_indices] =
process_token_stream(tokens, tokens_indices, stream);
tokens = std::move(filtered_tokens);
Expand Down Expand Up @@ -2082,7 +2083,9 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt
parse_opts.keepquotes = options.is_enabled_keep_quotes();
parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream);
parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream);
parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream);
std::vector<std::string> na_values{"", "null"};
na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end());
parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream);
return parse_opts;
}

Expand Down
Loading
Loading