From f20491d3366808e5c62dcee2160fc8a9d5e50fa7 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 30 Sep 2024 12:14:32 -0400 Subject: [PATCH] Parse newline as whitespace character while tokenizing JSONL inputs with non-newline delimiter (#16950) Backporting PR #16923: : Parse newline as whitespace character while tokenizing JSONL inputs Addresses #16915 --- cpp/src/io/json/nested_json_gpu.cu | 8 +- cpp/tests/io/json/json_test.cpp | 24 ++++ cpp/tests/io/json/nested_json_test.cpp | 178 +++++++++++++++++++++++++ 3 files changed, 207 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 1c15e147b13..76816071d8c 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -618,12 +618,14 @@ struct PdaSymbolToSymbolGroupId { constexpr auto pda_sgid_lookup_size = static_cast(sizeof(tos_sg_to_pda_sgid) / sizeof(tos_sg_to_pda_sgid[0])); // We map the delimiter character to LINE_BREAK symbol group id, and the newline character - // to OTHER. Note that delimiter cannot be any of opening(closing) brace, bracket, quote, + // to WHITE_SPACE. Note that delimiter cannot be any of opening(closing) brace, bracket, quote, // escape, comma, colon or whitespace characters. + auto constexpr newline = '\n'; + auto constexpr whitespace = ' '; auto const symbol_position = symbol == delimiter - ? static_cast('\n') - : (symbol == '\n' ? static_cast(delimiter) : static_cast(symbol)); + ? static_cast(newline) + : (symbol == newline ? static_cast(whitespace) : static_cast(symbol)); PdaSymbolGroupIdT symbol_gid = tos_sg_to_pda_sgid[min(symbol_position, pda_sgid_lookup_size - 1)]; return stack_idx * static_cast(symbol_group_id::NUM_PDA_INPUT_SGS) + diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 68ec255b39d..a094ac7d772 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2575,6 +2575,30 @@ TEST_F(JsonReaderTest, ViableDelimiter) EXPECT_THROW(json_parser_options.set_delimiter('\t'), std::invalid_argument); } +TEST_F(JsonReaderTest, ViableDelimiterNewlineWS) +{ + // Test input + std::string input = R"({"a": + 100})"; + + cudf::io::json_reader_options json_parser_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{input.c_str(), input.size()}) + .lines(true) + .delimiter('\0'); + + auto result = cudf::io::read_json(json_parser_options); + EXPECT_EQ(result.tbl->num_columns(), 1); + EXPECT_EQ(result.tbl->num_rows(), 1); + + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); + + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + + auto col1_iterator = thrust::constant_iterator(100); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), + int64_wrapper(col1_iterator, col1_iterator + 1)); +} + // Test case for dtype prune: // all paths, only one. // one present, another not present, nothing present diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp index 327169ae563..f32aba0e632 100644 --- a/cpp/tests/io/json/nested_json_test.cpp +++ b/cpp/tests/io/json/nested_json_test.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -1196,4 +1197,181 @@ TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAndDelimiter) } } +TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAsWSAndDelimiter) +{ + // Test input. Inline comments used to indicate character indexes + // 012345678 <= line 0 + char const delimiter = GetParam(); + + /* Input: (Note that \n is considered whitespace according to the JSON spec when it is not used as + * a delimiter for JSONL) + * {"a":2} + * {"a":{"a":{"a":[321{"a":[1]} + * + * {"b":123} + * {"b":123} + * {"b"\n:\n\n\n123\n} + */ + std::string input = R"({"a":2})" + "\n"; + // starting position 8 (zero indexed) + input += R"({"a":)" + std::string(1, delimiter); + // starting position 14 (zero indexed) + input += R"({"a":{"a":[321)" + std::string(1, delimiter); + // starting position 29 (zero indexed) + input += R"({"a":[1]})" + std::string("\n\n") + std::string(1, delimiter); + // starting position 41 (zero indexed) + input += R"({"b":123})" + "\n"; + // starting position 51 (zero indexed) + input += R"({"b":123})" + std::string(1, delimiter); + // starting position 61 (zero indexed) + input += R"({"b")" + std::string("\n:\n\n\n123\n}"); + + // Golden token stream sample + using token_t = cuio_json::token_t; + std::vector> golden_token_stream; + if (delimiter != '\n') { + golden_token_stream = {// Line 0 (valid) + {0, token_t::StructBegin}, + {1, token_t::StructMemberBegin}, + {1, token_t::FieldNameBegin}, + {3, token_t::FieldNameEnd}, + {5, token_t::ValueBegin}, + {6, token_t::ValueEnd}, + {6, token_t::StructMemberEnd}, + {6, token_t::StructEnd}, + // Line 1 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 2 (valid) + {29, token_t::StructBegin}, + {30, token_t::StructMemberBegin}, + {30, token_t::FieldNameBegin}, + {32, token_t::FieldNameEnd}, + {34, token_t::ListBegin}, + {35, token_t::ValueBegin}, + {36, token_t::ValueEnd}, + {36, token_t::ListEnd}, + {37, token_t::StructMemberEnd}, + {37, token_t::StructEnd}, + // Line 3 (valid) + {41, token_t::StructBegin}, + {42, token_t::StructMemberBegin}, + {42, token_t::FieldNameBegin}, + {44, token_t::FieldNameEnd}, + {46, token_t::ValueBegin}, + {49, token_t::ValueEnd}, + {49, token_t::StructMemberEnd}, + {49, token_t::StructEnd}, + // Line 4 (valid) + {61, token_t::StructBegin}, + {62, token_t::StructMemberBegin}, + {62, token_t::FieldNameBegin}, + {64, token_t::FieldNameEnd}, + {70, token_t::ValueBegin}, + {73, token_t::ValueEnd}, + {74, token_t::StructMemberEnd}, + {74, token_t::StructEnd}}; + } else { + /* Input: + * {"a":2} + * {"a": + * {"a":{"a":[321 + * {"a":[1]} + * + * + * {"b":123} + * {"b":123} + * {"b"\n:\n\n\n123\n} + */ + golden_token_stream = {// Line 0 (valid) + {0, token_t::StructBegin}, + {1, token_t::StructMemberBegin}, + {1, token_t::FieldNameBegin}, + {3, token_t::FieldNameEnd}, + {5, token_t::ValueBegin}, + {6, token_t::ValueEnd}, + {6, token_t::StructMemberEnd}, + {6, token_t::StructEnd}, + // Line 1 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 2 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 3 (valid) + {29, token_t::StructBegin}, + {30, token_t::StructMemberBegin}, + {30, token_t::FieldNameBegin}, + {32, token_t::FieldNameEnd}, + {34, token_t::ListBegin}, + {35, token_t::ValueBegin}, + {36, token_t::ValueEnd}, + {36, token_t::ListEnd}, + {37, token_t::StructMemberEnd}, + {37, token_t::StructEnd}, + // Line 4 (valid) + {41, token_t::StructBegin}, + {42, token_t::StructMemberBegin}, + {42, token_t::FieldNameBegin}, + {44, token_t::FieldNameEnd}, + {46, token_t::ValueBegin}, + {49, token_t::ValueEnd}, + {49, token_t::StructMemberEnd}, + {49, token_t::StructEnd}, + // Line 5 (valid) + {51, token_t::StructBegin}, + {52, token_t::StructMemberBegin}, + {52, token_t::FieldNameBegin}, + {54, token_t::FieldNameEnd}, + {56, token_t::ValueBegin}, + {59, token_t::ValueEnd}, + {59, token_t::StructMemberEnd}, + {59, token_t::StructEnd}, + // Line 6 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + {0, token_t::StructBegin}, + {0, token_t::StructEnd}}; + } + + auto const stream = cudf::get_default_stream(); + + // Prepare input & output buffers + cudf::string_scalar const d_scalar(input, true, stream); + auto const d_input = cudf::device_span{ + d_scalar.data(), static_cast(d_scalar.size())}; + + // Default parsing options + cudf::io::json_reader_options const in_opts = + cudf::io::json_reader_options::builder(cudf::io::source_info{}) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .delimiter(delimiter) + .lines(true); + + // Parse the JSON and get the token stream + auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( + d_input, in_opts, stream, cudf::get_current_device_resource_ref()); + // Copy back the number of tokens that were written + auto const tokens_gpu = cudf::detail::make_std_vector_async(d_tokens_gpu, stream); + auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream); + + stream.synchronize(); + // Verify the number of tokens matches + ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size()); + ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size()); + + for (std::size_t i = 0; i < tokens_gpu.size(); i++) { + // Ensure the index the tokens are pointing to do match + EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i; + // Ensure the token category is correct + EXPECT_EQ(golden_token_stream[i].second, tokens_gpu[i]) << "Mismatch at #" << i; + } +} + CUDF_TEST_PROGRAM_MAIN()