From 535d1f7f8c7d64ef6d09aaa9285580792d202283 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Wed, 25 Sep 2024 23:47:31 +0000 Subject: [PATCH 1/6] bug fix for newline as ws delimiter --- cpp/src/io/json/nested_json_gpu.cu | 4 +- cpp/tests/io/json/json_test.cpp | 14 ++ cpp/tests/io/json/nested_json_test.cpp | 178 +++++++++++++++++++++++++ 3 files changed, 194 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 1c15e147b13..bf81162a0ac 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -618,12 +618,12 @@ struct PdaSymbolToSymbolGroupId { constexpr auto pda_sgid_lookup_size = static_cast(sizeof(tos_sg_to_pda_sgid) / sizeof(tos_sg_to_pda_sgid[0])); // We map the delimiter character to LINE_BREAK symbol group id, and the newline character - // to OTHER. Note that delimiter cannot be any of opening(closing) brace, bracket, quote, + // to WHITE_SPACE. Note that delimiter cannot be any of opening(closing) brace, bracket, quote, // escape, comma, colon or whitespace characters. auto const symbol_position = symbol == delimiter ? static_cast('\n') - : (symbol == '\n' ? static_cast(delimiter) : static_cast(symbol)); + : (symbol == '\n' ? static_cast(' ') : static_cast(symbol)); PdaSymbolGroupIdT symbol_gid = tos_sg_to_pda_sgid[min(symbol_position, pda_sgid_lookup_size - 1)]; return stack_idx * static_cast(symbol_group_id::NUM_PDA_INPUT_SGS) + diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 48bc982d0e3..d75a1ea4ee5 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2575,6 +2575,20 @@ TEST_F(JsonReaderTest, ViableDelimiter) EXPECT_THROW(json_parser_options.set_delimiter('\t'), std::invalid_argument); } +TEST_F(JsonReaderTest, ViableDelimiterNewlineWS) +{ + // Test input + std::string input = R"({"a": + 100})"; + + cudf::io::json_reader_options json_parser_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{input.c_str(), input.size()}) + .lines(true); + + json_parser_options.set_delimiter('\0'); + CUDF_EXPECT_NO_THROW(cudf::io::read_json(json_parser_options)); +} + // Test case for dtype prune: // all paths, only one. // one present, another not present, nothing present diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp index 327169ae563..90882b703e2 100644 --- a/cpp/tests/io/json/nested_json_test.cpp +++ b/cpp/tests/io/json/nested_json_test.cpp @@ -1196,4 +1196,182 @@ TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAndDelimiter) } } +TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAsWSAndDelimiter) +{ + // Test input. Inline comments used to indicate character indexes + // 012345678 <= line 0 + char const delimiter = GetParam(); + + /* Input: (Note that \n is considered whitespace according to the JSON spec when it is not used as a delimiter for JSONL) + * {"a":2} + * {"a":{"a":{"a":[321{"a":[1]} + * + * {"b":123} + * {"b":123} + * {"b"\n:\n\n\n123\n} + */ + std::string input = R"({"a":2})" + "\n"; + // starting position 8 (zero indexed) + input += R"({"a":)" + std::string(1, delimiter); + // starting position 14 (zero indexed) + input += R"({"a":{"a":[321)" + std::string(1, delimiter); + // starting position 29 (zero indexed) + input += R"({"a":[1]})" + std::string("\n\n") + std::string(1, delimiter); + // starting position 41 (zero indexed) + input += R"({"b":123})" + "\n"; + // starting position 51 (zero indexed) + input += R"({"b":123})" + std::string(1, delimiter); + // starting position 61 (zero indexed) + input += R"({"b")" + std::string("\n:\n\n\n123\n}"); + + // Golden token stream sample + using token_t = cuio_json::token_t; + std::vector> golden_token_stream; + if (delimiter != '\n') { + golden_token_stream.resize(36); + golden_token_stream = {// Line 0 (valid) + {0, token_t::StructBegin}, + {1, token_t::StructMemberBegin}, + {1, token_t::FieldNameBegin}, + {3, token_t::FieldNameEnd}, + {5, token_t::ValueBegin}, + {6, token_t::ValueEnd}, + {6, token_t::StructMemberEnd}, + {6, token_t::StructEnd}, + // Line 1 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 2 (valid) + {29, token_t::StructBegin}, + {30, token_t::StructMemberBegin}, + {30, token_t::FieldNameBegin}, + {32, token_t::FieldNameEnd}, + {34, token_t::ListBegin}, + {35, token_t::ValueBegin}, + {36, token_t::ValueEnd}, + {36, token_t::ListEnd}, + {37, token_t::StructMemberEnd}, + {37, token_t::StructEnd}, + // Line 3 (valid) + {41, token_t::StructBegin}, + {42, token_t::StructMemberBegin}, + {42, token_t::FieldNameBegin}, + {44, token_t::FieldNameEnd}, + {46, token_t::ValueBegin}, + {49, token_t::ValueEnd}, + {49, token_t::StructMemberEnd}, + {49, token_t::StructEnd}, + // Line 4 (valid) + {61, token_t::StructBegin}, + {62, token_t::StructMemberBegin}, + {62, token_t::FieldNameBegin}, + {64, token_t::FieldNameEnd}, + {70, token_t::ValueBegin}, + {73, token_t::ValueEnd}, + {74, token_t::StructMemberEnd}, + {74, token_t::StructEnd}}; + } else { + /* Input: + * {"a":2} + * {"a": + * {"a":{"a":[321 + * {"a":[1]} + * + * + * {"b":123} + * {"b":123} + * {"b"\n:\n\n\n123\n} + */ + golden_token_stream.resize(40); + golden_token_stream = {// Line 0 (valid) + {0, token_t::StructBegin}, + {1, token_t::StructMemberBegin}, + {1, token_t::FieldNameBegin}, + {3, token_t::FieldNameEnd}, + {5, token_t::ValueBegin}, + {6, token_t::ValueEnd}, + {6, token_t::StructMemberEnd}, + {6, token_t::StructEnd}, + // Line 1 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 2 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 3 (valid) + {29, token_t::StructBegin}, + {30, token_t::StructMemberBegin}, + {30, token_t::FieldNameBegin}, + {32, token_t::FieldNameEnd}, + {34, token_t::ListBegin}, + {35, token_t::ValueBegin}, + {36, token_t::ValueEnd}, + {36, token_t::ListEnd}, + {37, token_t::StructMemberEnd}, + {37, token_t::StructEnd}, + // Line 4 (valid) + {41, token_t::StructBegin}, + {42, token_t::StructMemberBegin}, + {42, token_t::FieldNameBegin}, + {44, token_t::FieldNameEnd}, + {46, token_t::ValueBegin}, + {49, token_t::ValueEnd}, + {49, token_t::StructMemberEnd}, + {49, token_t::StructEnd}, + // Line 5 (valid) + {51, token_t::StructBegin}, + {52, token_t::StructMemberBegin}, + {52, token_t::FieldNameBegin}, + {54, token_t::FieldNameEnd}, + {56, token_t::ValueBegin}, + {59, token_t::ValueEnd}, + {59, token_t::StructMemberEnd}, + {59, token_t::StructEnd}, + // Line 6 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + {0, token_t::StructBegin}, + {0, token_t::StructEnd}}; + } + + auto const stream = cudf::get_default_stream(); + + // Default parsing options + cudf::io::json_reader_options default_options{}; + default_options.set_recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); + default_options.enable_lines(true); + default_options.set_delimiter(delimiter); + + // Prepare input & output buffers + cudf::string_scalar const d_scalar(input, true, stream); + auto const d_input = cudf::device_span{ + d_scalar.data(), static_cast(d_scalar.size())}; + + // Parse the JSON and get the token stream + auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( + d_input, default_options, stream, cudf::get_current_device_resource_ref()); + // Copy back the number of tokens that were written + auto const tokens_gpu = cudf::detail::make_std_vector_async(d_tokens_gpu, stream); + auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream); + + stream.synchronize(); + // Verify the number of tokens matches + ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size()); + ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size()); + + for (std::size_t i = 0; i < tokens_gpu.size(); i++) { + // Ensure the index the tokens are pointing to do match + EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i; + // Ensure the token category is correct + EXPECT_EQ(golden_token_stream[i].second, tokens_gpu[i]) << "Mismatch at #" << i; + } +} + + CUDF_TEST_PROGRAM_MAIN() From 75840c17b378c4a94ffa791c525a84d936c2e8d2 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Thu, 26 Sep 2024 00:02:55 +0000 Subject: [PATCH 2/6] formatting --- cpp/tests/io/json/nested_json_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp index 90882b703e2..f752453d19f 100644 --- a/cpp/tests/io/json/nested_json_test.cpp +++ b/cpp/tests/io/json/nested_json_test.cpp @@ -1202,7 +1202,8 @@ TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAsWSAndDelimiter) // 012345678 <= line 0 char const delimiter = GetParam(); - /* Input: (Note that \n is considered whitespace according to the JSON spec when it is not used as a delimiter for JSONL) + /* Input: (Note that \n is considered whitespace according to the JSON spec when it is not used as + * a delimiter for JSONL) * {"a":2} * {"a":{"a":{"a":[321{"a":[1]} * @@ -1373,5 +1374,4 @@ TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAsWSAndDelimiter) } } - CUDF_TEST_PROGRAM_MAIN() From ef3fddd5d9680e6852838b2d2300f2e3e1bdaa7d Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Thu, 26 Sep 2024 17:14:26 +0000 Subject: [PATCH 3/6] pr reviews --- cpp/tests/io/json/nested_json_test.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp index f752453d19f..06301d62cb4 100644 --- a/cpp/tests/io/json/nested_json_test.cpp +++ b/cpp/tests/io/json/nested_json_test.cpp @@ -1231,7 +1231,6 @@ TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAsWSAndDelimiter) using token_t = cuio_json::token_t; std::vector> golden_token_stream; if (delimiter != '\n') { - golden_token_stream.resize(36); golden_token_stream = {// Line 0 (valid) {0, token_t::StructBegin}, {1, token_t::StructMemberBegin}, @@ -1285,7 +1284,6 @@ TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAsWSAndDelimiter) * {"b":123} * {"b"\n:\n\n\n123\n} */ - golden_token_stream.resize(40); golden_token_stream = {// Line 0 (valid) {0, token_t::StructBegin}, {1, token_t::StructMemberBegin}, From d655b2a42e4c279aed2da585e87ca941338a61c2 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 27 Sep 2024 18:05:19 +0000 Subject: [PATCH 4/6] pr reviews --- cpp/tests/io/json/nested_json_test.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp index 06301d62cb4..f32aba0e632 100644 --- a/cpp/tests/io/json/nested_json_test.cpp +++ b/cpp/tests/io/json/nested_json_test.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -1341,20 +1342,21 @@ TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAsWSAndDelimiter) auto const stream = cudf::get_default_stream(); - // Default parsing options - cudf::io::json_reader_options default_options{}; - default_options.set_recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); - default_options.enable_lines(true); - default_options.set_delimiter(delimiter); - // Prepare input & output buffers cudf::string_scalar const d_scalar(input, true, stream); auto const d_input = cudf::device_span{ d_scalar.data(), static_cast(d_scalar.size())}; + // Default parsing options + cudf::io::json_reader_options const in_opts = + cudf::io::json_reader_options::builder(cudf::io::source_info{}) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .delimiter(delimiter) + .lines(true); + // Parse the JSON and get the token stream auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( - d_input, default_options, stream, cudf::get_current_device_resource_ref()); + d_input, in_opts, stream, cudf::get_current_device_resource_ref()); // Copy back the number of tokens that were written auto const tokens_gpu = cudf::detail::make_std_vector_async(d_tokens_gpu, stream); auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream); From 3d7c758754b030154d3b1e821c67d18c8e84274e Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 27 Sep 2024 18:15:40 +0000 Subject: [PATCH 5/6] fixing test --- cpp/tests/io/json/json_test.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index d75a1ea4ee5..f630a4bf8a4 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2586,7 +2586,17 @@ TEST_F(JsonReaderTest, ViableDelimiterNewlineWS) .lines(true); json_parser_options.set_delimiter('\0'); - CUDF_EXPECT_NO_THROW(cudf::io::read_json(json_parser_options)); + auto result = cudf::io::read_json(json_parser_options); + EXPECT_EQ(result.tbl->num_columns(), 1); + EXPECT_EQ(result.tbl->num_rows(), 1); + + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); + + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + + auto col1_iterator = thrust::constant_iterator(100); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), + int64_wrapper(col1_iterator, col1_iterator + 1)); } // Test case for dtype prune: From f36371af7249477e23113bac80cfbdc7d844d9ce Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 27 Sep 2024 18:17:44 +0000 Subject: [PATCH 6/6] cleanup --- cpp/tests/io/json/json_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index f630a4bf8a4..6046d0de788 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2583,9 +2583,9 @@ TEST_F(JsonReaderTest, ViableDelimiterNewlineWS) cudf::io::json_reader_options json_parser_options = cudf::io::json_reader_options::builder(cudf::io::source_info{input.c_str(), input.size()}) - .lines(true); + .lines(true) + .delimiter('\0'); - json_parser_options.set_delimiter('\0'); auto result = cudf::io::read_json(json_parser_options); EXPECT_EQ(result.tbl->num_columns(), 1); EXPECT_EQ(result.tbl->num_rows(), 1);