Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse newline as whitespace character while tokenizing JSONL inputs with non-newline delimiter #16923

Merged
merged 7 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -618,12 +618,12 @@ struct PdaSymbolToSymbolGroupId {
constexpr auto pda_sgid_lookup_size =
static_cast<int32_t>(sizeof(tos_sg_to_pda_sgid) / sizeof(tos_sg_to_pda_sgid[0]));
// We map the delimiter character to LINE_BREAK symbol group id, and the newline character
// to OTHER. Note that delimiter cannot be any of opening(closing) brace, bracket, quote,
// to WHITE_SPACE. Note that delimiter cannot be any of opening(closing) brace, bracket, quote,
// escape, comma, colon or whitespace characters.
auto const symbol_position =
symbol == delimiter
? static_cast<int32_t>('\n')
: (symbol == '\n' ? static_cast<int32_t>(delimiter) : static_cast<int32_t>(symbol));
: (symbol == '\n' ? static_cast<int32_t>(' ') : static_cast<int32_t>(symbol));
PdaSymbolGroupIdT symbol_gid =
tos_sg_to_pda_sgid[min(symbol_position, pda_sgid_lookup_size - 1)];
return stack_idx * static_cast<PdaSymbolGroupIdT>(symbol_group_id::NUM_PDA_INPUT_SGS) +
Expand Down
24 changes: 24 additions & 0 deletions cpp/tests/io/json/json_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2575,6 +2575,30 @@ TEST_F(JsonReaderTest, ViableDelimiter)
EXPECT_THROW(json_parser_options.set_delimiter('\t'), std::invalid_argument);
}

TEST_F(JsonReaderTest, ViableDelimiterNewlineWS)
{
  // Reads a single JSON Lines record whose key and value are separated by a newline while a
  // non-newline delimiter (NUL) is configured, and checks that one INT64 column named a with
  // the single value 100 is produced.

  // Test input: the newline sits between the colon and the value. The continuation line of the
  // raw string is deliberately kept at column 0 so that no extra characters enter the input.
  std::string input = R"({"a":
100})";

  cudf::io::json_reader_options json_parser_options =
    cudf::io::json_reader_options::builder(cudf::io::source_info{input.c_str(), input.size()})
      .lines(true)
      .delimiter('\0');

  auto result = cudf::io::read_json(json_parser_options);

  // Fatal check: everything below indexes column 0, so stop here if no column was produced
  // instead of continuing into an out-of-range access.
  ASSERT_EQ(result.tbl->num_columns(), 1);
  EXPECT_EQ(result.tbl->num_rows(), 1);

  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64);

  EXPECT_EQ(result.metadata.schema_info[0].name, "a");

  // Expected column content: a single row holding 100
  auto col1_iterator = thrust::constant_iterator<int64_t>(100);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0),
                                 int64_wrapper(col1_iterator, col1_iterator + 1));
}

// Test case for dtype prune:
// all paths, only one.
// one present, another not present, nothing present
Expand Down
178 changes: 178 additions & 0 deletions cpp/tests/io/json/nested_json_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <cudf/io/datasource.hpp>
#include <cudf/io/json.hpp>
#include <cudf/io/parquet.hpp>
#include <cudf/io/types.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand Down Expand Up @@ -1196,4 +1197,181 @@ TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAndDelimiter)
}
}

TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAsWSAndDelimiter)
{
// Compares the token stream returned by get_token_stream against a hand-written golden stream
// for an input that mixes newline characters with the parameterized record delimiter.
// The delimiter under test is supplied by the test parameter.
char const delimiter = GetParam();

/* Input: (Note that \n is considered whitespace according to the JSON spec when it is not used as
 * a delimiter for JSONL)
 * {"a":2}
 * {"a":<delimiter>{"a":{"a":[321<delimiter>{"a":[1]}
 *
 * <delimiter>{"b":123}
 * {"b":123}<delimiter>
 * {"b"\n:\n\n\n123\n}
 */
// The input is assembled piecewise; each comment gives the offset at which the next piece starts.
std::string input = R"({"a":2})"
"\n";
// starting position 8 (zero indexed)
input += R"({"a":)" + std::string(1, delimiter);
// starting position 14 (zero indexed)
input += R"({"a":{"a":[321)" + std::string(1, delimiter);
// starting position 29 (zero indexed)
input += R"({"a":[1]})" + std::string("\n\n") + std::string(1, delimiter);
// starting position 41 (zero indexed)
input += R"({"b":123})"
"\n";
// starting position 51 (zero indexed)
input += R"({"b":123})" + std::string(1, delimiter);
// starting position 61 (zero indexed)
input += R"({"b")" + std::string("\n:\n\n\n123\n}");

// Golden token stream sample: each entry is (token index, token kind) and is compared
// element-wise against the tokenizer output at the end of the test.
using token_t = cuio_json::token_t;
std::vector<std::pair<std::size_t, cuio_json::PdaTokenT>> golden_token_stream;
if (delimiter != '\n') {
// Expected stream when the delimiter is not the newline character. Records marked invalid are
// expected as a StructBegin/StructEnd pair at index 0.
golden_token_stream = {// Line 0 (valid)
{0, token_t::StructBegin},
{1, token_t::StructMemberBegin},
{1, token_t::FieldNameBegin},
{3, token_t::FieldNameEnd},
{5, token_t::ValueBegin},
{6, token_t::ValueEnd},
{6, token_t::StructMemberEnd},
{6, token_t::StructEnd},
// Line 1 (invalid)
{0, token_t::StructBegin},
{0, token_t::StructEnd},
// Line 2 (valid)
{29, token_t::StructBegin},
{30, token_t::StructMemberBegin},
{30, token_t::FieldNameBegin},
{32, token_t::FieldNameEnd},
{34, token_t::ListBegin},
{35, token_t::ValueBegin},
{36, token_t::ValueEnd},
{36, token_t::ListEnd},
{37, token_t::StructMemberEnd},
{37, token_t::StructEnd},
// Line 3 (valid)
{41, token_t::StructBegin},
{42, token_t::StructMemberBegin},
{42, token_t::FieldNameBegin},
{44, token_t::FieldNameEnd},
{46, token_t::ValueBegin},
{49, token_t::ValueEnd},
{49, token_t::StructMemberEnd},
{49, token_t::StructEnd},
// Line 4 (valid)
{61, token_t::StructBegin},
{62, token_t::StructMemberBegin},
{62, token_t::FieldNameBegin},
{64, token_t::FieldNameEnd},
{70, token_t::ValueBegin},
{73, token_t::ValueEnd},
{74, token_t::StructMemberEnd},
{74, token_t::StructEnd}};
} else {
/* Input:
 * {"a":2}
 * {"a":
 * {"a":{"a":[321
 * {"a":[1]}
 *
 *
 * {"b":123}
 * {"b":123}
 * {"b"\n:\n\n\n123\n}
 */
// Expected stream when the newline character itself is the delimiter.
golden_token_stream = {// Line 0 (valid)
shrshi marked this conversation as resolved.
Show resolved Hide resolved
{0, token_t::StructBegin},
{1, token_t::StructMemberBegin},
{1, token_t::FieldNameBegin},
{3, token_t::FieldNameEnd},
{5, token_t::ValueBegin},
{6, token_t::ValueEnd},
{6, token_t::StructMemberEnd},
{6, token_t::StructEnd},
// Line 1 (invalid)
{0, token_t::StructBegin},
{0, token_t::StructEnd},
// Line 2 (invalid)
{0, token_t::StructBegin},
{0, token_t::StructEnd},
// Line 3 (valid)
{29, token_t::StructBegin},
{30, token_t::StructMemberBegin},
{30, token_t::FieldNameBegin},
{32, token_t::FieldNameEnd},
{34, token_t::ListBegin},
{35, token_t::ValueBegin},
{36, token_t::ValueEnd},
{36, token_t::ListEnd},
{37, token_t::StructMemberEnd},
{37, token_t::StructEnd},
// Line 4 (valid)
{41, token_t::StructBegin},
{42, token_t::StructMemberBegin},
{42, token_t::FieldNameBegin},
{44, token_t::FieldNameEnd},
{46, token_t::ValueBegin},
{49, token_t::ValueEnd},
{49, token_t::StructMemberEnd},
{49, token_t::StructEnd},
// Line 5 (valid)
{51, token_t::StructBegin},
{52, token_t::StructMemberBegin},
{52, token_t::FieldNameBegin},
{54, token_t::FieldNameEnd},
{56, token_t::ValueBegin},
{59, token_t::ValueEnd},
{59, token_t::StructMemberEnd},
{59, token_t::StructEnd},
// Line 6 (invalid)
{0, token_t::StructBegin},
{0, token_t::StructEnd},
{0, token_t::StructBegin},
{0, token_t::StructEnd},
{0, token_t::StructBegin},
{0, token_t::StructEnd},
{0, token_t::StructBegin},
{0, token_t::StructEnd}};
}

auto const stream = cudf::get_default_stream();

// Prepare input & output buffers
cudf::string_scalar const d_scalar(input, true, stream);
auto const d_input = cudf::device_span<cuio_json::SymbolT const>{
d_scalar.data(), static_cast<size_t>(d_scalar.size())};

// Parsing options: JSON Lines mode with the parameterized delimiter and the
// RECOVER_WITH_NULL recovery mode
cudf::io::json_reader_options const in_opts =
cudf::io::json_reader_options::builder(cudf::io::source_info{})
.recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
.delimiter(delimiter)
.lines(true);

// Parse the JSON and get the token stream
auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a way to verify correctness using the output of read_json?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the JsonReaderTest.ViableDelimiterNewlineWS gtest in json_test.cpp verifies the correctness of the table created by read_json for null delimiter.
Since this delimiter bug fix only affects the PDA in the tokenization step, I wanted to add an additional test that directly checks the output of get_token_stream.

d_input, in_opts, stream, cudf::get_current_device_resource_ref());
// Copy the tokens and the token indices into vectors for comparison; the copies are issued
// on the stream, which is synchronized below before the results are read
auto const tokens_gpu = cudf::detail::make_std_vector_async(d_tokens_gpu, stream);
auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream);

stream.synchronize();
// Verify that the number of tokens and of token indices matches the golden stream
ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size());
ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size());

for (std::size_t i = 0; i < tokens_gpu.size(); i++) {
// Ensure the index each token points to matches the expected index
EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i;
// Ensure the token category is correct
EXPECT_EQ(golden_token_stream[i].second, tokens_gpu[i]) << "Mismatch at #" << i;
}
}

CUDF_TEST_PROGRAM_MAIN()
Loading