Skip to content

Commit

Permalink
Fix JSON parsing memory corruption - Fix Mixed types nested children …
Browse files Browse the repository at this point in the history
…removal (#15798)

Fixes #15750
The references of deleted child columns are not removed, which caused segfault, and also memory errors (found with valgrind). This fix removes references of child columns and deletes them recursively.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

URL: #15798
  • Loading branch information
karthikeyann authored May 31, 2024
1 parent 789cbfd commit 476db9f
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 2 deletions.
17 changes: 15 additions & 2 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -594,8 +594,7 @@ void make_device_json_column(device_span<SymbolT const> input,
col.validity =
cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr);
col.type = json_col_t::StringColumn;
col.child_columns.clear(); // their references should be deleted too.
col.column_order.clear();
// destroy references of all child columns after this step, by calling remove_child_columns
};

path_from_tree tree_path{column_categories,
Expand Down Expand Up @@ -628,6 +627,19 @@ void make_device_json_column(device_span<SymbolT const> input,
std::vector<uint8_t> is_pruned(num_columns, 0);
columns.try_emplace(parent_node_sentinel, std::ref(root));

std::function<void(NodeIndexT, device_json_column&)> remove_child_columns =
[&](NodeIndexT this_col_id, device_json_column& col) {
for (auto col_name : col.column_order) {
auto child_id = mapped_columns[{this_col_id, col_name}];
is_mixed_type_column[child_id] = 1;
remove_child_columns(child_id, col.child_columns.at(col_name));
mapped_columns.erase({this_col_id, col_name});
columns.erase(child_id);
}
col.child_columns.clear(); // their references are deleted above.
col.column_order.clear();
};

auto name_and_parent_index = [&is_array_of_arrays,
&row_array_parent_col_id,
&column_parent_ids,
Expand Down Expand Up @@ -721,6 +733,7 @@ void make_device_json_column(device_span<SymbolT const> input,
auto& col = columns.at(old_col_id).get();
if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) {
reinitialize_as_string(old_col_id, col);
remove_child_columns(old_col_id, col);
// all its children (which are already inserted) are ignored later.
}
col.forced_as_string_column = true;
Expand Down
36 changes: 36 additions & 0 deletions cpp/tests/io/json_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2679,4 +2679,40 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilter)
}
}

TEST_F(JsonReaderTest, JSONMixedTypeChildren)
{
std::string const json_str = R"(
{ "Root": { "Key": [ { "EE": "A" } ] } }
{ "Root": { "Key": { } } }
{ "Root": { "Key": [{ "YY": 1}] } }
)";
// Column "EE" is created and destroyed
// Column "YY" should not be created

cudf::io::json_reader_options options =
cudf::io::json_reader_options::builder(cudf::io::source_info{json_str.c_str(), json_str.size()})
.lines(true)
.recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
.normalize_single_quotes(true)
.normalize_whitespace(false)
.mixed_types_as_string(true)
.keep_quotes(true);

auto result = cudf::io::read_json(options);

ASSERT_EQ(result.tbl->num_columns(), 1);
ASSERT_EQ(result.metadata.schema_info.size(), 1);
EXPECT_EQ(result.metadata.schema_info[0].name, "Root");
ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1);
EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key");
ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2);
EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets");
// types
EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT);
EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING);
cudf::test::strings_column_wrapper expected({R"([ { "EE": "A" } ])", "{ }", R"([{ "YY": 1}])"});

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0));
}

CUDF_TEST_PROGRAM_MAIN()

0 comments on commit 476db9f

Please sign in to comment.