diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 631f8adbd6d..3e587768b11 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -594,8 +594,7 @@ void make_device_json_column(device_span input, col.validity = cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); col.type = json_col_t::StringColumn; - col.child_columns.clear(); // their references should be deleted too. - col.column_order.clear(); + // destroy references of all child columns after this step, by calling remove_child_columns }; path_from_tree tree_path{column_categories, @@ -628,6 +627,19 @@ void make_device_json_column(device_span input, std::vector is_pruned(num_columns, 0); columns.try_emplace(parent_node_sentinel, std::ref(root)); + std::function remove_child_columns = + [&](NodeIndexT this_col_id, device_json_column& col) { + for (auto col_name : col.column_order) { + auto child_id = mapped_columns[{this_col_id, col_name}]; + is_mixed_type_column[child_id] = 1; + remove_child_columns(child_id, col.child_columns.at(col_name)); + mapped_columns.erase({this_col_id, col_name}); + columns.erase(child_id); + } + col.child_columns.clear(); // their references are deleted above. + col.column_order.clear(); + }; + auto name_and_parent_index = [&is_array_of_arrays, &row_array_parent_col_id, &column_parent_ids, @@ -721,6 +733,7 @@ void make_device_json_column(device_span input, auto& col = columns.at(old_col_id).get(); if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { reinitialize_as_string(old_col_id, col); + remove_child_columns(old_col_id, col); // all its children (which are already inserted) are ignored later. } col.forced_as_string_column = true; diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 9d766e80094..5d790e73246 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -2679,4 +2679,40 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilter) } } +TEST_F(JsonReaderTest, JSONMixedTypeChildren) +{ + std::string const json_str = R"( +{ "Root": { "Key": [ { "EE": "A" } ] } } +{ "Root": { "Key": { } } } +{ "Root": { "Key": [{ "YY": 1}] } } +)"; + // Column "EE" is created and destroyed + // Column "YY" should not be created + + cudf::io::json_reader_options options = + cudf::io::json_reader_options::builder(cudf::io::source_info{json_str.c_str(), json_str.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .normalize_single_quotes(true) + .normalize_whitespace(false) + .mixed_types_as_string(true) + .keep_quotes(true); + + auto result = cudf::io::read_json(options); + + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); + // types + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); + cudf::test::strings_column_wrapper expected({R"([ { "EE": "A" } ])", "{ }", R"([{ "YY": 1}])"}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0)); +} + CUDF_TEST_PROGRAM_MAIN()