From 83f0ae02663e036f5fa52124561f36c646ae0918 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 27 Nov 2024 00:31:01 -0600 Subject: [PATCH] Fix write_json failure for zero columns in table/struct (#17414) Closes #17413 num_rows are passed to ensure empty`{}` is created for zero columns. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17414 --- cpp/src/io/json/write_json.cu | 64 +++++++++++++++++++---------- python/cudf/cudf/tests/test_json.py | 6 +++ 2 files changed, 48 insertions(+), 22 deletions(-) diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 8156258c810..a4885d59cc5 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -244,6 +244,7 @@ struct validity_fn { * * @param strings_columns Table of strings columns * @param column_names Column of names for each column in the table + * @param num_rows Number of rows in the table * @param row_prefix Prepend this string to each row * @param row_suffix Append this string to each row * @param value_separator Separator between values @@ -255,6 +256,7 @@ struct validity_fn { */ std::unique_ptr struct_to_strings(table_view const& strings_columns, column_view const& column_names, + size_type const num_rows, string_view const row_prefix, string_view const row_suffix, string_view const value_separator, @@ -268,8 +270,7 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, auto const num_columns = strings_columns.num_columns(); CUDF_EXPECTS(num_columns == column_names.size(), "Number of column names should be equal to number of columns in the table"); - auto const strings_count = strings_columns.num_rows(); - if (strings_count == 0) // empty begets empty + if (num_rows == 0) // empty begets empty return make_empty_column(type_id::STRING); // check all columns are of type string CUDF_EXPECTS(std::all_of(strings_columns.begin(), @@ -277,31 +278,46 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, [](auto const& c) { return c.type().id() == type_id::STRING; }), "All columns must be of type string"); auto constexpr strviews_per_column = 3; // (for each "column_name:", "value", "separator") - auto const num_strviews_per_row = strings_columns.num_columns() * strviews_per_column + 1; + auto const num_strviews_per_row = strings_columns.num_columns() == 0 + ? 2 + : (1 + strings_columns.num_columns() * strviews_per_column); // e.g. {col1: value, col2: value, col3: value} = 1 + 3 + 3 + (3-1) + 1 = 10 auto tbl_device_view = cudf::table_device_view::create(strings_columns, stream); auto d_column_names = column_device_view::create(column_names, stream); // Note for future: chunk it but maximize parallelism, if memory usage is high. - auto const total_strings = num_strviews_per_row * strings_columns.num_rows(); - auto const total_rows = strings_columns.num_rows() * strings_columns.num_columns(); + auto const total_strings = num_strviews_per_row * num_rows; + auto const total_rows = num_rows * strings_columns.num_columns(); rmm::device_uvector d_strviews(total_strings, stream); - struct_scatter_strings_fn scatter_fn{*tbl_device_view, - *d_column_names, - strviews_per_column, - num_strviews_per_row, - row_prefix, - row_suffix, - value_separator, - narep.value(stream), - include_nulls, - d_strviews.begin()}; - // scatter row_prefix, row_suffix, column_name:, value, value_separator as string_views - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(total_rows), - scatter_fn); + if (strings_columns.num_columns() > 0) { + struct_scatter_strings_fn scatter_fn{*tbl_device_view, + *d_column_names, + strviews_per_column, + num_strviews_per_row, + row_prefix, + row_suffix, + value_separator, + narep.value(stream), + include_nulls, + d_strviews.begin()}; + // scatter row_prefix, row_suffix, column_name:, value, value_separator as string_views + thrust::for_each(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(total_rows), + scatter_fn); + } else { + thrust::for_each( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + [d_strviews = d_strviews.begin(), row_prefix, row_suffix, num_strviews_per_row] __device__( + auto idx) { + auto const this_index = idx * num_strviews_per_row; + d_strviews[this_index] = row_prefix; + d_strviews[this_index + num_strviews_per_row - 1] = row_suffix; + }); + } if (!include_nulls) { // if previous column was null, then we skip the value separator rmm::device_uvector d_str_separator(total_rows, stream); @@ -341,7 +357,7 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, // gather from offset and create a new string column auto old_offsets = strings_column_view(joined_col->view()).offsets(); - rmm::device_uvector row_string_offsets(strings_columns.num_rows() + 1, stream, mr); + rmm::device_uvector row_string_offsets(num_rows + 1, stream, mr); auto const d_strview_offsets = cudf::detail::make_counting_transform_iterator( 0, cuda::proclaim_return_type([num_strviews_per_row] __device__(size_type const i) { return i * num_strviews_per_row; @@ -353,7 +369,7 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, row_string_offsets.begin()); auto chars_data = joined_col->release().data; return make_strings_column( - strings_columns.num_rows(), + num_rows, std::make_unique(std::move(row_string_offsets), rmm::device_buffer{}, 0), std::move(chars_data.release()[0]), 0, @@ -677,6 +693,7 @@ struct column_to_strings_fn { auto col_string = operator()(child_it, child_it + column.num_children(), children_names, + column.size(), struct_row_end_wrap.value(stream_)); col_string->set_null_mask(cudf::detail::copy_bitmask(column, stream_, mr_), column.null_count()); @@ -688,6 +705,7 @@ struct column_to_strings_fn { std::unique_ptr operator()(column_iterator column_begin, column_iterator column_end, host_span children_names, + size_type num_rows, cudf::string_view const row_end_wrap_value) const { auto const num_columns = std::distance(column_begin, column_end); @@ -733,6 +751,7 @@ struct column_to_strings_fn { // return struct_to_strings(str_table_view, column_names_view, + num_rows, struct_row_begin_wrap.value(stream_), row_end_wrap_value, struct_value_separator.value(stream_), @@ -908,6 +927,7 @@ void write_json_uncompressed(data_sink* out_sink, auto str_concat_col = converter(sub_view.begin(), sub_view.end(), user_column_names, + sub_view.num_rows(), d_line_terminator_with_row_end.value(stream)); // Needs line_terminator at the end, to separate from next chunk diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 47976fc4bac..b48be6b2c2f 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -277,6 +277,12 @@ def test_cudf_json_writer_read(gdf_writer_types): """{"a":{"L": [{"M": null}, {}]}, "b":1.1}\n""", """{"a":{"L": [{}, {}]}, "b":1.1}\n""", ), + # empty structs + ("""{"A": null}\n {"A": {}}\n {}""", """{}\n{"A":{}}\n{}\n"""), + ( + """{"A": {"B": null}}\n {"A": {"B": {}}}\n {"A": {}}""", + """{"A":{}}\n{"A":{"B":{}}}\n{"A":{}}\n""", + ), ], ) def test_cudf_json_roundtrip(jsonl_string, expected):