Skip to content

Commit

Permalink
Fix total_byte_size in Parquet row group metadata (#14802)
Browse files Browse the repository at this point in the history
The `total_byte_size` field in the row group metadata should be "[t]otal byte size of all the uncompressed column data in this row group". cuDF currently populates this field with the _compressed_ size. This PR fixes that and adds a test.

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - https://github.com/nvdbaranec

URL: #14802
  • Loading branch information
etseidl authored Jan 23, 2024
1 parent 42d8d78 commit ef3ce4b
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 1 deletion.
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2074,7 +2074,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
need_sync = true;
}

row_group.total_byte_size += ck.compressed_size;
row_group.total_byte_size += ck.bfr_size;
column_chunk_meta.total_uncompressed_size = ck.bfr_size;
column_chunk_meta.total_compressed_size = ck.compressed_size;
}
Expand Down
28 changes: 28 additions & 0 deletions cpp/tests/io/parquet_writer_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1401,6 +1401,33 @@ TEST_F(ParquetWriterTest, EmptyMinStringStatistics)
EXPECT_EQ(max_value, std::string(max_val));
}

TEST_F(ParquetWriterTest, RowGroupMetadata)
{
using column_type = int;
constexpr int num_rows = 1'000;
auto const ones = thrust::make_constant_iterator(1);
auto const col =
cudf::test::fixed_width_column_wrapper<column_type>{ones, ones + num_rows, no_nulls()};
auto const table = table_view({col});

auto const filepath = temp_env->get_temp_filepath("RowGroupMetadata.parquet");
// force PLAIN encoding to make size calculation easier
cudf::io::parquet_writer_options opts =
cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table)
.dictionary_policy(cudf::io::dictionary_policy::NEVER)
.compression(cudf::io::compression_type::ZSTD);
cudf::io::write_parquet(opts);

// check row group metadata to make sure total_byte_size is the uncompressed value
auto const source = cudf::io::datasource::create(filepath);
cudf::io::parquet::detail::FileMetaData fmd;
read_footer(source, &fmd);

ASSERT_GT(fmd.row_groups.size(), 0);
EXPECT_GE(fmd.row_groups[0].total_byte_size,
static_cast<int64_t>(num_rows * sizeof(column_type)));
}

// See #14772.
// zStandard compression cannot currently be used with V2 page headers due to buffer
// alignment issues.
Expand All @@ -1416,6 +1443,7 @@ TEST_F(ParquetWriterTest, ZstdWithV2Header)
EXPECT_THROW(cudf::io::write_parquet(out_opts), cudf::logic_error);
}

/////////////////////////////////////////////////////////////
// custom mem mapped data sink that supports device writes
template <bool supports_device_writes>
class custom_test_memmap_sink : public cudf::io::data_sink {
Expand Down

0 comments on commit ef3ce4b

Please sign in to comment.