diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 750a593920c..344e216cdc8 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -782,8 +782,16 @@ std::vector> calculate_aligned_rowgroup_bounds( } else { // pushdown mask present; null mask bits w/ set pushdown mask bits will be encoded // Use the number of set bits in pushdown mask as size - auto bits_to_borrow = - 8 - (d_pd_set_counts[rg_idx][parent_col_idx] - previously_borrowed) % 8; + auto bits_to_borrow = [&]() { + auto const parent_valid_count = d_pd_set_counts[rg_idx][parent_col_idx]; + if (parent_valid_count < previously_borrowed) { + // Borrow to make an empty rowgroup + return previously_borrowed - parent_valid_count; + } + auto const misalignment = (parent_valid_count - previously_borrowed) % 8; + return (8 - misalignment) % 8; + }(); + if (bits_to_borrow == 0) { // Didn't borrow any bits for this rowgroup previously_borrowed = 0; diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.MapManyNulls.parquet b/python/cudf/cudf/tests/data/orc/TestOrcFile.MapManyNulls.parquet new file mode 100644 index 00000000000..a80ce5fbd25 Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.MapManyNulls.parquet differ diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 83b7353ad89..b83b8f08a8b 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1954,3 +1954,16 @@ def test_writer_lz4(): got = pd.read_orc(buffer) assert_eq(gdf, got) + + +def test_row_group_alignment(datadir): + path = datadir / "TestOrcFile.MapManyNulls.parquet" + + expected = cudf.read_parquet(path) + + buffer = BytesIO() + expected.to_orc(buffer) + + got = cudf.read_orc(buffer) + + assert_eq(expected, got)