From 9a0612b3add9c76ea8cb45cc230b75b2474d91f7 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 21 May 2024 17:09:00 -0700 Subject: [PATCH] Fix row group alignment in ORC writer (#15789) Closes https://github.com/rapidsai/cudf/issues/15775 ORC writer encodes null mask bits in multiples of eight to avoid issues with other readers reading partial encoded bytes. When this does not align with row groups, the null mask encode boundaries are moved to align to multiples of eight. There was a bug in the alignment code that caused a pointless shift by 8 bits and, then, issues in encode. This PR fixes the unnecessary shift. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Nghia Truong (https://github.com/ttnghia) - Muhammad Haseeb (https://github.com/mhaseeb123) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15789 --- cpp/src/io/orc/writer_impl.cu | 12 ++++++++++-- .../data/orc/TestOrcFile.MapManyNulls.parquet | Bin 0 -> 6353 bytes python/cudf/cudf/tests/test_orc.py | 13 +++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.MapManyNulls.parquet diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 750a593920c..344e216cdc8 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -782,8 +782,16 @@ std::vector> calculate_aligned_rowgroup_bounds( } else { // pushdown mask present; null mask bits w/ set pushdown mask bits will be encoded // Use the number of set bits in pushdown mask as size - auto bits_to_borrow = - 8 - (d_pd_set_counts[rg_idx][parent_col_idx] - previously_borrowed) % 8; + auto bits_to_borrow = [&]() { + auto const parent_valid_count = d_pd_set_counts[rg_idx][parent_col_idx]; + if (parent_valid_count < previously_borrowed) { + // Borrow to make an empty rowgroup + return previously_borrowed - parent_valid_count; + } + auto const misalignment = (parent_valid_count - previously_borrowed) % 8; + return (8 - misalignment) % 8; + }(); + if (bits_to_borrow == 0) { // Didn't borrow any bits for this rowgroup previously_borrowed = 0; diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.MapManyNulls.parquet b/python/cudf/cudf/tests/data/orc/TestOrcFile.MapManyNulls.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a80ce5fbd25837e1e25991dd14e9d2bed849c651 GIT binary patch literal 6353 zcmaLcd0Z36qX6)5#So7I0)~hRDT$i}5fm>(E&)SSLo6Vg;H-k%3FTFzTf-f{oc!GKiSFb?99&0?#z5Q zsc~uHKCV9AKHf9vohyUk#9%Pu`@Z6Q7(UO#=`Fnp}@7$?cZ$5DQoO(;gHAYGC$7f&`LfBlfm}>KfDtMi@ZfVRStQ2JCIkW_q0g2JQCx}hfXxKB6pCge3b?vTWmoF5Sg@-= zhm(YoM_;VU%T4XDK?(@x)Ud4vp&*mW3XnmGlotxQ8FZPX(f}j`T9IiZK$1*FAifD4 zQlAD@=qaM!!j#z|2M=ba`zbUmDWXe&DJ&UvLjyfAHZzl5;VCABSrtBMNpz2s9wxmz zK z)EI&|u!` z2?*R(QOm=eGJq0yPbDiczJPK#f()8ufB}V^4Wn6n*jJ4y$a*5Ks#>6Bah#KJy3e$* z4+vl;vn&+fFV!6gjW%-yh1Mu6vW#?{5q4t(Ukd^3hImo}DFijP3M))eFi8moAgCq_ z)f**%8E7FfMH*4XQ^+VTTa_TtX4w}xxDsnRM_+5MmCG`<+!|*Atj4ogzyK&1HDTIp z)B@u+Dx0!V04E5egv9t>T*PJ*<;qCdoWz36kdEmr)M#Ldss^T*Z~%R-oYE*~8(^hW z133sU9MaTa*+h-Q0Z4&Nk8?0_G6}T4IO$P#Fi5MlsOSzvZN z0U0oFrHqXe4nmQLr~<^^mKs2JKTkrq(lv7y8}zu1fODyICzUdR@-;bx1T3_e)F=m} zVi>ntLb+240wwHPmQI2O0fHyao{cC85yL};#yW)Y3N5MN#*!MbM2BdIY}`)=bQ$Jq zV3i_5N~%di5==>{Kq!GR6ABYtO>R0+LJCZ%G>A;D?qm|A0wza>QfUW)6q-QER0ddP zQ7bVj&1B)2RXCZ;)-e%{n1|UJVjaff==>nU&W25pK_-SU#EaltF+ml{m(|<2sv4#l zP5`yQBtl`bTIL~?AgPN~dU_xLK+KilHmuqb3u7oiUAZj!nlJ%iBN_TC%p}2Ts&O3C z^Q(lo8qgzJ6$Vk>L0XH_2&^{5kN~A(HZLzfF|gPwfJuwdlS($9zziyrLJ8o?8X)2T z*e8~xM2ILrh*FdiLbPZNuCY2HY(#HG0fvh$c$|vi>MIr!QnN;El!!@ltVoocL_m~` zn<1#w3P{YX#;w&Pz)dK-$}U2=d`B{gSEB|CJKh3drI%GjA{J*Wx(JXe2M)xz9e8L& zY4kK{MKh^DaW*Q4>J0%zkU3Cak1YbuI#NTz%vgX*GzJXzaMb_^lNd}^S$s0&8>rya z6Kt<~bCtG|NlJtQR1M@_x!gpXA{S(<;;m-F#Ad@>07yG&%K{85WuvEmoB}E&fs`dz z5vf{IfWuYUOo0IA!%Udu^MppAwOH+_#KKdFZk!az1XRv_v2X zWPAEKq%e^vB=qzc;$;}=AQ74+ci<)CMTz6S7MN)O zn35^V5;zd|TA4zd4SVwGp7d1Ia?yIkP>8v^(W4fqQr3d{8eTRLL{x&>K#yvs#SVPo z3|N`X=W1(V5eQNNx^_4mD~AOfRAn*1q&e4FNa1dXwh9x`k6V%jrAHwE;Q&ixnmw}s z#U?2n;>05kok|Gfgvr@S-In-^ZQ0~D&ZE@iJMBfvD{rQq5LG7LdAcR@c*M{v?_aSm zML3`24Q~HzSTee)FnF66w&YdYt@f@C=I3R9g-Fy-JdP|{x8u9|!TY@{(FN4Ecs@HUUH}<#BNAuK=Z;op$zkRm(-pv~# zUYqFW<4F$~P2InqE4n>27F&5?`hg_jZ!Yb@yNg&n{^Xk3tEK82{gy4%Emrz+S2Rwz zT9|wFY&U=NuE#NtUGSLDp!Uosqgq?n4tsL6DkMO={yOuwOW!YfXq?NP8}{wCA2K&b z2-NbK{0na;b)262$1krFrUkUl&l?(U+j~VYrlq~Y>AB_Io&1q&2cOz%&p$DDU-X-z z?iayN2TcxI+rM+ZsXm9hR_(J?+5V51k(K^Er=1^9im*!WUs2TdANq{#IsBy4$t~k7 zjx_r~n)uPz5Eh_dIan-)=Ww>3v;^=a9`Ki$zCq`e7%M%_NG-SAJ zYMyb>hwAlby9T~Cs$;#IveHbm2Kyd4#9$M%r?S9MiG^61R1 zyB2)<^W8U%nuc%ZvOKRwEF%kabIe^U*GZSOzsTc7t=X`5Tx;*__MZ81FN5$6y=tc_ zTTlO8lbz%I2CWny3CRpPeeY=@gVqPjhc&~;xlbs4AKE6@N7CE>W0Or6)}UQ&`G3g_ zhp!lfbDKtA+7Fl1w@tU5r){$TB3sb^JAd|p>vv+UBCX7<_S=&qces5OT#4KGs9J-Mo9Ou;k8`Qvw^^=qS1-Q=9X zciz3cH|1m<+Mu1QUbW#jt7zXFUU-tg&G`BEG+?xqQfc_*Gd?bY9H6&UJV8m@h`tHn|;s zIb8U-@SWPf;Z@_=p2ktWoo^lb-yDPdyw5$SZE`zlwAQ%ly}In|xo%TkdgJoIwnsSw zS^eC1N*+W--}Ik$jsN-rf6kwyQVLt6&QF_?i_bB>-ZZb|A#vV&?pfL<`$GzhDR+jB zE;JtC)!iL zfyW#*RNOzA^nkrzJ-p-(Z0_05+l{H++@rh8LsuU*?!6Pd{c!cEJ;Q%zT~)cfc!J*< zLEGfvw9qlgmf7nAy>9-zt)SMabkd%I>CJz1mfD_q@JE~;@c35$1)YNN#`pRKE2eCE zmH&FozCW*m;it}CT(Bl@a&^_x!};gu4+wS@w7vS=sT>p4Y*h_*Qy)1Hl?NU4ulcIM zFlu~e*Zi=esKmWq=tQ^6ZJE#W>ze1i@3YCS7t>@*kE(qW{F^S#nOJe@`1|@B{fh0- zk+H#p(uXO2U@b3bOPJiXW9B(GG~DwKHgj&DO&*P!{|}!z(;_$No_k!qZyek>mss4Q(ZIjT%5u&dX%*+4rb~n~N3pha%ujl=n((jDeo-zd#;b6(mp+R@TZT8aC2#MKe+e)v0uKHhV{4v|Al=0 z{PFG2Iq{D_pNv<_+qO*klEP1!-wyxi<~FP>zJ2zAw;x^5S3`FPs|!v~Ji26c&nboU zqI%)%0n@6d(Kh+@!Oe(+k*{kHt@>TRCX>w1nPPr?<6LRQPp3MfXHIVWwx|1_Hu=)? zdi8^2MxikwetmS2oqZi)*c(aE9gYLfhoAdHrag7h%kq1HS7SN86-#lCk4@#glVC z=UEnKq;D`zc4O}wa^pR)*G+9&7`1YI&7TzwlaKcVKc<8uPXuk8e^zf?_!l$qyJI)= z(i&TNdBvxOvP%<2uFrb2=7T7bvtAO(?!_Q0xvADnN0PI4(q)PzH%%zg-{)+fHS_sNt?!19i^f0~(l3-;D$W?wa{&7K z>W$XdYt&)69tR4ZKAN|ewn@Gy`L~22SXOp87>$ZBYl6|$a)A1-Tv2bZvig)|Q6L&8VV zC!gKol2WI?-|+}8IhF2InxAld$hEh^_mM)&HT7oB)6_e+hF9l&asRS%*Tr3L&am1f zn?lu}zxklJl!IkFpO9+i)tt6E&kie?|mp4biQA~x|Ffo4>M++-^$`_7#7uO|0>ut@$%wXA$JM~H}dap zpV0bYoUFcMX!%B_IXGeWuf1DloP$67DD#*yZ*A(bQvLUPj=VTAYedn#j;+-J|qWjO)muZ{a z>=;bj%ARepGiJ<8YJ|SXMr4dX{nNf-h{L}cb{GM^m(b{^@)_% zeKwhqR#wp5eKxZzJSBUxJfyxZ>?m!MOIPkJ>UGvNX^w^Vi#>TUr{%X>!5*$Zyo|at zukKp+;;G3|L&8`3Upd;}>x#MRr_=q8EgUuBqIw@~lM#s>-{gV?rBfTOcbfY5kLJGo zi+s>@AnPNeo9fVijK+^Qm##@Twm7-KE@wA>i17)ZZz00 zNZQ9PogUZk$>6c=vlr4fIbNM|&=UD}1%K!DF72-Hec}B*hdGwV>`EMURKEdh8f=}k zy-+>-*U^=?kI^;>-DrwjeDIA*bZ$@ydH0~aceTuL`xWDdsj%(gla&{m5AWKNcq;yx zrFZd^mx$!ktLTLa#iY@iS08`9cSK)vy3Zy%+2N9pIlIP14xf5xLA>oy|4$ENXYzh+ z9?u``a_xufd0$+jyc{u~*3N2fzVK?(&|Gq_`rd)&e5jW>Z;5JmLBG(Z$0PK2+9n@2Cl+-&{dpiT^YYVM`Ta!mM;?WF+_GNMGw;{o7p|Rk(4-sZ_SZdoG4cM#?P<*0o>?cKu=ni6qc~HM zw;Ps@ih13+#o6gB#PD(Un=fJ(FILKzN`0IeGI^|A-n?m zhg6nNm(v#!!u#)Q<3+A1FJDm3;Bfv|bymuYidBUL9JjHd5usxk9G}H2auor8#|Dgv Z2n`z_8W!dkLU(CjuX@sro=X3j@n1^C^eO-V literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 83b7353ad89..b83b8f08a8b 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1954,3 +1954,16 @@ def test_writer_lz4(): got = pd.read_orc(buffer) assert_eq(gdf, got) + + +def test_row_group_alignment(datadir): + path = datadir / "TestOrcFile.MapManyNulls.parquet" + + expected = cudf.read_parquet(path) + + buffer = BytesIO() + expected.to_orc(buffer) + + got = cudf.read_orc(buffer) + + assert_eq(expected, got)