From 3481042d5d1a1f511515cf23f36c43620ad6663e Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 9 May 2024 12:18:31 -0500
Subject: [PATCH] Upgrade `arrow` to `16` (#15703)

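This PR upgrades `arrow` to `16`. It also updates the pytests and the pyarrow-engine path of `to_parquet` for breaking API changes in pyarrow: the `partition_filename_cb` and `use_legacy_dataset` arguments are no longer passed, and test workarounds for since-fixed Arrow bugs are removed.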

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ray Douglass (https://github.com/raydouglass)
  - Richard (Rick) Zamora (https://github.com/rjzamora)

URL: https://github.com/rapidsai/cudf/pull/15703
---
 .../all_cuda-118_arch-x86_64.yaml             |  12 ++++-----
 .../all_cuda-122_arch-x86_64.yaml             |  12 ++++-----
 conda/recipes/cudf/meta.yaml                  |   2 +-
 conda/recipes/libcudf/conda_build_config.yaml |   2 +-
 cpp/cmake/thirdparty/get_arrow.cmake          |   2 +-
 dependencies.yaml                             |  24 ++++++++----------
 python/cudf/cudf/io/parquet.py                |   5 ----
 .../tests/data/parquet/usec_timestamp.parquet | Bin 1128 -> 2323 bytes
 python/cudf/cudf/tests/test_dataframe.py      |  14 +---------
 python/cudf/cudf/tests/test_index.py          |   8 +-----
 python/cudf/cudf/tests/test_parquet.py        |   4 +--
 python/cudf/cudf/utils/ioutils.py             |   3 ++-
 python/cudf/pyproject.toml                    |   6 ++---
 python/cudf_kafka/pyproject.toml              |   2 +-
 .../dask_cudf/io/tests/test_parquet.py        |   4 +--
 15 files changed, 36 insertions(+), 64 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 7a5fef9f25e..48699b81eed 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -36,15 +36,15 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
-- libarrow-acero==14.0.2.*
-- libarrow-dataset==14.0.2.*
-- libarrow==14.0.2.*
+- libarrow-acero==16.0.0.*
+- libarrow-dataset==16.0.0.*
+- libarrow==16.0.0.*
 - libcufile-dev=1.4.0.31
 - libcufile=1.4.0.31
 - libcurand-dev=10.3.0.86
 - libcurand=10.3.0.86
 - libkvikio==24.6.*
-- libparquet==14.0.2.*
+- libparquet==16.0.0.*
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.6.*
 - make
@@ -66,7 +66,7 @@ dependencies:
 - pip
 - pre-commit
 - ptxcompiler
-- pyarrow==14.0.2.*
+- pyarrow==16.0.0.*
 - pydata-sphinx-theme!=0.14.2
 - pytest-benchmark
 - pytest-cases>=3.8.2
@@ -92,7 +92,7 @@ dependencies:
 - streamz
 - sysroot_linux-64==2.17
 - tokenizers==0.15.2
-- transformers==4.38.1
+- transformers==4.39.3
 - typing_extensions>=4.0.0
 - zlib>=1.2.13
 - pip:
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index 48453e18bb0..d06a727f331 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -37,13 +37,13 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
-- libarrow-acero==14.0.2.*
-- libarrow-dataset==14.0.2.*
-- libarrow==14.0.2.*
+- libarrow-acero==16.0.0.*
+- libarrow-dataset==16.0.0.*
+- libarrow==16.0.0.*
 - libcufile-dev
 - libcurand-dev
 - libkvikio==24.6.*
-- libparquet==14.0.2.*
+- libparquet==16.0.0.*
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.6.*
 - make
@@ -63,7 +63,7 @@ dependencies:
 - pandoc
 - pip
 - pre-commit
-- pyarrow==14.0.2.*
+- pyarrow==16.0.0.*
 - pydata-sphinx-theme!=0.14.2
 - pynvjitlink
 - pytest-benchmark
@@ -90,7 +90,7 @@ dependencies:
 - streamz
 - sysroot_linux-64==2.17
 - tokenizers==0.15.2
-- transformers==4.38.1
+- transformers==4.39.3
 - typing_extensions>=4.0.0
 - zlib>=1.2.13
 - pip:
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index ddcadfd1570..24210830ada 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -65,7 +65,7 @@ requirements:
     - setuptools
     - dlpack >=0.8,<1.0
     - numpy 1.23
-    - pyarrow ==14.0.2.*
+    - pyarrow ==16.0.0.*
     - libcudf ={{ version }}
     - rmm ={{ minor_version }}
     {% if cuda_major == "11" %}
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index ba5e96fb6cf..61ffcf3c3de 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -20,7 +20,7 @@ cmake_version:
   - ">=3.26.4"
 
 libarrow_version:
-  - "==14.0.2"
+  - "==16.0.0"
 
 dlpack_version:
   - ">=0.8,<1.0"
diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake
index 892056959c8..70283efbd79 100644
--- a/cpp/cmake/thirdparty/get_arrow.cmake
+++ b/cpp/cmake/thirdparty/get_arrow.cmake
@@ -410,7 +410,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow)
   set(CUDF_VERSION_Arrow
       # This version must be kept in sync with the libarrow version pinned for builds in
       # dependencies.yaml.
-      14.0.2
+      16.0.0
       CACHE STRING "The version of Arrow to find (or build)"
   )
 endif()
diff --git a/dependencies.yaml b/dependencies.yaml
index 1508656471d..7fe67817f73 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -266,7 +266,7 @@ dependencies:
           - cython>=3.0.3
           # Hard pin the patch version used during the build. This must be kept
           # in sync with the version pinned in get_arrow.cmake.
-          - pyarrow==14.0.2.*
+          - pyarrow==16.0.0.*
       - output_types: conda
         packages:
           - scikit-build-core>=0.7.0
@@ -312,27 +312,25 @@ dependencies:
         packages:
           # Hard pin the Arrow patch version used during the build. This must
           # be kept in sync with the version pinned in get_arrow.cmake.
-          - libarrow-acero==14.0.2.*
-          - libarrow-dataset==14.0.2.*
-          - libarrow==14.0.2.*
-          - libparquet==14.0.2.*
+          - libarrow-acero==16.0.0.*
+          - libarrow-dataset==16.0.0.*
+          - libarrow==16.0.0.*
+          - libparquet==16.0.0.*
   libarrow_run:
     common:
       - output_types: conda
         packages:
           # Allow runtime version to float up to minor version
-          # Disallow libarrow 14.0.0 due to a CVE
-          - libarrow-acero>=14.0.1,<15.0.0a0
-          - libarrow-dataset>=14.0.1,<15.0.0a0
-          - libarrow>=14.0.1,<15.0.0a0
-          - libparquet>=14.0.1,<15.0.0a0
+          - libarrow-acero>=16.0.0,<17.0.0a0
+          - libarrow-dataset>=16.0.0,<17.0.0a0
+          - libarrow>=16.0.0,<17.0.0a0
+          - libparquet>=16.0.0,<17.0.0a0
   pyarrow_run:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
           # Allow runtime version to float up to minor version
-          # Disallow pyarrow 14.0.0 due to a CVE
-          - pyarrow>=14.0.1,<15.0.0a0
+          - pyarrow>=16.0.0,<17.0.0a0
   cuda_version:
     specific:
       - output_types: conda
@@ -631,7 +629,7 @@ dependencies:
         packages:
           - msgpack
           - &tokenizers tokenizers==0.15.2
-          - &transformers transformers==4.38.1
+          - &transformers transformers==4.39.3
           - tzdata
     specific:
       - output_types: conda
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index e7f1ad0751f..dd1e59acaaa 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -993,15 +993,10 @@ def to_parquet(
         if index is None:
             index = True
 
-        # Convert partition_file_name to a call back
-        if partition_file_name:
-            partition_file_name = lambda x: partition_file_name  # noqa: E731
-
         pa_table = df.to_arrow(preserve_index=index)
         return pq.write_to_dataset(
             pa_table,
             root_path=path,
-            partition_filename_cb=partition_file_name,
             partition_cols=partition_cols,
             *args,
             **kwargs,
diff --git a/python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet b/python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet
index 20ef3cc5578426c7fa4fabe32692f43d3cd1c66c..efde6ff11bf97254c84827f1fe2703035fae4c34 100644
GIT binary patch
literal 2323
zcmcguL2u$l6dnj6v}z=}!U!Y|$l67fO0qy8Evxh}kN~EH1hTP#)e3oR17>YQz=p6<
zj=l9z^|Z%TkL{(uqsN|BJyz<ir=EK38#^W;B!?c_NyamN@6GqV_uf2<Q`}55J|@mQ
zt-*qxAQU|K@cSPZv2Qj}<=s!IG^`9_LQEtHk09e`{IN*D)kFfWZ5Vy_fD|G0@l!86
zLmtoOkKV*@o*3hc=8!)ajd(=H8xDnk@q~Z%Br1u(zk!V=DOtu#sRdy{2;Sd&7`X-{
zX%MFfZF`VsJAk|}H(>f;Zo>4#+}aKt1mF-IA2|y}f+XAx($z#<Q$&MYmjXcWzzNc~
z8O*$#y#q<&=gr$tKkar^Lsl-knle+gZmQe0L;v6nzxO7-_a(mDfZ*4GA943k@X>=D
ziuk9}K+#3U2ibV$g#<qf)xLcaOWG0IqIDuD9E!%G$^Wo2|7#|~Kls9-L?HavMhKy3
zGU6YLhAd7ae{Wo|P6?!uX6eS1eGz}fu9by-os6@Ng;O!bHoRgDQCHxIVQCsz^i)NY
zyQ?zCQP*6IsUd3_sx*=n7OV|RA1{`Q5g96yN#(j?in3^mV1C6a2AHcxpPoH`S=#xM
zy}rIBv&w=M>G2g3j_4i-09Q;Eh_8`wHT&6D*7Wr~GBn8RByd^{aMj;>>cGi8|Egi`
zXV;xC#di{5`?TYoD55!<*g4TuG7%>Wnv(HCoJ>Y@3N}+FQ*~reX?lM*wVPsrtn2!Y
zGP4VR>@|vff}70~;f2#2kw3Sd)A4z~h%s)!q~mLhv$)X5C5-#H*tcbc!is)Qg-1*M
zA*Scy9fpl!>p6_)7|y|JgTU$<wP9D7w}srBcIz19(>|CLzTkAt>Xi9xD^s753U3nQ
zSUph-gwygp5o|lHL64OAOh>rv9hdSwPD@R>Vs%I|muVv8TpLYVaQj@z1iajU2<H^T
zl7-7@t%%i*u4$y)>=C_WcFw@AoJ*7PoZ{T@srTfHHn(-fYMSU8;5Ri+o%aDRO<ILq
zYp<D;$`lu?BvZZU2=$bsPiu0y1Ti%S+;G%a7FCVt^;xG3aZygiD$gIa&68p^-{V)f
zzoquV-c()kDpm&$W@ilXzG-FLSg_hY;~wKneFF!L#zt)*?(|&Y^C_iO(5MeiZw}-#
zpAv<%Rv$Kr+^WkDaVG{pB^BUUxx?u;SGp+h(@rUGwE@S*eXK?_Z$@E2z^|0P%R{r|
z!}d+y*7FMWPSuAEmDY?1e5x%_-bc$?y7`lI9_0DOO}?k1P+hcJ)tt@ADCA(jAbXOH
zBkTM^%4jp9(pqqrbu~M$4YArgTj7}Lw5o4KA#d3jL(V!@vkT5h(d+A6<&t~<F`k`5
z|IpV4XW#sqZREBR+;_T#yZX@mrKZq0&;Wht2HGb#<J?{wxX_^bGR6`<q$21Nr^#U7
z7}-uhNnh;ea$7#4{R?<)x;mqeze17tSe#r~in$|=$2;~%B!2Ueh&!K;XflQn{G-wq
K{BD4sL;nCsZEHdR

delta 361
zcmbO%^n!ywz%j^BltolQRK*8Ku}tKbXJnbEFRjkNz`&)#1SCZm8Ch2`scmAE5oM7y
zWno~GlweF0WfBt+Tf``~YO*|&As+(+Q0oFl(G(w15ug!Q>{(ev*+khUPh=8T0Gc3R
z12Kn%Rlq2uYz9z7?EnkRw8^<llH41Z#C|ZUonlry!35N#BLg&$Nz%rYL4vI~IU_YU
zQIrK}gDMlyVpWmJFBlcX@wkSUpz$IM3}TCz#Y7k+WhB@N67y0LizN<je!(Qk$e1^o
ziKQ<9A{JkkT2!2wpQmJ{pk$zDWTI!JRLdZtRFGIySeja*n_N(!09B!2sAsBYs2l9(
V7!d5?A0i3z01z=e2KpNmz5p+6N@V~5

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 20e9f41de63..8550bc91253 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -2824,13 +2824,7 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type):
     ]
     pa_chunk_array = pa.chunked_array(np_list_data)
 
-    expect = pd.Series(pa_chunk_array.to_pandas())
-    if cudf.api.types.is_datetime64_dtype(
-        data_type
-    ) or cudf.api.types.is_timedelta64_dtype(data_type):
-        # Workaround for an Arrow Bug:
-        # https://github.com/apache/arrow/issues/34462
-        expect = expect.astype(data_type)
+    expect = pa_chunk_array.to_pandas()
     got = cudf.Series(pa_chunk_array)
 
     assert_eq(expect, got)
@@ -2845,12 +2839,6 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type):
     )
 
     expect = pa_table.to_pandas()
-    if cudf.api.types.is_datetime64_dtype(
-        data_type
-    ) or cudf.api.types.is_timedelta64_dtype(data_type):
-        # Workaround for an Arrow Bug:
-        # https://github.com/apache/arrow/issues/34462
-        expect = expect.astype(data_type)
     got = cudf.DataFrame.from_arrow(pa_table)
 
     assert_eq(expect, got)
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 0b252cec4b8..3cc6bfdbdc2 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -1523,13 +1523,7 @@ def test_index_from_arrow(data):
     arrow_array = pa.Array.from_pandas(pdi)
     expected_index = pd.Index(arrow_array.to_pandas())
     gdi = cudf.Index.from_arrow(arrow_array)
-    if gdi.dtype == cudf.dtype("datetime64[s]"):
-        # Arrow bug:
-        # https://github.com/apache/arrow/issues/33321
-        # arrow cannot convert non-nanosecond
-        # resolution to appropriate type in pandas.
-        # Hence need to type-cast.
-        expected_index = expected_index.astype(gdi.dtype)
+
     assert_eq(expected_index, gdi)
 
 
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 1e175f5ff0d..cf3c0e7f7a0 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -472,9 +472,7 @@ def test_parquet_read_filtered(tmpdir, rdg_seed):
     # Because of this, we aren't using PyArrow as a reference for testing our
     # row-group selection method since the only way to only select row groups
     # with PyArrow is with the method we use and intend to test.
-    tbl_filtered = pq.read_table(
-        fname, filters=[("1", ">", 60)], use_legacy_dataset=False
-    )
+    tbl_filtered = pq.read_table(fname, filters=[("1", ">", 60)])
 
     assert_eq(cudf.io.read_parquet_metadata(fname)[1], 2048 / 64)
     print(len(df_filtered))
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 6bd7558d322..9c7c687a6ed 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -247,7 +247,8 @@
     File name to use for partitioned datasets. Different partitions
     will be written to different directories, but all files will
     have this name.  If nothing is specified, a random uuid4 hex string
-    will be used for each file.
+    will be used for each file. This parameter is only supported by the
+    'cudf' engine and is ignored by other engines.
 partition_offsets : list, optional, default None
     Offsets to partition the dataframe by. Should be used when path is list
     of str. Should be a list of integers of size ``len(path) + 1``
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index fc3a243572f..4b57bcd018a 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -7,7 +7,7 @@ requires = [
     "cython>=3.0.3",
     "ninja",
     "numpy==1.23.*",
-    "pyarrow==14.0.2.*",
+    "pyarrow==16.0.0.*",
     "rmm==24.6.*",
     "scikit-build-core[pyproject]>=0.7.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -34,7 +34,7 @@ dependencies = [
     "packaging",
     "pandas>=2.0,<2.2.3dev0",
     "ptxcompiler",
-    "pyarrow>=14.0.1,<15.0.0a0",
+    "pyarrow>=16.0.0,<17.0.0a0",
     "rich",
     "rmm==24.6.*",
     "typing_extensions>=4.0.0",
@@ -63,7 +63,7 @@ test = [
     "pytest<8",
     "scipy",
     "tokenizers==0.15.2",
-    "transformers==4.38.1",
+    "transformers==4.39.3",
     "tzdata",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 pandas-tests = [
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index eb48852202a..787dd8a97d7 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -7,7 +7,7 @@ requires = [
     "cython>=3.0.3",
     "ninja",
     "numpy==1.23.*",
-    "pyarrow==14.0.2.*",
+    "pyarrow==16.0.0.*",
     "scikit-build-core[pyproject]>=0.7.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index 2c44f192612..39800145585 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -166,9 +166,7 @@ def test_dask_timeseries_from_pandas(tmpdir):
     pdf = ddf2.compute()
     pdf.to_parquet(fn, engine="pyarrow")
     read_df = dask_cudf.read_parquet(fn)
-    # Workaround until following issue is fixed:
-    # https://github.com/apache/arrow/issues/33321
-    dd.assert_eq(ddf2, read_df.compute(), check_index_type=False)
+    dd.assert_eq(ddf2, read_df.compute())
 
 
 @pytest.mark.parametrize("index", [False, None])