Upgrade arrow to 16 (#15703)
This PR upgrades `arrow` to `16`. It also fixes pytests that were affected by breaking API changes in pyarrow.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ray Douglass (https://github.com/raydouglass)
  - Richard (Rick) Zamora (https://github.com/rjzamora)

URL: #15703
galipremsagar authored May 9, 2024
1 parent 69fe213 commit 3481042
Showing 15 changed files with 36 additions and 64 deletions.
12 changes: 6 additions & 6 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -36,15 +36,15 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
-- libarrow-acero==14.0.2.*
-- libarrow-dataset==14.0.2.*
-- libarrow==14.0.2.*
+- libarrow-acero==16.0.0.*
+- libarrow-dataset==16.0.0.*
+- libarrow==16.0.0.*
 - libcufile-dev=1.4.0.31
 - libcufile=1.4.0.31
 - libcurand-dev=10.3.0.86
 - libcurand=10.3.0.86
 - libkvikio==24.6.*
-- libparquet==14.0.2.*
+- libparquet==16.0.0.*
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.6.*
 - make
@@ -66,7 +66,7 @@ dependencies:
 - pip
 - pre-commit
 - ptxcompiler
-- pyarrow==14.0.2.*
+- pyarrow==16.0.0.*
 - pydata-sphinx-theme!=0.14.2
 - pytest-benchmark
 - pytest-cases>=3.8.2
@@ -92,7 +92,7 @@ dependencies:
 - streamz
 - sysroot_linux-64==2.17
 - tokenizers==0.15.2
-- transformers==4.38.1
+- transformers==4.39.3
 - typing_extensions>=4.0.0
 - zlib>=1.2.13
 - pip:

12 changes: 6 additions & 6 deletions conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -37,13 +37,13 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
-- libarrow-acero==14.0.2.*
-- libarrow-dataset==14.0.2.*
-- libarrow==14.0.2.*
+- libarrow-acero==16.0.0.*
+- libarrow-dataset==16.0.0.*
+- libarrow==16.0.0.*
 - libcufile-dev
 - libcurand-dev
 - libkvikio==24.6.*
-- libparquet==14.0.2.*
+- libparquet==16.0.0.*
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.6.*
 - make
@@ -63,7 +63,7 @@ dependencies:
 - pandoc
 - pip
 - pre-commit
-- pyarrow==14.0.2.*
+- pyarrow==16.0.0.*
 - pydata-sphinx-theme!=0.14.2
 - pynvjitlink
 - pytest-benchmark
@@ -90,7 +90,7 @@ dependencies:
 - streamz
 - sysroot_linux-64==2.17
 - tokenizers==0.15.2
-- transformers==4.38.1
+- transformers==4.39.3
 - typing_extensions>=4.0.0
 - zlib>=1.2.13
 - pip:

2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
@@ -65,7 +65,7 @@ requirements:
     - setuptools
     - dlpack >=0.8,<1.0
     - numpy 1.23
-    - pyarrow ==14.0.2.*
+    - pyarrow ==16.0.0.*
     - libcudf ={{ version }}
     - rmm ={{ minor_version }}
 {% if cuda_major == "11" %}

2 changes: 1 addition & 1 deletion conda/recipes/libcudf/conda_build_config.yaml
@@ -20,7 +20,7 @@ cmake_version:
   - ">=3.26.4"

 libarrow_version:
-  - "==14.0.2"
+  - "==16.0.0"

 dlpack_version:
   - ">=0.8,<1.0"

2 changes: 1 addition & 1 deletion cpp/cmake/thirdparty/get_arrow.cmake
@@ -410,7 +410,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow)
   set(CUDF_VERSION_Arrow
     # This version must be kept in sync with the libarrow version pinned for builds in
     # dependencies.yaml.
-    14.0.2
+    16.0.0
     CACHE STRING "The version of Arrow to find (or build)"
   )
 endif()

24 changes: 11 additions & 13 deletions dependencies.yaml
@@ -266,7 +266,7 @@ dependencies:
       - cython>=3.0.3
       # Hard pin the patch version used during the build. This must be kept
       # in sync with the version pinned in get_arrow.cmake.
-      - pyarrow==14.0.2.*
+      - pyarrow==16.0.0.*
     - output_types: conda
       packages:
         - scikit-build-core>=0.7.0
@@ -312,27 +312,25 @@ dependencies:
       packages:
         # Hard pin the Arrow patch version used during the build. This must
         # be kept in sync with the version pinned in get_arrow.cmake.
-        - libarrow-acero==14.0.2.*
-        - libarrow-dataset==14.0.2.*
-        - libarrow==14.0.2.*
-        - libparquet==14.0.2.*
+        - libarrow-acero==16.0.0.*
+        - libarrow-dataset==16.0.0.*
+        - libarrow==16.0.0.*
+        - libparquet==16.0.0.*
 libarrow_run:
   common:
     - output_types: conda
       packages:
         # Allow runtime version to float up to minor version
-        # Disallow libarrow 14.0.0 due to a CVE
-        - libarrow-acero>=14.0.1,<15.0.0a0
-        - libarrow-dataset>=14.0.1,<15.0.0a0
-        - libarrow>=14.0.1,<15.0.0a0
-        - libparquet>=14.0.1,<15.0.0a0
+        - libarrow-acero>=16.0.0,<17.0.0a0
+        - libarrow-dataset>=16.0.0,<17.0.0a0
+        - libarrow>=16.0.0,<17.0.0a0
+        - libparquet>=16.0.0,<17.0.0a0
 pyarrow_run:
   common:
     - output_types: [conda, requirements, pyproject]
       packages:
         # Allow runtime version to float up to minor version
-        # Disallow pyarrow 14.0.0 due to a CVE
-        - pyarrow>=14.0.1,<15.0.0a0
+        - pyarrow>=16.0.0,<17.0.0a0
@@ -631,7 +629,7 @@ dependencies:
       packages:
         - msgpack
        - &tokenizers tokenizers==0.15.2
-       - &transformers transformers==4.38.1
+       - &transformers transformers==4.39.3
        - tzdata
   specific:
     - output_types: conda

5 changes: 0 additions & 5 deletions python/cudf/cudf/io/parquet.py
@@ -993,15 +993,10 @@ def to_parquet(
     if index is None:
         index = True

-    # Convert partition_file_name to a call back
-    if partition_file_name:
-        partition_file_name = lambda x: partition_file_name  # noqa: E731
-
     pa_table = df.to_arrow(preserve_index=index)
     return pq.write_to_dataset(
         pa_table,
         root_path=path,
-        partition_filename_cb=partition_file_name,
         partition_cols=partition_cols,
         *args,
         **kwargs,

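Context for this change: newer pyarrow releases removed the legacy Parquet dataset code path, and with it the `partition_filename_cb` keyword of `pq.write_to_dataset`, so the pyarrow engine can no longer forward `partition_file_name`. A minimal sketch of the replacement-style call under pyarrow >= 16 — the output directory, column names, and `basename_template` value are illustrative, not taken from this PR:

```python
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.Table.from_pandas(
    pd.DataFrame({"year": [2023, 2023, 2024], "value": [1.0, 2.0, 3.0]})
)

# The dataset writer controls file naming with basename_template;
# "{i}" is replaced by an auto-incrementing counter per partition directory.
pq.write_to_dataset(
    table,
    root_path="dataset_out",  # hypothetical output directory
    partition_cols=["year"],
    basename_template="part-{i}.parquet",
)
```
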
Binary file modified python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet
14 changes: 1 addition & 13 deletions python/cudf/cudf/tests/test_dataframe.py
@@ -2824,13 +2824,7 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type):
     ]
     pa_chunk_array = pa.chunked_array(np_list_data)

-    expect = pd.Series(pa_chunk_array.to_pandas())
-    if cudf.api.types.is_datetime64_dtype(
-        data_type
-    ) or cudf.api.types.is_timedelta64_dtype(data_type):
-        # Workaround for an Arrow Bug:
-        # https://github.com/apache/arrow/issues/34462
-        expect = expect.astype(data_type)
+    expect = pa_chunk_array.to_pandas()
     got = cudf.Series(pa_chunk_array)

     assert_eq(expect, got)
@@ -2845,12 +2839,6 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type):
     )

     expect = pa_table.to_pandas()
-    if cudf.api.types.is_datetime64_dtype(
-        data_type
-    ) or cudf.api.types.is_timedelta64_dtype(data_type):
-        # Workaround for an Arrow Bug:
-        # https://github.com/apache/arrow/issues/34462
-        expect = expect.astype(data_type)
     got = cudf.DataFrame.from_arrow(pa_table)

     assert_eq(expect, got)

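The deleted branches worked around apache/arrow#34462, where `to_pandas()` coerced non-nanosecond timestamps to `datetime64[ns]`. That bug is fixed in the Arrow versions this PR targets, so the expected values can be built directly. A quick illustration, assuming pyarrow >= 16 and pandas >= 2.0:

```python
import pyarrow as pa

# Second-resolution timestamps now round-trip to pandas with their
# original unit instead of being coerced to nanoseconds.
chunked = pa.chunked_array([pa.array([0, 1, 2], type=pa.timestamp("s"))])
print(chunked.to_pandas().dtype)  # datetime64[s]
```
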
8 changes: 1 addition & 7 deletions python/cudf/cudf/tests/test_index.py
@@ -1523,13 +1523,7 @@ def test_index_from_arrow(data):
     arrow_array = pa.Array.from_pandas(pdi)
     expected_index = pd.Index(arrow_array.to_pandas())
     gdi = cudf.Index.from_arrow(arrow_array)
-    if gdi.dtype == cudf.dtype("datetime64[s]"):
-        # Arrow bug:
-        # https://github.com/apache/arrow/issues/33321
-        # arrow cannot convert non-nanosecond
-        # resolution to appropriate type in pandas.
-        # Hence need to type-cast.
-        expected_index = expected_index.astype(gdi.dtype)
+
     assert_eq(expected_index, gdi)

4 changes: 1 addition & 3 deletions python/cudf/cudf/tests/test_parquet.py
@@ -472,9 +472,7 @@ def test_parquet_read_filtered(tmpdir, rdg_seed):
     # Because of this, we aren't using PyArrow as a reference for testing our
     # row-group selection method since the only way to only select row groups
     # with PyArrow is with the method we use and intend to test.
-    tbl_filtered = pq.read_table(
-        fname, filters=[("1", ">", 60)], use_legacy_dataset=False
-    )
+    tbl_filtered = pq.read_table(fname, filters=[("1", ">", 60)])

     assert_eq(cudf.io.read_parquet_metadata(fname)[1], 2048 / 64)
     print(len(df_filtered))

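`use_legacy_dataset` has been removed from `pq.read_table`; the dataset-based reader is now the only implementation, so predicate filters need no opt-in flag. A sketch with a hypothetical file and column name:

```python
import pyarrow.dataset as ds
import pyarrow.parquet as pq

# Filters can still be passed as DNF tuples ...
table = pq.read_table("data.parquet", filters=[("value", ">", 60)])

# ... or as a pyarrow.dataset expression.
table = pq.read_table("data.parquet", filters=ds.field("value") > 60)
```
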
3 changes: 2 additions & 1 deletion python/cudf/cudf/utils/ioutils.py
@@ -247,7 +247,8 @@
     File name to use for partitioned datasets. Different partitions
     will be written to different directories, but all files will
     have this name. If nothing is specified, a random uuid4 hex string
-    will be used for each file.
+    will be used for each file. This parameter is only supported by 'cudf'
+    engine, and will be ignored by other engines.
 partition_offsets : list, optional, default None
     Offsets to partition the dataframe by. Should be used when path is list
     of str. Should be a list of integers of size ``len(path) + 1``

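For illustration, a sketch of how `partition_file_name` behaves after this change — the frame, column, and output directory below are made up:

```python
import cudf

df = cudf.DataFrame({"year": [2023, 2024], "value": [1.0, 2.0]})

# Honored by the default 'cudf' engine: each partition directory gets a
# file with this name. With engine="pyarrow" the keyword is now ignored,
# since pyarrow 16 dropped the partition_filename_cb hook it mapped to.
df.to_parquet(
    "out_dir",
    partition_cols=["year"],
    partition_file_name="chunk",
)
```
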
6 changes: 3 additions & 3 deletions python/cudf/pyproject.toml
@@ -7,7 +7,7 @@ requires = [
     "cython>=3.0.3",
     "ninja",
     "numpy==1.23.*",
-    "pyarrow==14.0.2.*",
+    "pyarrow==16.0.0.*",
     "rmm==24.6.*",
     "scikit-build-core[pyproject]>=0.7.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -34,7 +34,7 @@ dependencies = [
     "packaging",
     "pandas>=2.0,<2.2.3dev0",
     "ptxcompiler",
-    "pyarrow>=14.0.1,<15.0.0a0",
+    "pyarrow>=16.0.0,<17.0.0a0",
     "rich",
     "rmm==24.6.*",
     "typing_extensions>=4.0.0",
@@ -63,7 +63,7 @@ test = [
     "pytest<8",
     "scipy",
     "tokenizers==0.15.2",
-    "transformers==4.38.1",
+    "transformers==4.39.3",
     "tzdata",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 pandas-tests = [

2 changes: 1 addition & 1 deletion python/cudf_kafka/pyproject.toml
@@ -7,7 +7,7 @@ requires = [
     "cython>=3.0.3",
     "ninja",
     "numpy==1.23.*",
-    "pyarrow==14.0.2.*",
+    "pyarrow==16.0.0.*",
     "scikit-build-core[pyproject]>=0.7.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

4 changes: 1 addition & 3 deletions python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -166,9 +166,7 @@ def test_dask_timeseries_from_pandas(tmpdir):
     pdf = ddf2.compute()
     pdf.to_parquet(fn, engine="pyarrow")
     read_df = dask_cudf.read_parquet(fn)
-    # Workaround until following issue is fixed:
-    # https://github.com/apache/arrow/issues/33321
-    dd.assert_eq(ddf2, read_df.compute(), check_index_type=False)
+    dd.assert_eq(ddf2, read_df.compute())


 @pytest.mark.parametrize("index", [False, None])

