From 3481042d5d1a1f511515cf23f36c43620ad6663e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 9 May 2024 12:18:31 -0500 Subject: [PATCH] Upgrade `arrow` to `16` (#15703) This PR upgrades `arrow` to `16`. This PR also contains fixes to pytests because of breaking API changes in pyarrow. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/cudf/pull/15703 --- .../all_cuda-118_arch-x86_64.yaml | 12 ++++----- .../all_cuda-122_arch-x86_64.yaml | 12 ++++----- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/libcudf/conda_build_config.yaml | 2 +- cpp/cmake/thirdparty/get_arrow.cmake | 2 +- dependencies.yaml | 24 ++++++++---------- python/cudf/cudf/io/parquet.py | 5 ---- .../tests/data/parquet/usec_timestamp.parquet | Bin 1128 -> 2323 bytes python/cudf/cudf/tests/test_dataframe.py | 14 +--------- python/cudf/cudf/tests/test_index.py | 8 +----- python/cudf/cudf/tests/test_parquet.py | 4 +-- python/cudf/cudf/utils/ioutils.py | 3 ++- python/cudf/pyproject.toml | 6 ++--- python/cudf_kafka/pyproject.toml | 2 +- .../dask_cudf/io/tests/test_parquet.py | 4 +-- 15 files changed, 36 insertions(+), 64 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 7a5fef9f25e..48699b81eed 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -36,15 +36,15 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow-acero==14.0.2.* -- libarrow-dataset==14.0.2.* -- libarrow==14.0.2.* +- libarrow-acero==16.0.0.* +- libarrow-dataset==16.0.0.* +- libarrow==16.0.0.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 - libkvikio==24.6.* -- libparquet==14.0.2.* +- libparquet==16.0.0.* - librdkafka>=1.9.0,<1.10.0a0 - librmm==24.6.* - make @@ -66,7 +66,7 @@ dependencies: - pip - pre-commit - ptxcompiler -- pyarrow==14.0.2.* +- pyarrow==16.0.0.* - pydata-sphinx-theme!=0.14.2 - pytest-benchmark - pytest-cases>=3.8.2 @@ -92,7 +92,7 @@ dependencies: - streamz - sysroot_linux-64==2.17 - tokenizers==0.15.2 -- transformers==4.38.1 +- transformers==4.39.3 - typing_extensions>=4.0.0 - zlib>=1.2.13 - pip: diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 48453e18bb0..d06a727f331 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -37,13 +37,13 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow-acero==14.0.2.* -- libarrow-dataset==14.0.2.* -- libarrow==14.0.2.* +- libarrow-acero==16.0.0.* +- libarrow-dataset==16.0.0.* +- libarrow==16.0.0.* - libcufile-dev - libcurand-dev - libkvikio==24.6.* -- libparquet==14.0.2.* +- libparquet==16.0.0.* - librdkafka>=1.9.0,<1.10.0a0 - librmm==24.6.* - make @@ -63,7 +63,7 @@ dependencies: - pandoc - pip - pre-commit -- pyarrow==14.0.2.* +- pyarrow==16.0.0.* - pydata-sphinx-theme!=0.14.2 - pynvjitlink - pytest-benchmark @@ -90,7 +90,7 @@ dependencies: - streamz - sysroot_linux-64==2.17 - tokenizers==0.15.2 -- transformers==4.38.1 +- transformers==4.39.3 - typing_extensions>=4.0.0 - zlib>=1.2.13 - pip: diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index ddcadfd1570..24210830ada 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -65,7 +65,7 @@ requirements: - setuptools - dlpack >=0.8,<1.0 - numpy 1.23 - - pyarrow ==14.0.2.* + - pyarrow ==16.0.0.* - libcudf ={{ version }} - rmm ={{ minor_version }} {% if cuda_major == "11" %} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index ba5e96fb6cf..61ffcf3c3de 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -20,7 +20,7 @@ cmake_version: - ">=3.26.4" libarrow_version: - - "==14.0.2" + - "==16.0.0" dlpack_version: - ">=0.8,<1.0" diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 892056959c8..70283efbd79 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -410,7 +410,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) set(CUDF_VERSION_Arrow # This version must be kept in sync with the libarrow version pinned for builds in # dependencies.yaml. - 14.0.2 + 16.0.0 CACHE STRING "The version of Arrow to find (or build)" ) endif() diff --git a/dependencies.yaml b/dependencies.yaml index 1508656471d..7fe67817f73 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -266,7 +266,7 @@ dependencies: - cython>=3.0.3 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - - pyarrow==14.0.2.* + - pyarrow==16.0.0.* - output_types: conda packages: - scikit-build-core>=0.7.0 @@ -312,27 +312,25 @@ dependencies: packages: # Hard pin the Arrow patch version used during the build. This must # be kept in sync with the version pinned in get_arrow.cmake. - - libarrow-acero==14.0.2.* - - libarrow-dataset==14.0.2.* - - libarrow==14.0.2.* - - libparquet==14.0.2.* + - libarrow-acero==16.0.0.* + - libarrow-dataset==16.0.0.* + - libarrow==16.0.0.* + - libparquet==16.0.0.* libarrow_run: common: - output_types: conda packages: # Allow runtime version to float up to minor version - # Disallow libarrow 14.0.0 due to a CVE - - libarrow-acero>=14.0.1,<15.0.0a0 - - libarrow-dataset>=14.0.1,<15.0.0a0 - - libarrow>=14.0.1,<15.0.0a0 - - libparquet>=14.0.1,<15.0.0a0 + - libarrow-acero>=16.0.0,<17.0.0a0 + - libarrow-dataset>=16.0.0,<17.0.0a0 + - libarrow>=16.0.0,<17.0.0a0 + - libparquet>=16.0.0,<17.0.0a0 pyarrow_run: common: - output_types: [conda, requirements, pyproject] packages: # Allow runtime version to float up to minor version - # Disallow pyarrow 14.0.0 due to a CVE - - pyarrow>=14.0.1,<15.0.0a0 + - pyarrow>=16.0.0,<17.0.0a0 cuda_version: specific: - output_types: conda @@ -631,7 +629,7 @@ dependencies: packages: - msgpack - &tokenizers tokenizers==0.15.2 - - &transformers transformers==4.38.1 + - &transformers transformers==4.39.3 - tzdata specific: - output_types: conda diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index e7f1ad0751f..dd1e59acaaa 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -993,15 +993,10 @@ def to_parquet( if index is None: index = True - # Convert partition_file_name to a call back - if partition_file_name: - partition_file_name = lambda x: partition_file_name # noqa: E731 - pa_table = df.to_arrow(preserve_index=index) return pq.write_to_dataset( pa_table, root_path=path, - partition_filename_cb=partition_file_name, partition_cols=partition_cols, *args, **kwargs, diff --git a/python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet b/python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet index 20ef3cc5578426c7fa4fabe32692f43d3cd1c66c..efde6ff11bf97254c84827f1fe2703035fae4c34 100644 GIT binary patch literal 2323 zcmcguL2u$l6dnj6v}z=}!U!Y|$l67fO0qy8Evxh}kN~EH1hTP#)e3oR17>YQz=p6< zj=l9z^|Z%TkL{(uqsN|BJyzf;Zo>4#+}aKt1mF-IA2|y}f+XAx($z#=D ziuk9}K+#3U2ibV$g#G2g3j_4i-09Q;Eh_8`wHT&6D*7Wr~GBn8RByd^{aMj;>>cGi8|Egi` zXV;xC#di{5`?TYoD55!<*g4TuG7%>Wnv(HCoJ>Y@3N}+FQ*~reX?lM*wVPsrtn2!Y zGP4VR>@|vff}70~;f2#2kw3Sd)A4z~h%s)!q~mLhv$)X5C5-#H*tcbc!is)Qg-1*M zA*Scy9fpl!>p6_)7|y|JgTU$|CLzTkAt>Xi9xD^s753U3nQ zSUph-gwygp5o|lHL64OAOh>rv9hdSwPD@R>Vs%I|muVv8TpLYVaQj@z1iajU2=C_WcFw@AoJ*7PoZ{T@srTfHHn(-fYMSU8;5Ri+o%aDRO{(ev*+khUPh=8T0Gc3R z12Kn%Rlq2uYz9z7?EnkRw8^", 60)], use_legacy_dataset=False - ) + tbl_filtered = pq.read_table(fname, filters=[("1", ">", 60)]) assert_eq(cudf.io.read_parquet_metadata(fname)[1], 2048 / 64) print(len(df_filtered)) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 6bd7558d322..9c7c687a6ed 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -247,7 +247,8 @@ File name to use for partitioned datasets. Different partitions will be written to different directories, but all files will have this name. If nothing is specified, a random uuid4 hex string - will be used for each file. + will be used for each file. This parameter is only supported by 'cudf' + engine, and will be ignored by other engines. partition_offsets : list, optional, default None Offsets to partition the dataframe by. Should be used when path is list of str. Should be a list of integers of size ``len(path) + 1`` diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index fc3a243572f..4b57bcd018a 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -7,7 +7,7 @@ requires = [ "cython>=3.0.3", "ninja", "numpy==1.23.*", - "pyarrow==14.0.2.*", + "pyarrow==16.0.0.*", "rmm==24.6.*", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -34,7 +34,7 @@ dependencies = [ "packaging", "pandas>=2.0,<2.2.3dev0", "ptxcompiler", - "pyarrow>=14.0.1,<15.0.0a0", + "pyarrow>=16.0.0,<17.0.0a0", "rich", "rmm==24.6.*", "typing_extensions>=4.0.0", @@ -63,7 +63,7 @@ test = [ "pytest<8", "scipy", "tokenizers==0.15.2", - "transformers==4.38.1", + "transformers==4.39.3", "tzdata", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. pandas-tests = [ diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index eb48852202a..787dd8a97d7 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -7,7 +7,7 @@ requires = [ "cython>=3.0.3", "ninja", "numpy==1.23.*", - "pyarrow==14.0.2.*", + "pyarrow==16.0.0.*", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 2c44f192612..39800145585 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -166,9 +166,7 @@ def test_dask_timeseries_from_pandas(tmpdir): pdf = ddf2.compute() pdf.to_parquet(fn, engine="pyarrow") read_df = dask_cudf.read_parquet(fn) - # Workaround until following issue is fixed: - # https://github.com/apache/arrow/issues/33321 - dd.assert_eq(ddf2, read_df.compute(), check_index_type=False) + dd.assert_eq(ddf2, read_df.compute()) @pytest.mark.parametrize("index", [False, None])