Upgrade arrow to 16 (#15703)
This PR upgrades `arrow` to `16`. It also fixes pytests that were affected by breaking API changes in pyarrow.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ray Douglass (https://github.com/raydouglass)
  - Richard (Rick) Zamora (https://github.com/rjzamora)

URL: #15703
galipremsagar authored May 9, 2024
1 parent 69fe213 commit 3481042
Showing 15 changed files with 36 additions and 64 deletions.
12 changes: 6 additions & 6 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -36,15 +36,15 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
-- libarrow-acero==14.0.2.*
-- libarrow-dataset==14.0.2.*
-- libarrow==14.0.2.*
+- libarrow-acero==16.0.0.*
+- libarrow-dataset==16.0.0.*
+- libarrow==16.0.0.*
 - libcufile-dev=1.4.0.31
 - libcufile=1.4.0.31
 - libcurand-dev=10.3.0.86
 - libcurand=10.3.0.86
 - libkvikio==24.6.*
-- libparquet==14.0.2.*
+- libparquet==16.0.0.*
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.6.*
 - make
@@ -66,7 +66,7 @@ dependencies:
 - pip
 - pre-commit
 - ptxcompiler
-- pyarrow==14.0.2.*
+- pyarrow==16.0.0.*
 - pydata-sphinx-theme!=0.14.2
 - pytest-benchmark
 - pytest-cases>=3.8.2
@@ -92,7 +92,7 @@ dependencies:
 - streamz
 - sysroot_linux-64==2.17
 - tokenizers==0.15.2
-- transformers==4.38.1
+- transformers==4.39.3
 - typing_extensions>=4.0.0
 - zlib>=1.2.13
 - pip:

12 changes: 6 additions & 6 deletions conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -37,13 +37,13 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
-- libarrow-acero==14.0.2.*
-- libarrow-dataset==14.0.2.*
-- libarrow==14.0.2.*
+- libarrow-acero==16.0.0.*
+- libarrow-dataset==16.0.0.*
+- libarrow==16.0.0.*
 - libcufile-dev
 - libcurand-dev
 - libkvikio==24.6.*
-- libparquet==14.0.2.*
+- libparquet==16.0.0.*
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.6.*
 - make
@@ -63,7 +63,7 @@ dependencies:
 - pandoc
 - pip
 - pre-commit
-- pyarrow==14.0.2.*
+- pyarrow==16.0.0.*
 - pydata-sphinx-theme!=0.14.2
 - pynvjitlink
 - pytest-benchmark
@@ -90,7 +90,7 @@ dependencies:
 - streamz
 - sysroot_linux-64==2.17
 - tokenizers==0.15.2
-- transformers==4.38.1
+- transformers==4.39.3
 - typing_extensions>=4.0.0
 - zlib>=1.2.13
 - pip:

2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
@@ -65,7 +65,7 @@ requirements:
     - setuptools
     - dlpack >=0.8,<1.0
     - numpy 1.23
-    - pyarrow ==14.0.2.*
+    - pyarrow ==16.0.0.*
     - libcudf ={{ version }}
     - rmm ={{ minor_version }}
 {% if cuda_major == "11" %}

2 changes: 1 addition & 1 deletion conda/recipes/libcudf/conda_build_config.yaml
@@ -20,7 +20,7 @@ cmake_version:
   - ">=3.26.4"

 libarrow_version:
-  - "==14.0.2"
+  - "==16.0.0"

 dlpack_version:
   - ">=0.8,<1.0"

2 changes: 1 addition & 1 deletion cpp/cmake/thirdparty/get_arrow.cmake
@@ -410,7 +410,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow)
   set(CUDF_VERSION_Arrow
     # This version must be kept in sync with the libarrow version pinned for builds in
     # dependencies.yaml.
-    14.0.2
+    16.0.0
     CACHE STRING "The version of Arrow to find (or build)"
   )
 endif()

24 changes: 11 additions & 13 deletions dependencies.yaml
@@ -266,7 +266,7 @@ dependencies:
       - cython>=3.0.3
       # Hard pin the patch version used during the build. This must be kept
       # in sync with the version pinned in get_arrow.cmake.
-      - pyarrow==14.0.2.*
+      - pyarrow==16.0.0.*
     - output_types: conda
       packages:
         - scikit-build-core>=0.7.0
@@ -312,27 +312,25 @@ dependencies:
       packages:
         # Hard pin the Arrow patch version used during the build. This must
         # be kept in sync with the version pinned in get_arrow.cmake.
-        - libarrow-acero==14.0.2.*
-        - libarrow-dataset==14.0.2.*
-        - libarrow==14.0.2.*
-        - libparquet==14.0.2.*
+        - libarrow-acero==16.0.0.*
+        - libarrow-dataset==16.0.0.*
+        - libarrow==16.0.0.*
+        - libparquet==16.0.0.*
 libarrow_run:
   common:
     - output_types: conda
       packages:
         # Allow runtime version to float up to minor version
-        # Disallow libarrow 14.0.0 due to a CVE
-        - libarrow-acero>=14.0.1,<15.0.0a0
-        - libarrow-dataset>=14.0.1,<15.0.0a0
-        - libarrow>=14.0.1,<15.0.0a0
-        - libparquet>=14.0.1,<15.0.0a0
+        - libarrow-acero>=16.0.0,<17.0.0a0
+        - libarrow-dataset>=16.0.0,<17.0.0a0
+        - libarrow>=16.0.0,<17.0.0a0
+        - libparquet>=16.0.0,<17.0.0a0
 pyarrow_run:
   common:
     - output_types: [conda, requirements, pyproject]
       packages:
         # Allow runtime version to float up to minor version
-        # Disallow pyarrow 14.0.0 due to a CVE
-        - pyarrow>=14.0.1,<15.0.0a0
+        - pyarrow>=16.0.0,<17.0.0a0
@@ -631,7 +629,7 @@ dependencies:
       packages:
         - msgpack
        - &tokenizers tokenizers==0.15.2
-       - &transformers transformers==4.38.1
+       - &transformers transformers==4.39.3
        - tzdata
   specific:
     - output_types: conda

5 changes: 0 additions & 5 deletions python/cudf/cudf/io/parquet.py
@@ -993,15 +993,10 @@ def to_parquet(
     if index is None:
         index = True

-    # Convert partition_file_name to a call back
-    if partition_file_name:
-        partition_file_name = lambda x: partition_file_name  # noqa: E731
-
     pa_table = df.to_arrow(preserve_index=index)
     return pq.write_to_dataset(
         pa_table,
         root_path=path,
-        partition_filename_cb=partition_file_name,
         partition_cols=partition_cols,
         *args,
         **kwargs,

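Context for this change: newer pyarrow releases removed the legacy Parquet dataset code path, and with it the `partition_filename_cb` keyword of `pq.write_to_dataset`, so the pyarrow engine can no longer forward `partition_file_name`. A minimal sketch of the replacement-style call under pyarrow >= 16 — the output directory, column names, and `basename_template` value are illustrative, not taken from this PR:

```python
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.Table.from_pandas(
    pd.DataFrame({"year": [2023, 2023, 2024], "value": [1.0, 2.0, 3.0]})
)

# The dataset writer controls file naming with basename_template;
# "{i}" is replaced by an auto-incrementing counter per partition directory.
pq.write_to_dataset(
    table,
    root_path="dataset_out",  # hypothetical output directory
    partition_cols=["year"],
    basename_template="part-{i}.parquet",
)
```
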
Binary file modified python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet
14 changes: 1 addition & 13 deletions python/cudf/cudf/tests/test_dataframe.py
@@ -2824,13 +2824,7 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type):
     ]
     pa_chunk_array = pa.chunked_array(np_list_data)

-    expect = pd.Series(pa_chunk_array.to_pandas())
-    if cudf.api.types.is_datetime64_dtype(
-        data_type
-    ) or cudf.api.types.is_timedelta64_dtype(data_type):
-        # Workaround for an Arrow Bug:
-        # https://github.com/apache/arrow/issues/34462
-        expect = expect.astype(data_type)
+    expect = pa_chunk_array.to_pandas()
     got = cudf.Series(pa_chunk_array)

     assert_eq(expect, got)
@@ -2845,12 +2839,6 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type):
     )

     expect = pa_table.to_pandas()
-    if cudf.api.types.is_datetime64_dtype(
-        data_type
-    ) or cudf.api.types.is_timedelta64_dtype(data_type):
-        # Workaround for an Arrow Bug:
-        # https://github.com/apache/arrow/issues/34462
-        expect = expect.astype(data_type)
     got = cudf.DataFrame.from_arrow(pa_table)

     assert_eq(expect, got)

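The deleted branches worked around apache/arrow#34462, where `to_pandas()` coerced non-nanosecond timestamps to `datetime64[ns]`. That bug is fixed in the Arrow versions this PR targets, so the expected values can be built directly. A quick illustration, assuming pyarrow >= 16 and pandas >= 2.0:

```python
import pyarrow as pa

# Second-resolution timestamps now round-trip to pandas with their
# original unit instead of being coerced to nanoseconds.
chunked = pa.chunked_array([pa.array([0, 1, 2], type=pa.timestamp("s"))])
print(chunked.to_pandas().dtype)  # datetime64[s]
```
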
8 changes: 1 addition & 7 deletions python/cudf/cudf/tests/test_index.py
@@ -1523,13 +1523,7 @@ def test_index_from_arrow(data):
     arrow_array = pa.Array.from_pandas(pdi)
     expected_index = pd.Index(arrow_array.to_pandas())
     gdi = cudf.Index.from_arrow(arrow_array)
-    if gdi.dtype == cudf.dtype("datetime64[s]"):
-        # Arrow bug:
-        # https://github.com/apache/arrow/issues/33321
-        # arrow cannot convert non-nanosecond
-        # resolution to appropriate type in pandas.
-        # Hence need to type-cast.
-        expected_index = expected_index.astype(gdi.dtype)
+
     assert_eq(expected_index, gdi)

4 changes: 1 addition & 3 deletions python/cudf/cudf/tests/test_parquet.py
@@ -472,9 +472,7 @@ def test_parquet_read_filtered(tmpdir, rdg_seed):
     # Because of this, we aren't using PyArrow as a reference for testing our
     # row-group selection method since the only way to only select row groups
     # with PyArrow is with the method we use and intend to test.
-    tbl_filtered = pq.read_table(
-        fname, filters=[("1", ">", 60)], use_legacy_dataset=False
-    )
+    tbl_filtered = pq.read_table(fname, filters=[("1", ">", 60)])

     assert_eq(cudf.io.read_parquet_metadata(fname)[1], 2048 / 64)
     print(len(df_filtered))

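`use_legacy_dataset` has been removed from `pq.read_table`; the dataset-based reader is now the only implementation, so predicate filters need no opt-in flag. A sketch with a hypothetical file and column name:

```python
import pyarrow.dataset as ds
import pyarrow.parquet as pq

# Filters can still be passed as DNF tuples ...
table = pq.read_table("data.parquet", filters=[("value", ">", 60)])

# ... or as a pyarrow.dataset expression.
table = pq.read_table("data.parquet", filters=ds.field("value") > 60)
```
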
3 changes: 2 additions & 1 deletion python/cudf/cudf/utils/ioutils.py
@@ -247,7 +247,8 @@
     File name to use for partitioned datasets. Different partitions
     will be written to different directories, but all files will
     have this name. If nothing is specified, a random uuid4 hex string
-    will be used for each file.
+    will be used for each file. This parameter is only supported by 'cudf'
+    engine, and will be ignored by other engines.
 partition_offsets : list, optional, default None
     Offsets to partition the dataframe by. Should be used when path is list
     of str. Should be a list of integers of size ``len(path) + 1``

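For illustration, a sketch of how `partition_file_name` behaves after this change — the frame, column, and output directory below are made up:

```python
import cudf

df = cudf.DataFrame({"year": [2023, 2024], "value": [1.0, 2.0]})

# Honored by the default 'cudf' engine: each partition directory gets a
# file with this name. With engine="pyarrow" the keyword is now ignored,
# since pyarrow 16 dropped the partition_filename_cb hook it mapped to.
df.to_parquet(
    "out_dir",
    partition_cols=["year"],
    partition_file_name="chunk",
)
```
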
6 changes: 3 additions & 3 deletions python/cudf/pyproject.toml
@@ -7,7 +7,7 @@ requires = [
     "cython>=3.0.3",
     "ninja",
     "numpy==1.23.*",
-    "pyarrow==14.0.2.*",
+    "pyarrow==16.0.0.*",
     "rmm==24.6.*",
     "scikit-build-core[pyproject]>=0.7.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -34,7 +34,7 @@ dependencies = [
     "packaging",
     "pandas>=2.0,<2.2.3dev0",
     "ptxcompiler",
-    "pyarrow>=14.0.1,<15.0.0a0",
+    "pyarrow>=16.0.0,<17.0.0a0",
     "rich",
     "rmm==24.6.*",
     "typing_extensions>=4.0.0",
@@ -63,7 +63,7 @@ test = [
     "pytest<8",
     "scipy",
     "tokenizers==0.15.2",
-    "transformers==4.38.1",
+    "transformers==4.39.3",
     "tzdata",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 pandas-tests = [

2 changes: 1 addition & 1 deletion python/cudf_kafka/pyproject.toml
@@ -7,7 +7,7 @@ requires = [
     "cython>=3.0.3",
     "ninja",
     "numpy==1.23.*",
-    "pyarrow==14.0.2.*",
+    "pyarrow==16.0.0.*",
     "scikit-build-core[pyproject]>=0.7.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

4 changes: 1 addition & 3 deletions python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -166,9 +166,7 @@ def test_dask_timeseries_from_pandas(tmpdir):
     pdf = ddf2.compute()
     pdf.to_parquet(fn, engine="pyarrow")
     read_df = dask_cudf.read_parquet(fn)
-    # Workaround until following issue is fixed:
-    # https://github.com/apache/arrow/issues/33321
-    dd.assert_eq(ddf2, read_df.compute(), check_index_type=False)
+    dd.assert_eq(ddf2, read_df.compute())


 @pytest.mark.parametrize("index", [False, None])

