Relax Arrow pin (rapidsai#16681)
With this change, cudf users can install any version of pyarrow from 14.0 up to (but not including) 18.0. pyarrow 14 is the minimum version supporting the Arrow C Data Interface, which is a requirement for us (it may be possible to relax that floor in principle, but doing so would require changes to the cudf/pylibcudf code). A few tests are skipped under older pyarrow versions due to bugs or missing features.
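
As a hedged illustration (not code from this PR): the PyCapsule export methods of the Arrow C Data Interface first shipped in pyarrow 14.0, which is why 14 is the floor here. Exactly how cudf/pylibcudf consumes these capsules internally is an assumption; the sketch only shows that the interface exists once the relaxed pin is satisfied.

```python
# Hedged sketch: check that the installed pyarrow satisfies the relaxed pin
# and exposes the C Data Interface (PyCapsule) methods added in pyarrow 14.
# The internal cudf/pylibcudf call path is assumed, not shown.
import pyarrow as pa
from packaging import version

assert version.parse(pa.__version__) >= version.parse("14.0.0")

arr = pa.array([1, 2, 3])
# __arrow_c_array__ returns (schema_capsule, array_capsule) PyCapsules,
# the zero-copy handoff format of the Arrow C Data Interface.
schema_capsule, array_capsule = arr.__arrow_c_array__()
```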

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - James Lamb (https://github.com/jameslamb)

URL: rapidsai#16681
vyasr authored Aug 28, 2024
1 parent dba6c1f commit 925530a
Showing 10 changed files with 28 additions and 22 deletions.
1 change: 1 addition & 0 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -67,6 +67,7 @@ dependencies:
 - pandoc
 - pre-commit
 - ptxcompiler
+- pyarrow>=14.0.0,<18.0.0a0
 - pydata-sphinx-theme!=0.14.2
 - pytest-benchmark
 - pytest-cases>=3.8.2
1 change: 1 addition & 0 deletions conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -64,6 +64,7 @@ dependencies:
 - pandas>=2.0,<2.2.3dev0
 - pandoc
 - pre-commit
+- pyarrow>=14.0.0,<18.0.0a0
 - pydata-sphinx-theme!=0.14.2
 - pynvjitlink>=0.0.0a0
 - pytest-benchmark
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
@@ -82,7 +82,7 @@ requirements:
 - cupy >=12.0.0
 - numba >=0.57
 - numpy >=1.23,<3.0a0
-- pyarrow ==16.1.0.*
+- pyarrow>=14.0.0,<18.0.0a0
 - libcudf ={{ version }}
 - pylibcudf ={{ version }}
 - {{ pin_compatible('rmm', max_pin='x.x') }}
2 changes: 1 addition & 1 deletion conda/recipes/pylibcudf/meta.yaml
@@ -79,7 +79,7 @@ requirements:
 - typing_extensions >=4.0.0
 - pandas >=2.0,<2.2.3dev0
 - numpy >=1.23,<3.0a0
-- pyarrow ==16.1.0.*
+- pyarrow>=14.0.0,<18.0.0a0
 - {{ pin_compatible('rmm', max_pin='x.x') }}
 - fsspec >=0.6.0
 {% if cuda_major == "11" %}
12 changes: 2 additions & 10 deletions dependencies.yaml
@@ -19,6 +19,7 @@ files:
       - docs
       - notebooks
       - py_version
+      - pyarrow_run
       - rapids_build_skbuild
       - rapids_build_setuptools
       - run_common
@@ -46,7 +47,6 @@
     includes:
       - cuda_version
       - py_version
-      - pyarrow_run
      - test_python_common
       - test_python_cudf
       - test_python_dask_cudf
@@ -136,13 +136,6 @@ files:
       - build_base
       - build_cpp
       - depends_on_librmm
-  py_run_libcudf:
-    output: pyproject
-    pyproject_dir: python/libcudf
-    extras:
-      table: project
-    includes:
-      - pyarrow_run
   py_build_pylibcudf:
     output: pyproject
     pyproject_dir: python/pylibcudf
@@ -390,8 +383,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          # Allow runtime version to float up to patch version
-          - pyarrow>=16.1.0,<16.2.0a0
+          - pyarrow>=14.0.0,<18.0.0a0
   cuda_version:
     specific:
       - output_types: conda
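
The new pin everywhere is `pyarrow>=14.0.0,<18.0.0a0`. A hedged sketch (using the `packaging` library, not code from this PR) of why the upper bound is written `<18.0.0a0` rather than `<18.0.0`: the `a0` form also shuts out 18.0.0 pre-releases, which a plain `<18.0.0` bound would admit when pre-releases are considered (e.g. under `pip install --pre`).

```python
# Sketch only: compare the PR's pin against a plain upper bound.
from packaging.specifiers import SpecifierSet

pin = SpecifierSet(">=14.0.0,<18.0.0a0")
loose = SpecifierSet(">=14.0.0,<18.0.0")

for v in ["14.0.0", "17.0.0", "18.0.0a0", "18.0.0"]:
    # prereleases=True mimics resolving with pre-releases enabled
    print(v, pin.contains(v, prereleases=True), loose.contains(v, prereleases=True))
# 14.0.0   True  True
# 17.0.0   True  True
# 18.0.0a0 False True   (the a0 bound excludes pre-releases of 18)
# 18.0.0   False False
```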
24 changes: 20 additions & 4 deletions python/cudf/cudf/tests/test_parquet.py
@@ -515,10 +515,6 @@ def test_parquet_read_filtered_multiple_files(tmpdir):
     )
 
 
-@pytest.mark.skipif(
-    version.parse(pa.__version__) < version.parse("1.0.1"),
-    reason="pyarrow 1.0.0 needed for various operators and operand types",
-)
 @pytest.mark.parametrize(
     "predicate,expected_len",
     [
@@ -2393,6 +2389,10 @@ def test_parquet_writer_list_large_mixed(tmpdir):
 
 @pytest.mark.parametrize("store_schema", [True, False])
 def test_parquet_writer_list_chunked(tmpdir, store_schema):
+    if store_schema and version.parse(pa.__version__) < version.parse(
+        "15.0.0"
+    ):
+        pytest.skip("https://github.com/apache/arrow/pull/37792")
     table1 = cudf.DataFrame(
         {
             "a": list_gen(string_gen, 128, 80, 50),
@@ -2578,6 +2578,10 @@ def normalized_equals(value1, value2):
 @pytest.mark.parametrize("add_nulls", [True, False])
 @pytest.mark.parametrize("store_schema", [True, False])
 def test_parquet_writer_statistics(tmpdir, pdf, add_nulls, store_schema):
+    if store_schema and version.parse(pa.__version__) < version.parse(
+        "15.0.0"
+    ):
+        pytest.skip("https://github.com/apache/arrow/pull/37792")
     file_path = tmpdir.join("cudf.parquet")
     if "col_category" in pdf.columns:
         pdf = pdf.drop(columns=["col_category", "col_bool"])
@@ -2957,6 +2961,10 @@ def test_per_column_options_string_col(tmpdir, encoding):
     assert encoding in fmd.row_group(0).column(0).encodings
 
 
+@pytest.mark.skipif(
+    version.parse(pa.__version__) < version.parse("16.0.0"),
+    reason="https://github.com/apache/arrow/pull/39748",
+)
 @pytest.mark.parametrize(
     "num_rows",
     [200, 10000],
@@ -3557,6 +3565,10 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmpdir, data):
 
 
 @pytest.mark.parametrize("index", [None, True, False])
+@pytest.mark.skipif(
+    version.parse(pa.__version__) < version.parse("15.0.0"),
+    reason="https://github.com/apache/arrow/pull/37792",
+)
 def test_parquet_writer_roundtrip_with_arrow_schema(index):
     # Ensure that the concrete and nested types are faithfully being roundtripped
     # across Parquet with arrow schema
@@ -3707,6 +3719,10 @@ def test_parquet_writer_int96_timestamps_and_arrow_schema():
     ],
 )
 @pytest.mark.parametrize("index", [None, True, False])
+@pytest.mark.skipif(
+    version.parse(pa.__version__) < version.parse("15.0.0"),
+    reason="https://github.com/apache/arrow/pull/37792",
+)
 def test_parquet_writer_roundtrip_structs_with_arrow_schema(
     tmpdir, data, index
 ):
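
The test changes above use two distinct skip idioms, and the split is deliberate: a `skipif` decorator works when the condition is fully known at collection time, while an in-body `pytest.skip()` is needed when the condition depends on a parametrized argument such as `store_schema`. A hedged sketch of both patterns, directly modeled on the diff:

```python
# Sketch of the two skip idioms from the diff above; test bodies elided.
import pyarrow as pa
import pytest
from packaging import version


@pytest.mark.skipif(
    version.parse(pa.__version__) < version.parse("16.0.0"),
    reason="https://github.com/apache/arrow/pull/39748",
)
def test_needs_new_pyarrow():
    # Condition known at collection time: decorator form suffices.
    ...


@pytest.mark.parametrize("store_schema", [True, False])
def test_conditional(store_schema):
    # Condition depends on the parametrized argument, so the skip must
    # happen at runtime, inside the test body.
    if store_schema and version.parse(pa.__version__) < version.parse("15.0.0"):
        pytest.skip("https://github.com/apache/arrow/pull/37792")
    ...
```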
2 changes: 1 addition & 1 deletion python/cudf/pyproject.toml
@@ -30,7 +30,7 @@ dependencies = [
     "packaging",
     "pandas>=2.0,<2.2.3dev0",
     "ptxcompiler",
-    "pyarrow>=16.1.0,<16.2.0a0",
+    "pyarrow>=14.0.0,<18.0.0a0",
     "pylibcudf==24.10.*,>=0.0.0a0",
     "rich",
     "rmm==24.10.*,>=0.0.0a0",
3 changes: 0 additions & 3 deletions python/libcudf/pyproject.toml
@@ -37,9 +37,6 @@ classifiers = [
     "Programming Language :: C++",
     "Environment :: GPU :: NVIDIA CUDA",
 ]
-dependencies = [
-    "pyarrow>=16.1.0,<16.2.0a0",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]
 Homepage = "https://github.com/rapidsai/cudf"
1 change: 0 additions & 1 deletion python/pylibcudf/pylibcudf/interop.pyx
@@ -152,7 +152,6 @@ def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None):
 
 
 @from_arrow.register(pa.Array)
-@from_arrow.register(pa.ChunkedArray)
 def _from_arrow_column(pyarrow_object, *, DataType data_type=None):
     if data_type is not None:
         raise ValueError("data_type may not be passed for arrays")
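
For readers unfamiliar with the pattern above: `from_arrow` is a `functools.singledispatch` function, so dropping the `pa.ChunkedArray` registration means chunked arrays no longer route to `_from_arrow_column`. A minimal sketch of the dispatch mechanics follows; the conversion body is a placeholder, not pylibcudf's real implementation, and the combine-first workaround at the end is illustrative, not prescribed by the PR.

```python
# Minimal sketch of singledispatch-based from_arrow registration.
from functools import singledispatch

import pyarrow as pa


@singledispatch
def from_arrow(pyarrow_object, *, data_type=None):
    raise TypeError(f"Unsupported type {type(pyarrow_object)} for from_arrow")


@from_arrow.register(pa.Array)
def _from_arrow_column(pyarrow_object, *, data_type=None):
    if data_type is not None:
        raise ValueError("data_type may not be passed for arrays")
    return pyarrow_object  # placeholder for the device-side conversion


# With the pa.ChunkedArray registration removed, chunked input would need
# an explicit flattening step before dispatch (hypothetical usage):
chunked = pa.chunked_array([[1, 2], [3]])
from_arrow(chunked.combine_chunks())
```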
2 changes: 1 addition & 1 deletion python/pylibcudf/pyproject.toml
@@ -22,7 +22,7 @@ dependencies = [
     "libcudf==24.10.*,>=0.0.0a0",
     "nvtx>=0.2.1",
     "packaging",
-    "pyarrow>=16.1.0,<16.2.0a0",
+    "pyarrow>=14.0.0,<18.0.0a0",
     "rmm==24.10.*,>=0.0.0a0",
     "typing_extensions>=4.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
