Relax Arrow pin (rapidsai#16681)
With this change, cudf users can install any version of pyarrow from 14.0 up to (but not including) 18.0. pyarrow 14 is the minimum version supporting the Arrow C Data Interface, which is a requirement for us (it may be possible to relax that floor in principle, but doing so would require changes to the cudf/pylibcudf code). A few tests are skipped under older pyarrow versions due to bugs or missing features.
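
As a hedged illustration (not code from this PR): the PyCapsule export methods of the Arrow C Data Interface first shipped in pyarrow 14.0, which is why 14 is the floor here. Exactly how cudf/pylibcudf consumes these capsules internally is an assumption; the sketch only shows that the interface exists once the relaxed pin is satisfied.

```python
# Hedged sketch: check that the installed pyarrow satisfies the relaxed pin
# and exposes the C Data Interface (PyCapsule) methods added in pyarrow 14.
# The internal cudf/pylibcudf call path is assumed, not shown.
import pyarrow as pa
from packaging import version

assert version.parse(pa.__version__) >= version.parse("14.0.0")

arr = pa.array([1, 2, 3])
# __arrow_c_array__ returns (schema_capsule, array_capsule) PyCapsules,
# the zero-copy handoff format of the Arrow C Data Interface.
schema_capsule, array_capsule = arr.__arrow_c_array__()
```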

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - James Lamb (https://github.com/jameslamb)

URL: rapidsai#16681
vyasr authored Aug 28, 2024
1 parent dba6c1f commit 925530a
Showing 10 changed files with 28 additions and 22 deletions.
1 change: 1 addition & 0 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -67,6 +67,7 @@ dependencies:
 - pandoc
 - pre-commit
 - ptxcompiler
+- pyarrow>=14.0.0,<18.0.0a0
 - pydata-sphinx-theme!=0.14.2
 - pytest-benchmark
 - pytest-cases>=3.8.2
1 change: 1 addition & 0 deletions conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -64,6 +64,7 @@ dependencies:
 - pandas>=2.0,<2.2.3dev0
 - pandoc
 - pre-commit
+- pyarrow>=14.0.0,<18.0.0a0
 - pydata-sphinx-theme!=0.14.2
 - pynvjitlink>=0.0.0a0
 - pytest-benchmark
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
@@ -82,7 +82,7 @@ requirements:
 - cupy >=12.0.0
 - numba >=0.57
 - numpy >=1.23,<3.0a0
-- pyarrow ==16.1.0.*
+- pyarrow>=14.0.0,<18.0.0a0
 - libcudf ={{ version }}
 - pylibcudf ={{ version }}
 - {{ pin_compatible('rmm', max_pin='x.x') }}
2 changes: 1 addition & 1 deletion conda/recipes/pylibcudf/meta.yaml
@@ -79,7 +79,7 @@ requirements:
 - typing_extensions >=4.0.0
 - pandas >=2.0,<2.2.3dev0
 - numpy >=1.23,<3.0a0
-- pyarrow ==16.1.0.*
+- pyarrow>=14.0.0,<18.0.0a0
 - {{ pin_compatible('rmm', max_pin='x.x') }}
 - fsspec >=0.6.0
 {% if cuda_major == "11" %}
12 changes: 2 additions & 10 deletions dependencies.yaml
@@ -19,6 +19,7 @@ files:
       - docs
       - notebooks
       - py_version
+      - pyarrow_run
       - rapids_build_skbuild
       - rapids_build_setuptools
       - run_common
@@ -46,7 +47,6 @@
     includes:
       - cuda_version
       - py_version
-      - pyarrow_run
      - test_python_common
       - test_python_cudf
       - test_python_dask_cudf
@@ -136,13 +136,6 @@ files:
       - build_base
       - build_cpp
       - depends_on_librmm
-  py_run_libcudf:
-    output: pyproject
-    pyproject_dir: python/libcudf
-    extras:
-      table: project
-    includes:
-      - pyarrow_run
   py_build_pylibcudf:
     output: pyproject
     pyproject_dir: python/pylibcudf
@@ -390,8 +383,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          # Allow runtime version to float up to patch version
-          - pyarrow>=16.1.0,<16.2.0a0
+          - pyarrow>=14.0.0,<18.0.0a0
   cuda_version:
     specific:
       - output_types: conda
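
The new pin everywhere is `pyarrow>=14.0.0,<18.0.0a0`. A hedged sketch (using the `packaging` library, not code from this PR) of why the upper bound is written `<18.0.0a0` rather than `<18.0.0`: the `a0` form also shuts out 18.0.0 pre-releases, which a plain `<18.0.0` bound would admit when pre-releases are considered (e.g. under `pip install --pre`).

```python
# Sketch only: compare the PR's pin against a plain upper bound.
from packaging.specifiers import SpecifierSet

pin = SpecifierSet(">=14.0.0,<18.0.0a0")
loose = SpecifierSet(">=14.0.0,<18.0.0")

for v in ["14.0.0", "17.0.0", "18.0.0a0", "18.0.0"]:
    # prereleases=True mimics resolving with pre-releases enabled
    print(v, pin.contains(v, prereleases=True), loose.contains(v, prereleases=True))
# 14.0.0   True  True
# 17.0.0   True  True
# 18.0.0a0 False True   (the a0 bound excludes pre-releases of 18)
# 18.0.0   False False
```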
24 changes: 20 additions & 4 deletions python/cudf/cudf/tests/test_parquet.py
@@ -515,10 +515,6 @@ def test_parquet_read_filtered_multiple_files(tmpdir):
     )
 
 
-@pytest.mark.skipif(
-    version.parse(pa.__version__) < version.parse("1.0.1"),
-    reason="pyarrow 1.0.0 needed for various operators and operand types",
-)
 @pytest.mark.parametrize(
     "predicate,expected_len",
     [
@@ -2393,6 +2389,10 @@ def test_parquet_writer_list_large_mixed(tmpdir):
 
 @pytest.mark.parametrize("store_schema", [True, False])
 def test_parquet_writer_list_chunked(tmpdir, store_schema):
+    if store_schema and version.parse(pa.__version__) < version.parse(
+        "15.0.0"
+    ):
+        pytest.skip("https://github.com/apache/arrow/pull/37792")
     table1 = cudf.DataFrame(
         {
             "a": list_gen(string_gen, 128, 80, 50),
@@ -2578,6 +2578,10 @@ def normalized_equals(value1, value2):
 @pytest.mark.parametrize("add_nulls", [True, False])
 @pytest.mark.parametrize("store_schema", [True, False])
 def test_parquet_writer_statistics(tmpdir, pdf, add_nulls, store_schema):
+    if store_schema and version.parse(pa.__version__) < version.parse(
+        "15.0.0"
+    ):
+        pytest.skip("https://github.com/apache/arrow/pull/37792")
     file_path = tmpdir.join("cudf.parquet")
     if "col_category" in pdf.columns:
         pdf = pdf.drop(columns=["col_category", "col_bool"])
@@ -2957,6 +2961,10 @@ def test_per_column_options_string_col(tmpdir, encoding):
     assert encoding in fmd.row_group(0).column(0).encodings
 
 
+@pytest.mark.skipif(
+    version.parse(pa.__version__) < version.parse("16.0.0"),
+    reason="https://github.com/apache/arrow/pull/39748",
+)
 @pytest.mark.parametrize(
     "num_rows",
     [200, 10000],
@@ -3557,6 +3565,10 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmpdir, data):
 
 
 @pytest.mark.parametrize("index", [None, True, False])
+@pytest.mark.skipif(
+    version.parse(pa.__version__) < version.parse("15.0.0"),
+    reason="https://github.com/apache/arrow/pull/37792",
+)
 def test_parquet_writer_roundtrip_with_arrow_schema(index):
     # Ensure that the concrete and nested types are faithfully being roundtripped
     # across Parquet with arrow schema
@@ -3707,6 +3719,10 @@ def test_parquet_writer_int96_timestamps_and_arrow_schema():
     ],
 )
 @pytest.mark.parametrize("index", [None, True, False])
+@pytest.mark.skipif(
+    version.parse(pa.__version__) < version.parse("15.0.0"),
+    reason="https://github.com/apache/arrow/pull/37792",
+)
 def test_parquet_writer_roundtrip_structs_with_arrow_schema(
     tmpdir, data, index
 ):
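
The test changes above use two distinct skip idioms, and the split is deliberate: a `skipif` decorator works when the condition is fully known at collection time, while an in-body `pytest.skip()` is needed when the condition depends on a parametrized argument such as `store_schema`. A hedged sketch of both patterns, directly modeled on the diff:

```python
# Sketch of the two skip idioms from the diff above; test bodies elided.
import pyarrow as pa
import pytest
from packaging import version


@pytest.mark.skipif(
    version.parse(pa.__version__) < version.parse("16.0.0"),
    reason="https://github.com/apache/arrow/pull/39748",
)
def test_needs_new_pyarrow():
    # Condition known at collection time: decorator form suffices.
    ...


@pytest.mark.parametrize("store_schema", [True, False])
def test_conditional(store_schema):
    # Condition depends on the parametrized argument, so the skip must
    # happen at runtime, inside the test body.
    if store_schema and version.parse(pa.__version__) < version.parse("15.0.0"):
        pytest.skip("https://github.com/apache/arrow/pull/37792")
    ...
```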
2 changes: 1 addition & 1 deletion python/cudf/pyproject.toml
@@ -30,7 +30,7 @@ dependencies = [
     "packaging",
     "pandas>=2.0,<2.2.3dev0",
     "ptxcompiler",
-    "pyarrow>=16.1.0,<16.2.0a0",
+    "pyarrow>=14.0.0,<18.0.0a0",
     "pylibcudf==24.10.*,>=0.0.0a0",
     "rich",
     "rmm==24.10.*,>=0.0.0a0",
3 changes: 0 additions & 3 deletions python/libcudf/pyproject.toml
@@ -37,9 +37,6 @@ classifiers = [
     "Programming Language :: C++",
     "Environment :: GPU :: NVIDIA CUDA",
 ]
-dependencies = [
-    "pyarrow>=16.1.0,<16.2.0a0",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]
 Homepage = "https://github.com/rapidsai/cudf"
1 change: 0 additions & 1 deletion python/pylibcudf/pylibcudf/interop.pyx
@@ -152,7 +152,6 @@ def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None):
 
 
 @from_arrow.register(pa.Array)
-@from_arrow.register(pa.ChunkedArray)
 def _from_arrow_column(pyarrow_object, *, DataType data_type=None):
     if data_type is not None:
         raise ValueError("data_type may not be passed for arrays")
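
For readers unfamiliar with the pattern above: `from_arrow` is a `functools.singledispatch` function, so dropping the `pa.ChunkedArray` registration means chunked arrays no longer route to `_from_arrow_column`. A minimal sketch of the dispatch mechanics follows; the conversion body is a placeholder, not pylibcudf's real implementation, and the combine-first workaround at the end is illustrative, not prescribed by the PR.

```python
# Minimal sketch of singledispatch-based from_arrow registration.
from functools import singledispatch

import pyarrow as pa


@singledispatch
def from_arrow(pyarrow_object, *, data_type=None):
    raise TypeError(f"Unsupported type {type(pyarrow_object)} for from_arrow")


@from_arrow.register(pa.Array)
def _from_arrow_column(pyarrow_object, *, data_type=None):
    if data_type is not None:
        raise ValueError("data_type may not be passed for arrays")
    return pyarrow_object  # placeholder for the device-side conversion


# With the pa.ChunkedArray registration removed, chunked input would need
# an explicit flattening step before dispatch (hypothetical usage):
chunked = pa.chunked_array([[1, 2], [3]])
from_arrow(chunked.combine_chunks())
```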
2 changes: 1 addition & 1 deletion python/pylibcudf/pyproject.toml
@@ -22,7 +22,7 @@ dependencies = [
     "libcudf==24.10.*,>=0.0.0a0",
     "nvtx>=0.2.1",
     "packaging",
-    "pyarrow>=16.1.0,<16.2.0a0",
+    "pyarrow>=14.0.0,<18.0.0a0",
     "rmm==24.10.*,>=0.0.0a0",
     "typing_extensions>=4.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
