Merge branch 'branch-24.12' into fea-strings-find-re

rapidsai · Sep 25, 2024 · f703265 · f703265
2 parents 99651c1 + ba7d6e7
commit f703265
Show file tree

Hide file tree

Showing 13 changed files with 60 additions and 56 deletions.
diff --git a/build.sh b/build.sh
@@ -17,13 +17,14 @@ ARGS=$*
 # script, and that this script resides in the repo dir!
 REPODIR=$(cd $(dirname $0); pwd)
 
-VALIDARGS="clean libcudf pylibcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp  --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings"
-HELP="$0 [clean] [libcudf] [pylibcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"<args>\\\"]
+VALIDARGS="clean libcudf pylibcudf cudf cudf_polars cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp  --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings"
+HELP="$0 [clean] [libcudf] [pylibcudf] [cudf] [cudf_polars] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"<args>\\\"]
    clean                         - remove all existing build artifacts and configuration (start
                                    over)
    libcudf                       - build the cudf C++ code only
    pylibcudf                     - build the pylibcudf Python package
    cudf                          - build the cudf Python package
+   cudf_polars                   - build the cudf_polars Python package
    cudfjar                       - build cudf JAR with static libcudf using devtoolset toolchain
    dask_cudf                     - build the dask_cudf Python package
    benchmarks                    - build benchmarks
@@ -353,6 +354,12 @@ if buildAll || hasArg cudf; then
         python ${PYTHON_ARGS_FOR_INSTALL} .
 fi
 
+# Build and install the cudf_polars Python package
+if buildAll || hasArg cudf_polars; then
+
+    cd ${REPODIR}/python/cudf_polars
+    python ${PYTHON_ARGS_FOR_INSTALL} .
+fi
 
 # Build and install the dask_cudf Python package
 if buildAll || hasArg dask_cudf; then

diff --git a/ci/run_cudf_polars_polars_tests.sh b/ci/run_cudf_polars_polars_tests.sh
@@ -21,7 +21,7 @@ python -m pytest \
        -m "" \
        -p cudf_polars.testing.plugin \
        -v \
-       --tb=short \
+       --tb=native \
        ${DESELECTED_TESTS} \
        "$@" \
        py-polars/tests
diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh
@@ -33,8 +33,7 @@ python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl
 rapids-logger "Install cudf_polars"
 python -m pip install $(echo ./dist/cudf_polars*.whl)
 
-# TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
-TAG="py-1.7.0"
+TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
 rapids-logger "Clone polars to ${TAG}"
 git clone https://github.com/pola-rs/polars.git --branch ${TAG} --depth 1
 

diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh
@@ -39,17 +39,14 @@ if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
       | tee ./constraints.txt
 fi
 
-# echo to expand wildcard before adding `[extra]` requires for pip
+# echo to expand wildcard before adding `[test]` requires for pip
 python -m pip install \
     -v \
     --constraint ./constraints.txt \
     "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
     "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
     "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 
-rapids-logger "Pin to 1.7.0 Temporarily"
-python -m pip install polars==1.7.0
-
 rapids-logger "Run cudf_polars tests"
 
 function set_exitcode()

diff --git a/dependencies.yaml b/dependencies.yaml
@@ -663,7 +663,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=1.6
+          - polars>=1.8,<1.9
   run_dask_cudf:
     common:
       - output_types: [conda, requirements, pyproject]

diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py
@@ -10,13 +10,15 @@
 
 from __future__ import annotations
 
-# Check we have a supported polars version
-import cudf_polars.utils.versions as v
 from cudf_polars._version import __git_commit__, __version__
 from cudf_polars.callback import execute_with_cudf
 from cudf_polars.dsl.translate import translate_ir
 
-del v
+# Check we have a supported polars version
+from cudf_polars.utils.versions import _ensure_polars_version
+
+_ensure_polars_version()
+del _ensure_polars_version
 
 __all__: list[str] = [
     "execute_with_cudf",

diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -93,14 +93,6 @@ def _(
         cloud_options = None
     else:
         reader_options, cloud_options = map(json.loads, options)
-    if (
-        typ == "csv"
-        and visitor.version()[0] == 1
-        and reader_options["schema"] is not None
-    ):
-        reader_options["schema"] = {
-            "fields": reader_options["schema"]["inner"]
-        }  # pragma: no cover; CI tests 1.7
     file_options = node.file_options
     with_columns = file_options.with_columns
     n_rows = file_options.n_rows

diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -164,9 +164,11 @@ def assert_collect_raises(
         cudf-polars.
         Useful for controlling optimization settings.
     polars_except
-        Exception or exceptions polars CPU is expected to raise.
+        Exception or exceptions polars CPU is expected to raise. If
+        an empty tuple ``()``, CPU is not expected to raise an exception.
     cudf_except
-        Exception or exceptions polars GPU is expected to raise.
+        Exception or exceptions polars GPU is expected to raise. If
+        an empty tuple ``()``, GPU is not expected to raise an exception.
     collect_kwargs
         Common keyword arguments to pass to collect for both polars CPU and
         cudf-polars.
@@ -203,7 +205,8 @@ def assert_collect_raises(
             f"CPU execution RAISED {type(e)}, EXPECTED {polars_except}"
         ) from e
     else:
-        raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}")
+        if polars_except != ():
+            raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}")
 
     engine = GPUEngine(raise_on_fail=True)
     try:
@@ -212,7 +215,8 @@ def assert_collect_raises(
         pass
     except Exception as e:
         raise AssertionError(
-            f"GPU execution RAISED {type(e)}, EXPECTED {polars_except}"
+            f"GPU execution RAISED {type(e)}, EXPECTED {cudf_except}"
         ) from e
     else:
-        raise AssertionError(f"GPU execution DID NOT RAISE {polars_except}")
+        if cudf_except != ():
+            raise AssertionError(f"GPU execution DID NOT RAISE {cudf_except}")
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py
@@ -49,11 +49,15 @@ def pytest_configure(config: pytest.Config):
     "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU",
     "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed",
     "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read",
+    "tests/unit/io/test_lazy_parquet.py::test_dsl2ir_cached_metadata[False]": "cudf-polars doesn't use metadata read by rust preprocessing",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception",
     "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU",
     "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match",
     "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match",

diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py
@@ -12,11 +12,11 @@
 
 POLARS_VERSION = parse(__version__)
 
-POLARS_VERSION_GE_16 = POLARS_VERSION >= parse("1.6")
-POLARS_VERSION_GT_16 = POLARS_VERSION > parse("1.6")
-POLARS_VERSION_LT_16 = POLARS_VERSION < parse("1.6")
-
-if POLARS_VERSION_LT_16:
-    raise ImportError(
-        "cudf_polars requires py-polars v1.6 or greater."
-    )  # pragma: no cover
+POLARS_VERSION_LT_18 = POLARS_VERSION < parse("1.8")
+
+
+def _ensure_polars_version():
+    if POLARS_VERSION_LT_18:
+        raise ImportError(
+            "cudf_polars requires py-polars v1.8 or greater."
+        )  # pragma: no cover
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "polars>=1.6",
+    "polars>=1.8,<1.9",
     "pylibcudf==24.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [

diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
@@ -168,7 +168,11 @@ def test_groupby_nan_minmax_raises(op):
     "expr",
     [
         pl.lit(1).alias("value"),
-        pl.lit([[4, 5, 6]]).alias("value"),
+        pytest.param(
+            pl.lit([[4, 5, 6]]).alias("value"),
+            marks=pytest.mark.xfail(reason="Need to expose OtherScalar in rust IR"),
+        ),
+        pl.Series("value", [[4, 5, 6]], dtype=pl.List(pl.Int32)),
         pl.col("float") * (1 - pl.col("int")),
         [pl.lit(2).alias("value"), pl.col("float") * 2],
     ],

diff --git a/python/cudf_polars/tests/testing/test_asserts.py b/python/cudf_polars/tests/testing/test_asserts.py
@@ -7,8 +7,6 @@
 
 import polars as pl
 
-from cudf_polars.containers import DataFrame
-from cudf_polars.dsl.ir import Select
 from cudf_polars.testing.asserts import (
     assert_collect_raises,
     assert_gpu_result_equal,
@@ -38,14 +36,24 @@ class E(Exception):
         assert_ir_translation_raises(unsupported, E)
 
 
-def test_collect_assert_raises(monkeypatch):
+def test_collect_assert_raises():
     df = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
 
-    with pytest.raises(AssertionError):
-        # This should raise, because polars CPU can run this query
+    with pytest.raises(AssertionError, match="CPU execution DID NOT RAISE"):
+        # This should raise, because polars CPU can run this query,
+        # but we expect an error.
         assert_collect_raises(
             df,
             polars_except=pl.exceptions.InvalidOperationError,
+            cudf_except=(),
+        )
+
+    with pytest.raises(AssertionError, match="GPU execution DID NOT RAISE"):
+        # This should raise, because polars GPU can run this query,
+        # but we expect an error.
+        assert_collect_raises(
+            df,
+            polars_except=(),
             cudf_except=pl.exceptions.InvalidOperationError,
         )
 
@@ -60,31 +68,18 @@ def test_collect_assert_raises(monkeypatch):
         cudf_except=pl.exceptions.InvalidOperationError,
     )
 
-    with pytest.raises(AssertionError):
+    with pytest.raises(AssertionError, match="GPU execution RAISED"):
         # This should raise because the expected GPU error is wrong
         assert_collect_raises(
             q,
             polars_except=pl.exceptions.InvalidOperationError,
             cudf_except=NotImplementedError,
         )
 
-    with pytest.raises(AssertionError):
+    with pytest.raises(AssertionError, match="CPU execution RAISED"):
         # This should raise because the expected CPU error is wrong
         assert_collect_raises(
             q,
             polars_except=NotImplementedError,
             cudf_except=pl.exceptions.InvalidOperationError,
         )
-
-    with monkeypatch.context() as m:
-        m.setattr(Select, "evaluate", lambda self, cache: DataFrame([]))
-        # This query should fail, but we monkeypatch a bad
-        # implementation of Select which "succeeds" to check that our
-        # assertion notices this case.
-        q = df.select(pl.col("a") + pl.Series([1, 2]))
-        with pytest.raises(AssertionError):
-            assert_collect_raises(
-                q,
-                polars_except=pl.exceptions.ComputeError,
-                cudf_except=pl.exceptions.ComputeError,
-            )