From 416042314e16c1bfc7499309ccc4a352b1f47c0a Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 25 Sep 2024 15:42:51 +0100 Subject: [PATCH 1/2] Pin polars for 24.10 and update polars test suite xfail list (#16886) For releases, since the polars release cadence is quite a lot faster than rapids, we propose to hard-pin to a known good version. In this case, 1.8.x. At the same time, remove pin in CI scripts and update list of xfailing tests in the polars test suite. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - James Lamb (https://github.com/jameslamb) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16886 --- ci/run_cudf_polars_polars_tests.sh | 2 +- ci/test_cudf_polars_polars_tests.sh | 3 +- ci/test_wheel_cudf_polars.sh | 5 +-- dependencies.yaml | 2 +- python/cudf_polars/cudf_polars/__init__.py | 8 +++-- .../cudf_polars/cudf_polars/dsl/translate.py | 8 ----- .../cudf_polars/testing/asserts.py | 14 +++++--- .../cudf_polars/cudf_polars/testing/plugin.py | 4 +++ .../cudf_polars/cudf_polars/utils/versions.py | 16 ++++----- python/cudf_polars/pyproject.toml | 2 +- python/cudf_polars/tests/test_groupby.py | 6 +++- .../cudf_polars/tests/testing/test_asserts.py | 35 ++++++++----------- 12 files changed, 51 insertions(+), 54 deletions(-) diff --git a/ci/run_cudf_polars_polars_tests.sh b/ci/run_cudf_polars_polars_tests.sh index 52a827af94c..95f78f17f2f 100755 --- a/ci/run_cudf_polars_polars_tests.sh +++ b/ci/run_cudf_polars_polars_tests.sh @@ -21,7 +21,7 @@ python -m pytest \ -m "" \ -p cudf_polars.testing.plugin \ -v \ - --tb=short \ + --tb=native \ ${DESELECTED_TESTS} \ "$@" \ py-polars/tests diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh index 6c728a9537f..bfc8fd37565 100755 --- a/ci/test_cudf_polars_polars_tests.sh +++ b/ci/test_cudf_polars_polars_tests.sh @@ -33,8 +33,7 @@ python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl rapids-logger "Install cudf_polars" python -m pip install $(echo ./dist/cudf_polars*.whl) -# TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")') -TAG="py-1.7.0" +TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")') rapids-logger "Clone polars to ${TAG}" git clone https://github.com/pola-rs/polars.git --branch ${TAG} --depth 1 diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index b4509bba02e..3116bd820e9 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -39,7 +39,7 @@ if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then | tee ./constraints.txt fi -# echo to expand wildcard before adding `[extra]` requires for pip +# echo to expand wildcard before adding `[test]` requires for pip python -m pip install \ -v \ --constraint ./constraints.txt \ @@ -47,9 +47,6 @@ python -m pip install \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" -rapids-logger "Pin to 1.7.0 Temporarily" -python -m pip install polars==1.7.0 - rapids-logger "Run cudf_polars tests" function set_exitcode() diff --git a/dependencies.yaml b/dependencies.yaml index 7a9c9b8486d..339adbc5ff9 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -663,7 +663,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.6 + - polars>=1.8,<1.9 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index c1317e8f467..66c15f694ee 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -10,13 +10,15 @@ from __future__ import annotations -# Check we have a supported polars version -import cudf_polars.utils.versions as v from cudf_polars._version import __git_commit__, __version__ from cudf_polars.callback import execute_with_cudf from cudf_polars.dsl.translate import translate_ir -del v +# Check we have a supported polars version +from cudf_polars.utils.versions import _ensure_polars_version + +_ensure_polars_version() +del _ensure_polars_version __all__: list[str] = [ "execute_with_cudf", diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 45881afe0c8..a0291037f01 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -93,14 +93,6 @@ def _( cloud_options = None else: reader_options, cloud_options = map(json.loads, options) - if ( - typ == "csv" - and visitor.version()[0] == 1 - and reader_options["schema"] is not None - ): - reader_options["schema"] = { - "fields": reader_options["schema"]["inner"] - } # pragma: no cover; CI tests 1.7 file_options = node.file_options with_columns = file_options.with_columns n_rows = file_options.n_rows diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index a79d45899cd..7b6f3848fc4 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -164,9 +164,11 @@ def assert_collect_raises( cudf-polars. Useful for controlling optimization settings. polars_except - Exception or exceptions polars CPU is expected to raise. + Exception or exceptions polars CPU is expected to raise. If + None, CPU is not expected to raise an exception. cudf_except - Exception or exceptions polars GPU is expected to raise. + Exception or exceptions polars GPU is expected to raise. If + None, GPU is not expected to raise an exception. collect_kwargs Common keyword arguments to pass to collect for both polars CPU and cudf-polars. @@ -203,7 +205,8 @@ def assert_collect_raises( f"CPU execution RAISED {type(e)}, EXPECTED {polars_except}" ) from e else: - raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}") + if polars_except != (): + raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}") engine = GPUEngine(raise_on_fail=True) try: @@ -212,7 +215,8 @@ def assert_collect_raises( pass except Exception as e: raise AssertionError( - f"GPU execution RAISED {type(e)}, EXPECTED {polars_except}" + f"GPU execution RAISED {type(e)}, EXPECTED {cudf_except}" ) from e else: - raise AssertionError(f"GPU execution DID NOT RAISE {polars_except}") + if cudf_except != (): + raise AssertionError(f"GPU execution DID NOT RAISE {cudf_except}") diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index c40d59e6d33..05b76d76808 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -49,11 +49,15 @@ def pytest_configure(config: pytest.Config): "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU", "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed", "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read", + "tests/unit/io/test_lazy_parquet.py::test_dsl2ir_cached_metadata[False]": "cudf-polars doesn't use metadata read by rust preprocessing", "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?", + "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception", + "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception", "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU", "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match", diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index 2e6efde968c..4a7ad6b3cf2 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -12,11 +12,11 @@ POLARS_VERSION = parse(__version__) -POLARS_VERSION_GE_16 = POLARS_VERSION >= parse("1.6") -POLARS_VERSION_GT_16 = POLARS_VERSION > parse("1.6") -POLARS_VERSION_LT_16 = POLARS_VERSION < parse("1.6") - -if POLARS_VERSION_LT_16: - raise ImportError( - "cudf_polars requires py-polars v1.6 or greater." - ) # pragma: no cover +POLARS_VERSION_LT_18 = POLARS_VERSION < parse("1.8") + + +def _ensure_polars_version(): + if POLARS_VERSION_LT_18: + raise ImportError( + "cudf_polars requires py-polars v1.8 or greater." + ) # pragma: no cover diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 857a8c14b2f..268364f72a7 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.6", + "polars>=1.8,<1.9", "pylibcudf==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index 6f996e0e0ec..74bf8b9e4e2 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -168,7 +168,11 @@ def test_groupby_nan_minmax_raises(op): "expr", [ pl.lit(1).alias("value"), - pl.lit([[4, 5, 6]]).alias("value"), + pytest.param( + pl.lit([[4, 5, 6]]).alias("value"), + marks=pytest.mark.xfail(reason="Need to expose OtherScalar in rust IR"), + ), + pl.Series("value", [[4, 5, 6]], dtype=pl.List(pl.Int32)), pl.col("float") * (1 - pl.col("int")), [pl.lit(2).alias("value"), pl.col("float") * 2], ], diff --git a/python/cudf_polars/tests/testing/test_asserts.py b/python/cudf_polars/tests/testing/test_asserts.py index 8e7f1a09d9b..ace1c6b8648 100644 --- a/python/cudf_polars/tests/testing/test_asserts.py +++ b/python/cudf_polars/tests/testing/test_asserts.py @@ -7,8 +7,6 @@ import polars as pl -from cudf_polars.containers import DataFrame -from cudf_polars.dsl.ir import Select from cudf_polars.testing.asserts import ( assert_collect_raises, assert_gpu_result_equal, @@ -38,14 +36,24 @@ class E(Exception): assert_ir_translation_raises(unsupported, E) -def test_collect_assert_raises(monkeypatch): +def test_collect_assert_raises(): df = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) - with pytest.raises(AssertionError): - # This should raise, because polars CPU can run this query + with pytest.raises(AssertionError, match="CPU execution DID NOT RAISE"): + # This should raise, because polars CPU can run this query, + # but we expect an error. assert_collect_raises( df, polars_except=pl.exceptions.InvalidOperationError, + cudf_except=(), + ) + + with pytest.raises(AssertionError, match="GPU execution DID NOT RAISE"): + # This should raise, because polars GPU can run this query, + # but we expect an error. + assert_collect_raises( + df, + polars_except=(), cudf_except=pl.exceptions.InvalidOperationError, ) @@ -60,7 +68,7 @@ def test_collect_assert_raises(monkeypatch): cudf_except=pl.exceptions.InvalidOperationError, ) - with pytest.raises(AssertionError): + with pytest.raises(AssertionError, match="GPU execution RAISED"): # This should raise because the expected GPU error is wrong assert_collect_raises( q, @@ -68,23 +76,10 @@ def test_collect_assert_raises(monkeypatch): cudf_except=NotImplementedError, ) - with pytest.raises(AssertionError): + with pytest.raises(AssertionError, match="CPU execution RAISED"): # This should raise because the expected CPU error is wrong assert_collect_raises( q, polars_except=NotImplementedError, cudf_except=pl.exceptions.InvalidOperationError, ) - - with monkeypatch.context() as m: - m.setattr(Select, "evaluate", lambda self, cache: DataFrame([])) - # This query should fail, but we monkeypatch a bad - # implementation of Select which "succeeds" to check that our - # assertion notices this case. - q = df.select(pl.col("a") + pl.Series([1, 2])) - with pytest.raises(AssertionError): - assert_collect_raises( - q, - polars_except=pl.exceptions.ComputeError, - cudf_except=pl.exceptions.ComputeError, - ) From ef270827cc3e4f336258d1e1ad4b7f633656409b Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 25 Sep 2024 12:52:34 -0500 Subject: [PATCH 2/2] Build `cudf-polars` with `build.sh` (#16898) This PR adds `cudf-polars` to the top level build script. Authors: - https://github.com/brandon-b-miller - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cudf/pull/16898 --- build.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/build.sh b/build.sh index 69d6481af42..56359eae235 100755 --- a/build.sh +++ b/build.sh @@ -17,13 +17,14 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libcudf pylibcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings" -HELP="$0 [clean] [libcudf] [pylibcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"] +VALIDARGS="clean libcudf pylibcudf cudf cudf_polars cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings" +HELP="$0 [clean] [libcudf] [pylibcudf] [cudf] [cudf_polars] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"] clean - remove all existing build artifacts and configuration (start over) libcudf - build the cudf C++ code only pylibcudf - build the pylibcudf Python package cudf - build the cudf Python package + cudf_polars - build the cudf_polars Python package cudfjar - build cudf JAR with static libcudf using devtoolset toolchain dask_cudf - build the dask_cudf Python package benchmarks - build benchmarks @@ -353,6 +354,12 @@ if buildAll || hasArg cudf; then python ${PYTHON_ARGS_FOR_INSTALL} . fi +# Build and install the cudf_polars Python package +if buildAll || hasArg cudf_polars; then + + cd ${REPODIR}/python/cudf_polars + python ${PYTHON_ARGS_FOR_INSTALL} . +fi # Build and install the dask_cudf Python package if buildAll || hasArg dask_cudf; then