From f536e3017205be8b09f3dc2cfd448dc9c5a94d5d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 19 Jun 2024 16:50:48 +0100 Subject: [PATCH] Add basic tests of dataframe scan (#16003) Also assert that unsupported file scan operations raise. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/16003 --- python/cudf_polars/cudf_polars/dsl/ir.py | 4 +- .../cudf_polars/testing/asserts.py | 34 ++++++++++++++- python/cudf_polars/docs/overview.md | 18 ++++++++ .../cudf_polars/tests/test_dataframescan.py | 43 +++++++++++++++++++ python/cudf_polars/tests/test_scan.py | 13 +++++- python/cudf_polars/tests/testing/__init__.py | 6 +++ .../cudf_polars/tests/testing/test_asserts.py | 35 +++++++++++++++ 7 files changed, 150 insertions(+), 3 deletions(-) create mode 100644 python/cudf_polars/tests/test_dataframescan.py create mode 100644 python/cudf_polars/tests/testing/__init__.py create mode 100644 python/cudf_polars/tests/testing/test_asserts.py diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 83957e4286d..3ccefac6b0a 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -196,7 +196,9 @@ def __post_init__(self) -> None: if self.file_options.n_rows is not None: raise NotImplementedError("row limit in scan") if self.typ not in ("csv", "parquet"): - raise NotImplementedError(f"Unhandled scan type: {self.typ}") + raise NotImplementedError( + f"Unhandled scan type: {self.typ}" + ) # pragma: no cover; polars raises on the rust side for now def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index 3edaa427432..a9a4ae5f0a6 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -11,6 +11,7 @@ from polars.testing.asserts import assert_frame_equal from cudf_polars.callback import execute_with_cudf +from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: from collections.abc import Mapping @@ -19,7 +20,7 @@ from cudf_polars.typing import OptimizationArgs -__all__: list[str] = ["assert_gpu_result_equal"] +__all__: list[str] = ["assert_gpu_result_equal", "assert_ir_translation_raises"] def assert_gpu_result_equal( @@ -84,3 +85,34 @@ def assert_gpu_result_equal( atol=atol, categorical_as_str=categorical_as_str, ) + + +def assert_ir_translation_raises(q: pl.LazyFrame, *exceptions: type[Exception]) -> None: + """ + Assert that translation of a query raises an exception. + + Parameters + ---------- + q + Query to translate. + exceptions + Exceptions that one expects might be raised. + + Returns + ------- + None + If translation successfully raised the specified exceptions. + + Raises + ------ + AssertionError + If the specified exceptions were not raised. + """ + try: + _ = translate_ir(q._ldf.visit()) + except exceptions: + return + except Exception as e: + raise AssertionError(f"Translation DID NOT RAISE {exceptions}") from e + else: + raise AssertionError(f"Translation DID NOT RAISE {exceptions}") diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index b50d01c26db..874bb849747 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -224,6 +224,24 @@ def test_whatever(): assert_gpu_result_equal(query) ``` +## Test coverage and asserting failure modes + +Where translation of a query should fail due to the feature being +unsupported we should test this. To assert that _translation_ raises +an exception (usually `NotImplementedError`), use the utility function +`assert_ir_translation_raises`: + +```python +from cudf_polars.testing.asserts import assert_ir_translation_raises + + +def test_whatever(): + unsupported_query = ... + assert_ir_translation_raises(unsupported_query, NotImplementedError) +``` + +This test will fail if translation does not raise. + # Debugging If the callback execution fails during the polars `collect` call, we diff --git a/python/cudf_polars/tests/test_dataframescan.py b/python/cudf_polars/tests/test_dataframescan.py new file mode 100644 index 00000000000..1ffe06ac562 --- /dev/null +++ b/python/cudf_polars/tests/test_dataframescan.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize( + "subset", + [ + None, + ["a", "c"], + ["b", "c", "d"], + ["b", "d"], + ["b", "c"], + ["c", "e"], + ["d", "e"], + pl.selectors.string(), + pl.selectors.integer(), + ], +) +@pytest.mark.parametrize("predicate_pushdown", [False, True]) +def test_scan_drop_nulls(subset, predicate_pushdown): + df = pl.LazyFrame( + { + "a": [1, 2, 3, 4], + "b": [None, 4, 5, None], + "c": [6, 7, None, None], + "d": [8, None, 9, 10], + "e": [None, None, "A", None], + } + ) + # Drop nulls are pushed into filters + q = df.drop_nulls(subset) + + assert_gpu_result_equal( + q, collect_kwargs={"predicate_pushdown": predicate_pushdown} + ) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index b2443e357e2..f129cc7ca32 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -6,7 +6,10 @@ import polars as pl -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) @pytest.fixture( @@ -86,3 +89,11 @@ def test_scan(df, columns, mask): if columns is not None: q = df.select(*columns) assert_gpu_result_equal(q) + + +def test_scan_unsupported_raises(tmp_path): + df = pl.DataFrame({"a": [1, 2, 3]}) + + df.write_ndjson(tmp_path / "df.json") + q = pl.scan_ndjson(tmp_path / "df.json") + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/testing/__init__.py b/python/cudf_polars/tests/testing/__init__.py new file mode 100644 index 00000000000..4611d642f14 --- /dev/null +++ b/python/cudf_polars/tests/testing/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/tests/testing/test_asserts.py b/python/cudf_polars/tests/testing/test_asserts.py new file mode 100644 index 00000000000..5bc2fe1efb7 --- /dev/null +++ b/python/cudf_polars/tests/testing/test_asserts.py @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) + + +def test_translation_assert_raises(): + df = pl.LazyFrame({"a": [1, 2, 3]}) + + # This should succeed + assert_gpu_result_equal(df) + + with pytest.raises(AssertionError): + # This should fail, because we can translate this query. + assert_ir_translation_raises(df, NotImplementedError) + + class E(Exception): + pass + + unsupported = df.group_by("a").agg(pl.col("a").cum_max().alias("b")) + # Unsupported query should raise NotImplementedError + assert_ir_translation_raises(unsupported, NotImplementedError) + + with pytest.raises(AssertionError): + # This should fail, because we can't translate this query, but it doesn't raise E. + assert_ir_translation_raises(unsupported, E)