From ad1369d2d6eabf4b0ae480a10463a74f3034aece Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 5 Sep 2024 01:11:07 +0200 Subject: [PATCH 01/19] CI: Test against old versions of key dependencies (#16570) This adds explicit tests with old versions of key dependencies. Specifically: - `numba==0.57` - `numpy==1.23` - `pandas==2.0` - ~`fsspec==0.6.0`~ excluded it. `transformers==4.39.3` requires `huggingface_hub` which requires `fsspec>=2023.5.0`. In principle one could include it e.g. only for conda which doesn't pull in `transformers`, but that seemed not worth the trouble? - `cupy==12.0.0` - `pyarrow==16.1.0` See also https://github.com/rapidsai/build-planning/issues/81 (Marking as draft until I see that things work.) Authors: - Sebastian Berg (https://github.com/seberg) - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/16570 --- ci/cudf_pandas_scripts/run_tests.sh | 13 +- ci/test_python_common.sh | 3 +- ci/test_wheel_cudf.sh | 14 ++ ci/test_wheel_cudf_polars.sh | 11 ++ ci/test_wheel_dask_cudf.sh | 13 ++ dependencies.yaml | 22 +++ .../cudf/cudf/tests/indexes/test_interval.py | 4 + .../test_avro_reader_fastavro_integration.py | 5 + python/cudf/cudf/tests/test_binops.py | 41 +++++- python/cudf/cudf/tests/test_categorical.py | 5 + python/cudf/cudf/tests/test_concat.py | 99 ++++++++----- python/cudf/cudf/tests/test_csv.py | 12 +- python/cudf/cudf/tests/test_dataframe.py | 19 ++- python/cudf/cudf/tests/test_datetime.py | 35 ++++- python/cudf/cudf/tests/test_doctests.py | 5 + python/cudf/cudf/tests/test_groupby.py | 112 +++++++++++++++ python/cudf/cudf/tests/test_index.py | 37 ++++- python/cudf/cudf/tests/test_indexing.py | 8 ++ 
python/cudf/cudf/tests/test_interpolate.py | 4 + python/cudf/cudf/tests/test_interval.py | 5 + python/cudf/cudf/tests/test_join_order.py | 130 +++++++++++++++++- python/cudf/cudf/tests/test_mvc.py | 8 +- python/cudf/cudf/tests/test_numerical.py | 3 +- python/cudf/cudf/tests/test_orc.py | 8 +- python/cudf/cudf/tests/test_parquet.py | 5 + python/cudf/cudf/tests/test_reductions.py | 5 + python/cudf/cudf/tests/test_replace.py | 20 ++- python/cudf/cudf/tests/test_resampling.py | 9 ++ python/cudf/cudf/tests/test_reshape.py | 17 ++- python/cudf/cudf/tests/test_stats.py | 8 ++ .../cudf_pandas_tests/test_cudf_pandas.py | 12 +- .../dask_cudf/tests/test_applymap.py | 6 + .../dask_cudf/tests/test_distributed.py | 5 + .../dask_cudf/dask_cudf/tests/test_groupby.py | 5 + 34 files changed, 638 insertions(+), 70 deletions(-) diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index 8b85695c861..1c2724a9a5d 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -54,8 +54,19 @@ else RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist - # echo to expand wildcard before adding `[extra]` requires for pip + echo "" > ./constraints.txt + if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then + # `test_python` constraints are for `[test]` not `[cudf-pandas-tests]` + rapids-dependency-file-generator \ + --output requirements \ + --file-key test_python \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + | tee ./constraints.txt + fi + python -m pip install \ + -v \ + --constraint ./constraints.txt \ "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,cudf-pandas-tests]" \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" diff --git 
a/ci/test_python_common.sh b/ci/test_python_common.sh index e8849588aa5..d0675b0431a 100755 --- a/ci/test_python_common.sh +++ b/ci/test_python_common.sh @@ -14,7 +14,8 @@ ENV_YAML_DIR="$(mktemp -d)" rapids-dependency-file-generator \ --output conda \ --file-key test_python \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + | tee "${ENV_YAML_DIR}/env.yaml" rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 6861d699695..28ded2f8e0f 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -10,8 +10,22 @@ RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist +rapids-logger "Install cudf, pylibcudf, and test requirements" + +# Constrain to minimum dependency versions if job is set up as "oldest" +echo "" > ./constraints.txt +if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then + rapids-dependency-file-generator \ + --output requirements \ + --file-key py_test_cudf \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + | tee ./constraints.txt +fi + # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ + -v \ + --constraint ./constraints.txt \ "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 0baf6c9e277..9844090258a 100755 --- a/ci/test_wheel_cudf_polars.sh +++ 
b/ci/test_wheel_cudf_polars.sh @@ -25,9 +25,20 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist rapids-logger "Installing cudf_polars and its dependencies" +# Constrain to minimum dependency versions if job is set up as "oldest" +echo "" > ./constraints.txt +if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then + rapids-dependency-file-generator \ + --output requirements \ + --file-key py_test_cudf_polars \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + | tee ./constraints.txt +fi # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ + -v \ + --constraint ./constraints.txt \ "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index fa74b2398f7..0d39807d56c 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -11,8 +11,21 @@ RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist +rapids-logger "Install dask_cudf, cudf, pylibcudf, and test requirements" +# Constrain to minimum dependency versions if job is set up as "oldest" +echo "" > ./constraints.txt +if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then + rapids-dependency-file-generator \ + --output requirements \ + --file-key py_test_dask_cudf \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + | tee ./constraints.txt +fi + # echo to expand wildcard before adding 
`[extra]` requires for pip python -m pip install \ + -v \ + --constraint ./constraints.txt \ "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ diff --git a/dependencies.yaml b/dependencies.yaml index c6851d9cb90..f8b231efd6d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -696,6 +696,28 @@ dependencies: - pytest<8 - pytest-cov - pytest-xdist + specific: + # Define additional constraints for testing with oldest dependencies. + - output_types: [conda, requirements] + matrices: + - matrix: {dependencies: "oldest"} + packages: + - numba==0.57.* + - numpy==1.23.* + - pandas==2.0.* + - pyarrow==14.0.0 + - cupy==12.0.0 # ignored as pip constraint + - matrix: + packages: + - output_types: requirements + # Using --constraints for pip install, so we list cupy multiple times + matrices: + - matrix: {dependencies: "oldest"} + packages: + - cupy-cuda11x==12.0.0 + - cupy-cuda12x==12.0.0 + - matrix: + packages: test_python_pylibcudf: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py index 6653a94c9be..25edf788daf 100644 --- a/python/cudf/cudf/tests/indexes/test_interval.py +++ b/python/cudf/cudf/tests/indexes/test_interval.py @@ -149,6 +149,10 @@ def test_interval_range_periods_basic_dtype(start_t, end_t, periods_t): assert_eq(pindex, gindex) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not warn on older versions of pandas", +) def test_interval_range_periods_warnings(): start_val, end_val, periods_val = 0, 4, 1.0 diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index 2ec1d1d2f28..9d69e626c3d 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ 
b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -23,6 +23,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.testing import assert_eq from cudf.testing.dataset_generator import rand_dataframe @@ -302,6 +303,10 @@ def get_days_from_epoch(date: datetime.date | None) -> int | None: @pytest.mark.parametrize("namespace", [None, "root_ns"]) @pytest.mark.parametrize("nullable", [True, False]) @pytest.mark.parametrize("prepend_null", [True, False]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas (datetime(9999, ...) too large)", +) def test_can_parse_avro_date_logical_type(namespace, nullable, prepend_null): avro_type = {"logicalType": "date", "type": "int"} if nullable: diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 4256ec872e6..2e8519509e2 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -13,7 +13,11 @@ import cudf from cudf import Index, Series -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.buffer.spill_manager import get_global_manager from cudf.testing import _utils as utils, assert_eq from cudf.utils.dtypes import ( @@ -1781,6 +1785,20 @@ def test_datetime_dateoffset_binaryop( reason="https://github.com/pandas-dev/pandas/issues/57448", ) ) + if ( + not PANDAS_GE_220 + and dtype in {"datetime64[ms]", "datetime64[s]"} + and frequency in ("microseconds", "nanoseconds") + and n_periods != 0 + ): + pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595") + if ( + not PANDAS_GE_220 + and dtype == "datetime64[us]" + and frequency == "nanoseconds" + and n_periods != 0 + ): + pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595") date_col = 
[ f"2000-01-01 00:00:{components}", @@ -1834,7 +1852,11 @@ def test_datetime_dateoffset_binaryop( "ignore:Discarding nonzero nanoseconds:UserWarning" ) @pytest.mark.parametrize("op", [operator.add, operator.sub]) -def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op): +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_datetime_dateoffset_binaryop_multiple(request, date_col, kwargs, op): gsr = cudf.Series(date_col, dtype="datetime64[ns]") psr = gsr.to_pandas() @@ -1873,6 +1895,21 @@ def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op): def test_datetime_dateoffset_binaryop_reflected( n_periods, frequency, dtype, components ): + if ( + not PANDAS_GE_220 + and dtype in {"datetime64[ms]", "datetime64[s]"} + and frequency in ("microseconds", "nanoseconds") + and n_periods != 0 + ): + pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595") + if ( + not PANDAS_GE_220 + and dtype == "datetime64[us]" + and frequency == "nanoseconds" + and n_periods != 0 + ): + pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595") + date_col = [ f"2000-01-01 00:00:{components}", f"2000-01-31 00:00:{components}", diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index ae58af8ebce..cd1ad21ae59 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -11,6 +11,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.testing import assert_eq from cudf.testing._utils import NUMERIC_TYPES, assert_exceptions_equal @@ -858,6 +859,10 @@ def test_cat_from_scalar(scalar): assert_eq(ps, gs) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not warn on older versions of pandas", +) def test_cat_groupby_fillna(): ps = pd.Series(["a", "b", "c"], dtype="category") 
gs = cudf.from_pandas(ps) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index c1c03de48d4..8da589ba45b 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -9,6 +9,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_220 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing import assert_eq from cudf.testing._utils import assert_exceptions_equal, expect_warning_if @@ -451,45 +452,75 @@ def test_concat_mixed_input(): [pd.Series([1, 2, 3]), pd.DataFrame({"a": []})], [pd.Series([], dtype="float64"), pd.DataFrame({"a": []})], [pd.Series([], dtype="float64"), pd.DataFrame({"a": [1, 2]})], - [ - pd.Series([1, 2, 3.0, 1.2], name="abc"), - pd.DataFrame({"a": [1, 2]}), - ], - [ - pd.Series( - [1, 2, 3.0, 1.2], name="abc", index=[100, 110, 120, 130] - ), - pd.DataFrame({"a": [1, 2]}), - ], - [ - pd.Series( - [1, 2, 3.0, 1.2], name="abc", index=["a", "b", "c", "d"] + pytest.param( + [ + pd.Series([1, 2, 3.0, 1.2], name="abc"), + pd.DataFrame({"a": [1, 2]}), + ], + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", ), - pd.DataFrame({"a": [1, 2]}, index=["a", "b"]), - ], - [ - pd.Series( - [1, 2, 3.0, 1.2, 8, 100], - name="New name", - index=["a", "b", "c", "d", "e", "f"], + ), + pytest.param( + [ + pd.Series( + [1, 2, 3.0, 1.2], name="abc", index=[100, 110, 120, 130] + ), + pd.DataFrame({"a": [1, 2]}), + ], + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", ), - pd.DataFrame( - {"a": [1, 2, 4, 10, 11, 12]}, - index=["a", "b", "c", "d", "e", "f"], + ), + pytest.param( + [ + pd.Series( + [1, 2, 3.0, 1.2], name="abc", index=["a", "b", "c", "d"] + ), + pd.DataFrame({"a": [1, 2]}, index=["a", "b"]), + ], + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", ), - ], - 
[ - pd.Series( - [1, 2, 3.0, 1.2, 8, 100], - name="New name", - index=["a", "b", "c", "d", "e", "f"], + ), + pytest.param( + [ + pd.Series( + [1, 2, 3.0, 1.2, 8, 100], + name="New name", + index=["a", "b", "c", "d", "e", "f"], + ), + pd.DataFrame( + {"a": [1, 2, 4, 10, 11, 12]}, + index=["a", "b", "c", "d", "e", "f"], + ), + ], + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", ), - pd.DataFrame( - {"a": [1, 2, 4, 10, 11, 12]}, - index=["a", "b", "c", "d", "e", "f"], + ), + pytest.param( + [ + pd.Series( + [1, 2, 3.0, 1.2, 8, 100], + name="New name", + index=["a", "b", "c", "d", "e", "f"], + ), + pd.DataFrame( + {"a": [1, 2, 4, 10, 11, 12]}, + index=["a", "b", "c", "d", "e", "f"], + ), + ] + * 7, + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", ), - ] - * 7, + ), ], ) def test_concat_series_dataframe_input(objs): diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 40ba415e681..cee3d23eadc 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -16,9 +16,13 @@ import cudf from cudf import read_csv -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal +from cudf.testing._utils import assert_exceptions_equal, expect_warning_if def make_numeric_dataframe(nrows, dtype): @@ -1270,14 +1274,14 @@ def test_csv_reader_delim_whitespace(): # with header row with pytest.warns(FutureWarning): cu_df = read_csv(StringIO(buffer), delim_whitespace=True) - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220): pd_df = pd.read_csv(StringIO(buffer), delim_whitespace=True) assert_eq(pd_df, cu_df) # without header row with pytest.warns(FutureWarning): cu_df 
= read_csv(StringIO(buffer), delim_whitespace=True, header=None) - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220): pd_df = pd.read_csv( StringIO(buffer), delim_whitespace=True, header=None ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 9122a1074ac..f4d1578bda7 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -26,7 +26,11 @@ import cudf from cudf.api.extensions import no_default -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.column import column from cudf.errors import MixedTypeError @@ -3561,8 +3565,11 @@ def test_dataframe_empty_sort_index(): @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("na_position", ["first", "last"]) def test_dataframe_sort_index( - index, axis, ascending, inplace, ignore_index, na_position + request, index, axis, ascending, inplace, ignore_index, na_position ): + if not PANDAS_GE_220 and axis in (1, "columns") and ignore_index: + pytest.skip(reason="Bug fixed in pandas-2.2") + pdf = pd.DataFrame( {"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]}, index=index, @@ -3612,6 +3619,10 @@ def test_dataframe_sort_index( @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("na_position", ["first", "last"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_dataframe_mulitindex_sort_index( request, axis, level, ascending, inplace, ignore_index, na_position ): @@ -6747,6 +6758,10 @@ def test_dataframe_init_from_arrays_cols(data, cols, index): None, ], ) +@pytest.mark.skipif( + PANDAS_VERSION < 
PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_dataframe_assign_scalar(request, col_data, assign_val): request.applymarker( pytest.mark.xfail( diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 7be4faa42c3..4a2345fc009 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -14,7 +14,11 @@ import cudf import cudf.testing.dataset_generator as dataset_generator from cudf import DataFrame, Series -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.index import DatetimeIndex from cudf.testing import assert_eq from cudf.testing._utils import ( @@ -801,6 +805,10 @@ def test_to_datetime_different_formats_notimplemented(): cudf.to_datetime(["2015-02-01", "2015-02-01 10:10:10"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas.", +) def test_datetime_can_cast_safely(): sr = cudf.Series( ["1679-01-01", "2000-01-31", "2261-01-01"], dtype="datetime64[ms]" @@ -847,6 +855,10 @@ def test_datetime_array_timeunit_cast(dtype): @pytest.mark.parametrize("timeunit", ["D", "W", "M", "Y"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_datetime_scalar_timeunit_cast(timeunit): testscalar = np.datetime64("2016-11-20", timeunit) @@ -1535,6 +1547,10 @@ def test_date_range_start_end_periods(start, end, periods): ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_date_range_start_end_freq(start, end, freq): if isinstance(freq, str): _gfreq = _pfreq = freq @@ -1551,6 +1567,10 @@ def test_date_range_start_end_freq(start, end, freq): ) +@pytest.mark.skipif( + PANDAS_VERSION < 
PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_date_range_start_freq_periods(start, freq, periods): if isinstance(freq, str): _gfreq = _pfreq = freq @@ -1643,6 +1663,9 @@ def test_date_range_raise_overflow(): ], ) def test_date_range_raise_unsupported(freqstr_unsupported): + if not PANDAS_GE_220 and freqstr_unsupported.endswith("E"): + pytest.skip(reason="YE, etc. support was added in pandas 2.2") + s, e = "2001-01-01", "2008-01-31" pd.date_range(start=s, end=e, freq=freqstr_unsupported) with pytest.raises(ValueError, match="does not yet support"): @@ -1654,7 +1677,7 @@ def test_date_range_raise_unsupported(freqstr_unsupported): if freqstr_unsupported != "3MS": freqstr_unsupported = freqstr_unsupported.lower() with pytest.raises(ValueError, match="does not yet support"): - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220): cudf.date_range(start=s, end=e, freq=freqstr_unsupported) @@ -1995,6 +2018,10 @@ def test_first(idx, offset): ) ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) def test_first_start_at_end_of_month(idx, offset): p = pd.Series(range(len(idx)), index=idx) g = cudf.from_pandas(p) @@ -2319,6 +2346,10 @@ def test_datetime_to_str(data, dtype): assert_eq(actual.to_pandas(nullable=True), expected) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_datetime_string_to_datetime_resolution_loss_raises(): data = ["2020-01-01 00:00:00.00001"] dtype = "datetime64[s]" diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index 794660cffcb..5d3d18cbe95 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -11,6 +11,7 @@ from packaging import version import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, 
PANDAS_VERSION pytestmark = pytest.mark.filterwarnings("ignore::FutureWarning") @@ -96,6 +97,10 @@ def prinoptions(cls): itertools.chain(*[_find_doctests_in_obj(mod) for mod in tests]), ids=lambda docstring: docstring.name, ) + @pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Doctests not expected to pass on older versions of pandas", + ) def test_docstring(self, docstring): # We ignore differences in whitespace in the doctest output, and enable # the use of an ellipsis "..." to match any string in the doctest diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 74f04c0584f..0aaa71e50d7 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -188,6 +188,10 @@ def test_groupby_as_index_single_agg(pdf, gdf, as_index): @pytest.mark.parametrize("engine", ["cudf", "jit"]) @pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) def test_groupby_as_index_apply(pdf, gdf, as_index, engine): gdf = gdf.groupby("y", as_index=as_index).apply( lambda df: df["x"].mean(), engine=engine @@ -298,6 +302,10 @@ def assert_values_equal(arr): assert_values_equal(pddf[k].values) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply(): np.random.seed(0) df = DataFrame() @@ -338,6 +346,10 @@ def f3(df, k, L, m): @pytest.mark.parametrize("func,args", create_test_groupby_apply_args_params()) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_args(func, args): np.random.seed(0) df = DataFrame() @@ -500,6 +512,10 @@ def func(df): "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] ) @pytest.mark.parametrize("dataset", ["small", 
"large", "nans"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) def test_groupby_apply_jit_unary_reductions( func, dtype, dataset, groupby_jit_datasets ): @@ -530,6 +546,10 @@ def func(df): # test unary index reductions for special values +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def groupby_apply_jit_idx_reductions_special_vals_inner( func, data, dtype, special_val ): @@ -555,6 +575,10 @@ def func(df): @pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) @pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) @pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) def test_groupby_apply_jit_reductions_special_vals( func, dtype, dataset, groupby_jit_datasets, special_val ): @@ -583,6 +607,10 @@ def test_groupby_apply_jit_reductions_special_vals( ], ) @pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="include_groups keyword new in pandas 2.2", +) def test_groupby_apply_jit_idx_reductions_special_vals( func, dtype, dataset, groupby_jit_datasets, special_val ): @@ -593,6 +621,10 @@ def test_groupby_apply_jit_idx_reductions_special_vals( @pytest.mark.parametrize("dtype", ["int32"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_sum_integer_overflow(dtype): max = np.iinfo(dtype).max @@ -627,6 +659,10 @@ def func(group): "large", ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_correlation(dataset, 
groupby_jit_datasets, dtype): dataset = groupby_jit_datasets[dataset] @@ -653,6 +689,10 @@ def func(group): @pytest.mark.parametrize("dtype", ["int32", "int64"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_correlation_zero_variance(dtype): # pearson correlation is undefined when the variance of either # variable is zero. This test ensures that the jit implementation @@ -711,6 +751,10 @@ def func(group): @pytest.mark.parametrize("dtype", ["uint8", "str"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_unsupported_dtype(dtype): df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) df["b"] = df["b"].astype(dtype) @@ -739,6 +783,10 @@ def func(group): lambda df: df["val1"].mean() + df["val2"].std(), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_basic(func, groupby_jit_data_small): run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1", "key2"]) @@ -759,12 +807,20 @@ def f3(df, k, L, m): @pytest.mark.parametrize( "func,args", create_test_groupby_apply_jit_args_params() ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_args(func, args, groupby_jit_data_small): run_groupby_apply_jit_test( groupby_jit_data_small, func, ["key1", "key2"], *args ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_block_divergence(): # https://github.com/rapidsai/cudf/issues/12686 df = cudf.DataFrame( @@ -782,6 +838,10 @@ def diverging_block(grp_df): run_groupby_apply_jit_test(df, diverging_block, ["a"]) +@pytest.mark.skipif( + PANDAS_VERSION < 
PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_caching(): # Make sure similar functions that differ # by simple things like constants actually @@ -818,6 +878,10 @@ def f(group): assert precompiled.currsize == 3 +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_no_bytecode_fallback(): # tests that a function which contains no bytecode # attribute, but would still be executable using @@ -836,6 +900,10 @@ def f(group): assert_groupby_results_equal(expect, got) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_return_col_from_df(): # tests a UDF that consists of purely colwise # ops, such as `lambda group: group.x + group.y` @@ -862,6 +930,10 @@ def func(df): @pytest.mark.parametrize("func", [lambda group: group.sum()]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_return_df(func): # tests a UDF that reduces over a dataframe # and produces a series with the original column names @@ -1940,6 +2012,10 @@ def test_groupby_agg_combinations(agg): ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) def test_groupby_apply_noempty_group(): pdf = pd.DataFrame( {"a": [1, 1, 2, 2], "b": [1, 2, 1, 2], "c": [1, 2, 3, 4]} @@ -2208,6 +2284,10 @@ def f3(x, k, L, m): @pytest.mark.parametrize( "func,args", create_test_groupby_apply_return_scalars_params() ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_return_scalars(func, args): pdf = pd.DataFrame( { @@ -2266,6 +2346,10 @@ def f5(x, k, L, m): @pytest.mark.parametrize( "func,args", 
create_test_groupby_apply_return_series_dataframe_params() ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) def test_groupby_apply_return_series_dataframe(func, args): pdf = pd.DataFrame( {"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]} @@ -2744,6 +2828,10 @@ def test_groupby_diff_row_zero_shift(nelem): # TODO: test for category columns when cudf.Scalar supports category type @pytest.mark.parametrize("nelem", [10, 100, 1000]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) def test_groupby_fillna_multi_value(nelem): t = rand_dataframe( dtypes_meta=[ @@ -2790,6 +2878,10 @@ def test_groupby_fillna_multi_value(nelem): # TODO: test for category columns when cudf.Scalar supports category type # TODO: cudf.fillna does not support decimal column to column fill yet @pytest.mark.parametrize("nelem", [10, 100, 1000]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) def test_groupby_fillna_multi_value_df(nelem): t = rand_dataframe( dtypes_meta=[ @@ -2843,6 +2935,10 @@ def test_groupby_fillna_multi_value_df(nelem): "data", [[1, None, 2, None, 3, None], [1, 2, 3, 4, 5, 6]] ) @pytest.mark.parametrize("args", [{"value": 42}, {"method": "ffill"}]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) def test_groupby_various_by_fillna(by, data, args): ps = pd.Series(data) gs = cudf.from_pandas(ps) @@ -3146,6 +3242,10 @@ def test_groupby_freq_s(label, closed): ), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warnings only given on newer versions.", +) def test_groupby_get_group(pdf, group, name, obj): gdf = cudf.from_pandas(pdf) @@ -3644,6 +3744,10 @@ def 
test_group_by_pandas_sort_order(groups, sort): "last", ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_group_by_empty_reduction(dtype, reduce_op): gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype=dtype) pdf = gdf.to_pandas() @@ -3664,6 +3768,10 @@ def test_group_by_empty_reduction(dtype, reduce_op): "apply_op", ["sum", "min", "max", "idxmax"], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_group_by_empty_apply(request, dtype, apply_op): request.applymarker( pytest.mark.xfail( @@ -3719,6 +3827,10 @@ def test_groupby_consecutive_operations(): assert_groupby_results_equal(actual, expected, check_dtype=False) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warning only given on newer versions.", +) def test_categorical_grouping_pandas_compatibility(): gdf = cudf.DataFrame( { diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 722a64cb553..3f483219423 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -16,6 +16,11 @@ import cudf from cudf.api.extensions import no_default +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex from cudf.testing import assert_eq from cudf.testing._utils import ( @@ -791,9 +796,27 @@ def test_index_to_series(data): "name_data,name_other", [("abc", "c"), (None, "abc"), ("abc", pd.NA), ("abc", "abc")], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_index_difference(data, other, sort, name_data, name_other): pd_data = pd.Index(data, name=name_data) pd_other = pd.Index(other, name=name_other) + if ( + not PANDAS_GE_220 + and 
isinstance(pd_data.dtype, pd.CategoricalDtype) + and not isinstance(pd_other.dtype, pd.CategoricalDtype) + and pd_other.isnull().any() + ): + pytest.skip(reason="https://github.com/pandas-dev/pandas/issues/57318") + + if ( + not PANDAS_GE_220 + and len(pd_other) == 0 + and len(pd_data) != len(pd_data.unique()) + ): + pytest.skip(reason="Bug fixed in pandas-2.2+") gd_data = cudf.from_pandas(pd_data) gd_other = cudf.from_pandas(pd_other) @@ -1017,6 +1040,10 @@ def test_index_equal_misc(data, other): ["abcd", "defgh", "werty", "poiu"], ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not warn on older versions of pandas", +) def test_index_append(data, other): pd_data = pd.Index(data) pd_other = pd.Index(other) @@ -1220,6 +1247,10 @@ def test_index_append_error(data, other): ), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not warn on older versions of pandas", +) def test_index_append_list(data, other): pd_data = data pd_other = other @@ -2084,6 +2115,10 @@ def test_get_indexer_multi_numeric_deviate(key, method): @pytest.mark.parametrize("method", ["ffill", "bfill"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_get_indexer_multi_error(method): pi = pd.MultiIndex.from_tuples( [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 10), (1, 1, 1), (2, 2, 1)] @@ -2527,7 +2562,7 @@ def test_isin_index(index, values): ) with expect_warning_if(is_dt_str): got = gidx.isin(values) - with expect_warning_if(is_dt_str): + with expect_warning_if(PANDAS_GE_220 and is_dt_str): expected = pidx.isin(values) assert_eq(got, expected) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 9df2852dde8..00ae99466bb 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1016,6 +1016,10 @@ def test_series_setitem_iloc(key, value, 
nulls): (slice(0, 2), [0.5, 0.25]), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_series_setitem_dtype(key, value): psr = pd.Series([1, 2, 3], dtype="int32") gsr = cudf.from_pandas(psr) @@ -1634,6 +1638,10 @@ def test_dataframe_loc_iloc_inplace_update_with_RHS_dataframe( assert_eq(expected, actual) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="No warning in older versions of pandas", +) def test_dataframe_loc_inplace_update_with_invalid_RHS_df_columns(): gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) pdf = gdf.to_pandas() diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index a4f0b9fc97e..c76a49103e2 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -125,6 +125,10 @@ def test_interpolate_series_values_or_index(data, index, method): ), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not fail on older versions of pandas", +) def test_interpolate_dataframe_error_cases(data, kwargs): gsr = cudf.DataFrame(data) psr = gsr.to_pandas() diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 2d194107658..5e1dd33fbf1 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -6,6 +6,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_220 from cudf.testing import assert_eq @@ -168,6 +169,10 @@ def test_interval_index_unique(): @pytest.mark.parametrize("box", [pd.Series, pd.IntervalIndex]) @pytest.mark.parametrize("tz", ["US/Eastern", None]) +@pytest.mark.skipif( + condition=not PANDAS_GE_220, + reason="ME frequency new in pandas 2.2", +) def test_interval_with_datetime(tz, box): dti = pd.date_range( start=pd.Timestamp("20180101", tz=tz), diff --git 
a/python/cudf/cudf/tests/test_join_order.py b/python/cudf/cudf/tests/test_join_order.py index 9ea4ba007d2..9a95f0e01ab 100644 --- a/python/cudf/cudf/tests/test_join_order.py +++ b/python/cudf/cudf/tests/test_join_order.py @@ -1,13 +1,19 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. import itertools +import operator import string +from collections import defaultdict import numpy as np import pytest import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.testing import assert_eq @@ -35,10 +41,124 @@ def right(): # Behaviour in sort=False case didn't match documentation in many # cases prior to https://github.com/pandas-dev/pandas/pull/54611 # (released as part of pandas 2.2) -def expected(left, right, sort, *, how): - left = left.to_pandas() - right = right.to_pandas() - return left.merge(right, on="key", how=how, sort=sort) +if PANDAS_GE_220: + # Behaviour in sort=False case didn't match documentation in many + # cases prior to https://github.com/pandas-dev/pandas/pull/54611 + # (released as part of pandas 2.2) + def expected(left, right, sort, *, how): + left = left.to_pandas() + right = right.to_pandas() + return left.merge(right, on="key", how=how, sort=sort) + +else: + + def expect_inner(left, right, sort): + left_key = left.key.values_host.tolist() + left_val = left.val.values_host.tolist() + right_key = right.key.values_host.tolist() + right_val = right.val.values_host.tolist() + + right_have = defaultdict(list) + for i, k in enumerate(right_key): + right_have[k].append(i) + keys = [] + val_x = [] + val_y = [] + for k, v in zip(left_key, left_val): + if k not in right_have: + continue + for i in right_have[k]: + keys.append(k) + val_x.append(v) + val_y.append(right_val[i]) + + if sort: + # Python sort is stable, so this will preserve input order for + # equal items. 
+ keys, val_x, val_y = zip( + *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) + ) + return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) + + def expect_left(left, right, sort): + left_key = left.key.values_host.tolist() + left_val = left.val.values_host.tolist() + right_key = right.key.values_host.tolist() + right_val = right.val.values_host.tolist() + + right_have = defaultdict(list) + for i, k in enumerate(right_key): + right_have[k].append(i) + keys = [] + val_x = [] + val_y = [] + for k, v in zip(left_key, left_val): + if k not in right_have: + right_vals = [None] + else: + right_vals = [right_val[i] for i in right_have[k]] + + for rv in right_vals: + keys.append(k) + val_x.append(v) + val_y.append(rv) + + if sort: + # Python sort is stable, so this will preserve input order for + # equal items. + keys, val_x, val_y = zip( + *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) + ) + return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) + + def expect_outer(left, right, sort): + left_key = left.key.values_host.tolist() + left_val = left.val.values_host.tolist() + right_key = right.key.values_host.tolist() + right_val = right.val.values_host.tolist() + right_have = defaultdict(list) + for i, k in enumerate(right_key): + right_have[k].append(i) + keys = [] + val_x = [] + val_y = [] + for k, v in zip(left_key, left_val): + if k not in right_have: + right_vals = [None] + else: + right_vals = [right_val[i] for i in right_have[k]] + for rv in right_vals: + keys.append(k) + val_x.append(v) + val_y.append(rv) + left_have = set(left_key) + for k, v in zip(right_key, right_val): + if k not in left_have: + keys.append(k) + val_x.append(None) + val_y.append(v) + + # Python sort is stable, so this will preserve input order for + # equal items. 
+ # outer joins are always sorted, but we test both sort values + keys, val_x, val_y = zip( + *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) + ) + return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) + + def expected(left, right, sort, *, how): + if how == "inner": + return expect_inner(left, right, sort) + elif how == "outer": + return expect_outer(left, right, sort) + elif how == "left": + return expect_left(left, right, sort) + elif how == "right": + return expect_left(right, left, sort).rename( + {"val_x": "val_y", "val_y": "val_x"}, axis=1 + ) + else: + raise NotImplementedError() @pytest.mark.parametrize("how", ["inner", "left", "right", "outer"]) diff --git a/python/cudf/cudf/tests/test_mvc.py b/python/cudf/cudf/tests/test_mvc.py index 7dd25ebc500..055bc5757b3 100644 --- a/python/cudf/cudf/tests/test_mvc.py +++ b/python/cudf/cudf/tests/test_mvc.py @@ -1,8 +1,9 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
import subprocess import sys import pytest +from packaging import version IS_CUDA_11 = False IS_CUDA_12 = False @@ -14,9 +15,12 @@ # do not test cuda 12 if pynvjitlink isn't present HAVE_PYNVJITLINK = False try: + import numba import pynvjitlink # noqa: F401 - HAVE_PYNVJITLINK = True + HAVE_PYNVJITLINK = version.parse(numba.__version__) >= version.parse( + "0.58" + ) except ModuleNotFoundError: pass diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index 1b0589254f5..b1a2f081cd2 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -5,6 +5,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_220 from cudf.testing import assert_eq from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes @@ -373,7 +374,7 @@ def test_to_numeric_error(data, errors): ): cudf.to_numeric(data, errors=errors) else: - with expect_warning_if(errors == "ignore"): + with expect_warning_if(PANDAS_GE_220 and errors == "ignore"): expect = pd.to_numeric(data, errors=errors) with expect_warning_if(errors == "ignore"): got = cudf.to_numeric(data, errors=errors) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index e0884a5819a..c2a30b76bea 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1679,7 +1679,13 @@ def run_orc_columns_and_index_param(index_obj, index, columns): "columns", [ None, - [], + pytest.param( + [], + marks=pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Bug in older version of pandas", + ), + ), ], ) def test_orc_columns_and_index_param(index_obj, index, columns): diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 6623c537ddf..8b59a7eef08 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -23,6 
+23,7 @@ import cudf from cudf._lib.parquet import read_parquet_chunked +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.io.parquet import ( ParquetDatasetWriter, ParquetWriter, @@ -3034,6 +3035,10 @@ def test_parquet_reader_rle_boolean(datadir): # a list column in a schema, the cudf reader was confusing # nesting information between a list column and a subsequent # string column, ultimately causing a crash. +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Older versions of pandas do not have DataFrame.map()", +) def test_parquet_reader_one_level_list2(datadir): # we are reading in a file containing binary types, but cudf returns # those as strings. so we have to massage the pandas data to get diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index a70a2ea15dd..f276f394cd0 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -10,6 +10,7 @@ import cudf from cudf import Series +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing import _utils as utils, assert_eq from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if, gen_rand @@ -342,6 +343,10 @@ def test_any_all_axis_none(data, op): "median", ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warning not given on older versions of pandas", +) def test_reductions_axis_none_warning(op): df = cudf.DataFrame({"a": [1, 2, 3], "b": [10, 2, 3]}) pdf = df.to_pandas() diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index e5ee0127a74..3a8928297c0 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -10,7 +10,11 @@ import pytest import cudf -from cudf.core._compat import 
PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing import assert_eq from cudf.testing._utils import ( @@ -66,7 +70,7 @@ def test_series_replace_all(gsr, to_replace, value): ) with expect_warning_if(expect_warn): actual = gsr.replace(to_replace=gd_to_replace, value=gd_value) - with expect_warning_if(expect_warn): + with expect_warning_if(expect_warn and PANDAS_GE_220): if pd_value is None: # TODO: Remove this workaround once cudf # introduces `no_default` values @@ -91,7 +95,7 @@ def test_series_replace(): # Categorical psr3 = pd.Series(["one", "two", "three"], dtype="category") - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220, FutureWarning): psr4 = psr3.replace("one", "two") sr3 = cudf.from_pandas(psr3) with pytest.warns(FutureWarning): @@ -100,7 +104,7 @@ def test_series_replace(): psr4.sort_values().reset_index(drop=True), sr4.sort_values().reset_index(drop=True), ) - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220, FutureWarning): psr5 = psr3.replace("one", "five") with pytest.warns(FutureWarning): sr5 = sr3.replace("one", "five") @@ -517,7 +521,7 @@ def test_fillna_categorical(psr_data, fill_value, inplace): pd.date_range( "2010-01-01", "2020-01-10", - freq="1YE", + freq="1YE" if PANDAS_GE_220 else "1y", ) ), pd.Series(["2010-01-01", None, "2011-10-10"], dtype="datetime64[ns]"), @@ -564,7 +568,7 @@ def test_fillna_categorical(psr_data, fill_value, inplace): pd.date_range( "2010-01-01", "2020-01-10", - freq="1YE", + freq="1YE" if PANDAS_GE_220 else "1y", ) ) + pd.Timedelta("1d"), @@ -1069,6 +1073,10 @@ def test_numeric_series_replace_dtype(series_dtype, replacement): ), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warning not given on older versions of pandas", +) def 
test_replace_inplace(pframe, replace_args): gpu_frame = cudf.from_pandas(pframe) pandas_frame = pframe.copy() diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py index 95fa8e9a50a..a61477981f8 100644 --- a/python/cudf/cudf/tests/test_resampling.py +++ b/python/cudf/cudf/tests/test_resampling.py @@ -5,6 +5,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.testing import assert_eq @@ -147,6 +148,10 @@ def test_dataframe_resample_level(): ("10D", "1D", "s"), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_resampling_frequency_conversion(in_freq, sampling_freq, out_freq): # test that we cast to the appropriate frequency # when resampling: @@ -164,6 +169,10 @@ def test_resampling_frequency_conversion(in_freq, sampling_freq, out_freq): assert got.index.dtype == np.dtype(f"datetime64[{out_freq}]") +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_resampling_downsampling_ms(): pdf = pd.DataFrame( { diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 50db4302b75..4235affd4d1 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -8,10 +8,19 @@ import pytest import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.buffer.spill_manager import get_global_manager from cudf.testing import assert_eq -from cudf.testing._utils import ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES +from cudf.testing._utils import ( + ALL_TYPES, + DATETIME_TYPES, + NUMERIC_TYPES, + expect_warning_if, +) pytest_xfail = pytest.mark.xfail pytestmark = pytest.mark.spilling @@ 
-220,7 +229,7 @@ def test_df_stack_multiindex_column_axis(columns, index, level, dropna): with pytest.warns(FutureWarning): got = gdf.stack(level=level, dropna=dropna, future_stack=False) - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220, FutureWarning): expect = pdf.stack(level=level, dropna=dropna, future_stack=False) assert_eq(expect, got, check_dtype=False) @@ -265,7 +274,7 @@ def test_df_stack_multiindex_column_axis_pd_example(level): df = pd.DataFrame(np.random.randn(4, 4), columns=columns) - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220, FutureWarning): expect = df.stack(level=level, future_stack=False) gdf = cudf.from_pandas(df) with pytest.warns(FutureWarning): diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index d5f63fdab77..f952cea07f8 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -447,6 +447,10 @@ def test_cov1d(data1, data2): ], ) @pytest.mark.parametrize("method", ["spearman", "pearson"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warnings missing on older pandas (scipy version seems unrelated?)", +) def test_corr1d(data1, data2, method): if method == "spearman": # Pandas uses scipy.stats.spearmanr code-path @@ -585,6 +589,10 @@ def test_min_count_ops(data, ops, skipna, min_count): ], ) @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_cov_corr_datetime_timedelta(data1, data2, dtype): gsr1 = cudf.Series(data1, dtype=dtype) gsr2 = cudf.Series(data2, dtype=dtype) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 505d5d0b9cc..d10c531d757 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ 
b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -23,6 +23,7 @@ from numba import NumbaDeprecationWarning from pytz import utc +from cudf.core._compat import PANDAS_GE_220 from cudf.pandas import LOADED, Profiler from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object @@ -536,12 +537,15 @@ def test_array_ufunc(series): @pytest.mark.xfail(strict=False, reason="Fails in CI, passes locally.") def test_groupby_apply_func_returns_series(dataframe): pdf, df = dataframe + if PANDAS_GE_220: + kwargs = {"include_groups": False} + else: + kwargs = {} + expect = pdf.groupby("a").apply( - lambda group: pd.Series({"x": 1}), include_groups=False - ) - got = df.groupby("a").apply( - lambda group: xpd.Series({"x": 1}), include_groups=False + lambda group: pd.Series({"x": 1}), **kwargs ) + got = df.groupby("a").apply(lambda group: xpd.Series({"x": 1}), **kwargs) tm.assert_equal(expect, got) diff --git a/python/dask_cudf/dask_cudf/tests/test_applymap.py b/python/dask_cudf/dask_cudf/tests/test_applymap.py index d84235481c3..e4e79b7b8cf 100644 --- a/python/dask_cudf/dask_cudf/tests/test_applymap.py +++ b/python/dask_cudf/dask_cudf/tests/test_applymap.py @@ -5,6 +5,8 @@ from dask import dataframe as dd +from cudf.core._compat import PANDAS_GE_210 + from dask_cudf.tests.utils import _make_random_frame @@ -18,6 +20,10 @@ ], ) @pytest.mark.parametrize("has_na", [True, False]) +@pytest.mark.skipif( + not PANDAS_GE_210, + reason="DataFrame.map requires pandas>=2.1.0", +) def test_applymap_basic(func, has_na): size = 2000 pdf, dgdf = _make_random_frame(size, include_na=False) diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py index be10b0d4843..d03180852eb 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -80,6 +80,11 @@ def test_str_series_roundtrip(): def test_p2p_shuffle(): + pytest.importorskip( + "pyarrow", + 
minversion="14.0.1", + reason="P2P shuffling requires pyarrow>=14.0.1", + ) # Check that we can use `shuffle_method="p2p"` with dask_cuda.LocalCUDACluster(n_workers=1) as cluster: with Client(cluster): diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index cf916b713b2..7b9f0ca328a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -9,6 +9,7 @@ from dask.utils_test import hlg_layer import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.testing._utils import expect_warning_if import dask_cudf @@ -316,6 +317,10 @@ def test_groupby_dropna_cudf(dropna, by): (None, ["a", "d"]), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_dropna_dask(dropna, by): # NOTE: This test is borrowed from upstream dask # (dask/dask/dataframe/tests/test_groupby.py) From e1ab1e799d7a29289419014e19ec5c6f2e99ae91 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 5 Sep 2024 09:48:03 -0400 Subject: [PATCH 02/19] Make isinstance check pass for proxy ndarrays (#16601) Closes #14537. 
Authors: - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16601 --- python/cudf/cudf/pandas/_wrappers/numpy.py | 23 +++++++++ python/cudf/cudf/pandas/fast_slow_proxy.py | 26 +++++++++- python/cudf/cudf/pandas/proxy_base.py | 22 ++++++++ .../cudf_pandas_tests/test_cudf_pandas.py | 50 ++++++++++++++++++- 4 files changed, 119 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/pandas/proxy_base.py diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py index 90ac5198270..d5e669cb58f 100644 --- a/python/cudf/cudf/pandas/_wrappers/numpy.py +++ b/python/cudf/cudf/pandas/_wrappers/numpy.py @@ -10,10 +10,13 @@ from packaging import version from ..fast_slow_proxy import ( + _fast_slow_function_call, _FastSlowAttribute, + is_proxy_object, make_final_proxy_type, make_intermediate_proxy_type, ) +from ..proxy_base import ProxyNDarrayBase from .common import ( array_interface, array_method, @@ -105,18 +108,38 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor): return super(cls, cls)._fsproxy_wrap(arr, constructor) +def ndarray__array_ufunc__(self, ufunc, method, *inputs, **kwargs): + result, _ = _fast_slow_function_call( + getattr(ufunc, method), + *inputs, + **kwargs, + ) + if isinstance(result, tuple): + if is_proxy_object(result[0]) and isinstance( + result[0]._fsproxy_wrapped, numpy.ndarray + ): + return tuple(numpy.asarray(x) for x in result) + elif is_proxy_object(result) and isinstance( + result._fsproxy_wrapped, numpy.ndarray + ): + return numpy.asarray(result) + return result + + ndarray = make_final_proxy_type( "ndarray", cupy.ndarray, numpy.ndarray, fast_to_slow=cupy.ndarray.get, slow_to_fast=cupy.asarray, + bases=(ProxyNDarrayBase,), 
additional_attributes={ "__array__": array_method, # So that pa.array(wrapped-numpy-array) works "__arrow_array__": arrow_array_method, "__cuda_array_interface__": cuda_array_interface, "__array_interface__": array_interface, + "__array_ufunc__": ndarray__array_ufunc__, # ndarrays are unhashable "__hash__": None, # iter(cupy-array) produces an iterable of zero-dim device diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 4b0fd9a5b36..afa1ce5f86c 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -19,6 +19,7 @@ from ..options import _env_get_bool from ..testing import assert_eq from .annotation import nvtx +from .proxy_base import ProxyNDarrayBase def call_operator(fn, args, kwargs): @@ -564,7 +565,17 @@ def _fsproxy_wrap(cls, value, func): _FinalProxy subclasses can override this classmethod if they need particular behaviour when wrapped up. """ - proxy = object.__new__(cls) + # TODO: Replace the if-elif-else using singledispatch helper function + base_class = _get_proxy_base_class(cls) + if base_class is object: + proxy = base_class.__new__(cls) + elif base_class is ProxyNDarrayBase: + proxy = base_class.__new__(cls, value) + else: + raise TypeError( + f"Cannot create an proxy instance of {cls.__name__} using base class {base_class.__name__}. 
" + f"Expected either 'object' or another type in 'PROXY_BASE_CLASSES'" + ) proxy._fsproxy_wrapped = value return proxy @@ -1193,6 +1204,19 @@ def is_proxy_object(obj: Any) -> bool: return False +def _get_proxy_base_class(cls): + """Returns the proxy base class if one exists""" + for proxy_class in PROXY_BASE_CLASSES: + if proxy_class in cls.__mro__: + return proxy_class + return object + + +PROXY_BASE_CLASSES: set[type] = { + ProxyNDarrayBase, +} + + NUMPY_TYPES: set[str] = set(np.sctypeDict.values()) diff --git a/python/cudf/cudf/pandas/proxy_base.py b/python/cudf/cudf/pandas/proxy_base.py new file mode 100644 index 00000000000..6f732834e94 --- /dev/null +++ b/python/cudf/cudf/pandas/proxy_base.py @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import cupy as cp +import numpy as np + + +class ProxyNDarrayBase(np.ndarray): + def __new__(cls, arr): + if isinstance(arr, cp.ndarray): + arr = arr.get() + if not isinstance(arr, np.ndarray): + raise TypeError( + "Unsupported array type. 
Must be numpy.ndarray or cupy.ndarray" + ) + return np.asarray(arr, dtype=arr.dtype).view(cls) + + def __array_finalize__(self, obj): + if obj is None: + return + self._fsproxy_wrapped = getattr(obj, "_fsproxy_wrapped", obj) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index d10c531d757..c4ab4b0a853 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -14,18 +14,20 @@ import types from io import BytesIO, StringIO +import cupy as cp import jupyter_client import nbformat import numpy as np import pyarrow as pa import pytest from nbconvert.preprocessors import ExecutePreprocessor -from numba import NumbaDeprecationWarning +from numba import NumbaDeprecationWarning, vectorize from pytz import utc from cudf.core._compat import PANDAS_GE_220 from cudf.pandas import LOADED, Profiler from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object +from cudf.testing import assert_eq if not LOADED: raise ImportError("These tests must be run with cudf.pandas loaded") @@ -1690,3 +1692,49 @@ def test_notebook_slow_repr(): assert ( string in html_result ), f"Expected string {string} not found in the output" + + +def test_numpy_ndarray_isinstancecheck(array): + arr1, arr2 = array + assert isinstance(arr1, np.ndarray) + assert isinstance(arr2, np.ndarray) + + +def test_numpy_ndarray_np_ufunc(array): + arr1, arr2 = array + + @np.vectorize + def add_one_ufunc(arr): + return arr + 1 + + assert_eq(add_one_ufunc(arr1), add_one_ufunc(arr2)) + + +def test_numpy_ndarray_cp_ufunc(array): + arr1, arr2 = array + + @cp.vectorize + def add_one_ufunc(arr): + return arr + 1 + + assert_eq(add_one_ufunc(cp.asarray(arr1)), add_one_ufunc(arr2)) + + +def test_numpy_ndarray_numba_ufunc(array): + arr1, arr2 = array + + @vectorize + def add_one_ufunc(arr): + return arr + 1 + + assert_eq(add_one_ufunc(arr1), add_one_ufunc(arr2)) + + +def 
test_numpy_ndarray_numba_cuda_ufunc(array): + arr1, arr2 = array + + @vectorize(["int64(int64)"], target="cuda") + def add_one_ufunc(a): + return a + 1 + + assert_eq(cp.asarray(add_one_ufunc(arr1)), cp.asarray(add_one_ufunc(arr2))) From 949f1719226f0b27a4df8fedbf4624f46fb0589d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 5 Sep 2024 09:52:01 -0400 Subject: [PATCH 03/19] Performance improvement for strings::slice for wide strings (#16574) Improves performance of wide strings (avg > 64 bytes) when using `cudf::strings::slice_strings`. Addresses some concerns from issue #15924 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/16574 --- cpp/src/strings/slice.cu | 182 ++++++++++++++++++++++++++++++--------- 1 file changed, 141 insertions(+), 41 deletions(-) diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index cf82a837c51..d8324a9b08e 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,8 @@ #include #include +#include +#include #include #include #include @@ -40,6 +43,9 @@ namespace cudf { namespace strings { namespace detail { namespace { + +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 128; + /** * @brief Function logic for compute_substrings_from_fn API * @@ -51,17 +57,19 @@ struct substring_from_fn { IndexIterator const starts; IndexIterator const stops; - __device__ string_view operator()(size_type idx) const + __device__ string_index_pair operator()(size_type idx) const { - if (d_column.is_null(idx)) { return string_view{nullptr, 0}; } + if (d_column.is_null(idx)) { return string_index_pair{nullptr, 0}; } auto const d_str = d_column.template element(idx); auto const length = d_str.length(); auto const start = std::max(starts[idx], 0); 
- if (start >= length) { return string_view{}; } + if (start >= length) { return string_index_pair{"", 0}; } - auto const stop = stops[idx]; - auto const end = (((stop < 0) || (stop > length)) ? length : stop); - return start < end ? d_str.substr(start, end - start) : string_view{}; + auto const stop = stops[idx]; + auto const end = (((stop < 0) || (stop > length)) ? length : stop); + auto const sub_str = start < end ? d_str.substr(start, end - start) : string_view{}; + return sub_str.empty() ? string_index_pair{"", 0} + : string_index_pair{sub_str.data(), sub_str.size_bytes()}; } substring_from_fn(column_device_view const& d_column, IndexIterator starts, IndexIterator stops) @@ -70,6 +78,82 @@ struct substring_from_fn { } }; +template +CUDF_KERNEL void substring_from_kernel(column_device_view const d_strings, + IndexIterator starts, + IndexIterator stops, + string_index_pair* d_output) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = idx / cudf::detail::warp_size; + if (str_idx >= d_strings.size()) { return; } + + namespace cg = cooperative_groups; + auto const warp = cg::tiled_partition(cg::this_thread_block()); + + if (d_strings.is_null(str_idx)) { + if (warp.thread_rank() == 0) { d_output[str_idx] = string_index_pair{nullptr, 0}; } + return; + } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { + if (warp.thread_rank() == 0) { d_output[str_idx] = string_index_pair{"", 0}; } + return; + } + + auto const start = max(starts[str_idx], 0); + auto stop = [stop = stops[str_idx]] { + return (stop < 0) ? 
std::numeric_limits::max() : stop; + }(); + auto const end = d_str.data() + d_str.size_bytes(); + + auto start_counts = thrust::make_pair(0, 0); + auto stop_counts = thrust::make_pair(0, 0); + + auto itr = d_str.data() + warp.thread_rank(); + + size_type char_count = 0; + size_type byte_count = 0; + while (byte_count < d_str.size_bytes()) { + if (char_count <= start) { start_counts = {char_count, byte_count}; } + if (char_count <= stop) { + stop_counts = {char_count, byte_count}; + } else { + break; + } + size_type const cc = (itr < end) && is_begin_utf8_char(*itr); + size_type const bc = (itr < end); + char_count += cg::reduce(warp, cc, cg::plus()); + byte_count += cg::reduce(warp, bc, cg::plus()); + itr += cudf::detail::warp_size; + } + + if (warp.thread_rank() == 0) { + if (start >= char_count) { + d_output[str_idx] = string_index_pair{"", 0}; + return; + } + + // we are just below start/stop and must now increment up to it from here + auto first_byte = start_counts.second; + if (start_counts.first < start) { + auto const sub_str = string_view(d_str.data() + first_byte, d_str.size_bytes() - first_byte); + first_byte += std::get<0>(bytes_to_character_position(sub_str, start - start_counts.first)); + } + + stop = min(stop, char_count); // clamp stop to the characters counted so far + auto last_byte = stop_counts.second; + if (stop_counts.first < stop) { + auto const sub_str = string_view(d_str.data() + last_byte, d_str.size_bytes() - last_byte); + last_byte += std::get<0>(bytes_to_character_position(sub_str, stop - stop_counts.first)); + } + + d_output[str_idx] = (first_byte < last_byte) + ? string_index_pair{d_str.data() + first_byte, last_byte - first_byte} + : string_index_pair{"", 0}; + } +} + /** * @brief Function logic for the substring API.
* @@ -149,54 +233,67 @@ struct substring_fn { * * @tparam IndexIterator Iterator type for character position values * - * @param d_column Input strings column to substring + * @param input Input strings column to substring * @param starts Start positions index iterator * @param stops Stop positions index iterator * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory */ template -std::unique_ptr compute_substrings_from_fn(column_device_view const& d_column, +std::unique_ptr compute_substrings_from_fn(strings_column_view const& input, IndexIterator starts, IndexIterator stops, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto results = rmm::device_uvector(d_column.size(), stream); - thrust::transform(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(d_column.size()), - results.begin(), - substring_from_fn{d_column, starts, stops}); - return make_strings_column(results, string_view{nullptr, 0}, stream, mr); + auto results = rmm::device_uvector(input.size(), stream); + + auto const d_column = column_device_view::create(input.parent(), stream); + + if ((input.chars_size(stream) / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { + thrust::transform(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + results.begin(), + substring_from_fn{*d_column, starts, stops}); + } else { + constexpr thread_index_type block_size = 512; + auto const threads = + static_cast(input.size()) * cudf::detail::warp_size; + auto const num_blocks = util::div_rounding_up_safe(threads, block_size); + substring_from_kernel + <<>>(*d_column, starts, stops, results.data()); + } + return make_strings_column(results.begin(), results.end(), stream, mr); } } // namespace // -std::unique_ptr slice_strings(strings_column_view const& strings, +std::unique_ptr 
slice_strings(strings_column_view const& input, numeric_scalar const& start, numeric_scalar const& stop, numeric_scalar const& step, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - if (strings.is_empty()) return make_empty_column(type_id::STRING); + if (input.size() == input.null_count()) { + return std::make_unique(input.parent(), stream, mr); + } auto const step_valid = step.is_valid(stream); - auto const step_value = step_valid ? step.value(stream) : 0; + auto const step_value = step_valid ? step.value(stream) : 1; if (step_valid) { CUDF_EXPECTS(step_value != 0, "Step parameter must not be 0"); } - auto const d_column = column_device_view::create(strings.parent(), stream); - // optimization for (step==1 and start < stop) -- expect this to be most common - if (step_value == 1 and start.is_valid(stream) and stop.is_valid(stream)) { - auto const start_value = start.value(stream); - auto const stop_value = stop.value(stream); + if (step_value == 1) { + auto const start_value = start.is_valid(stream) ? start.value(stream) : 0; + auto const stop_value = + stop.is_valid(stream) ? 
stop.value(stream) : std::numeric_limits::max(); // note that any negative values here must use the alternate function below if ((start_value >= 0) && (start_value < stop_value)) { // this is about 2x faster on long strings for this common case - return compute_substrings_from_fn(*d_column, + return compute_substrings_from_fn(input, thrust::constant_iterator(start_value), thrust::constant_iterator(stop_value), stream, @@ -204,31 +301,35 @@ std::unique_ptr slice_strings(strings_column_view const& strings, } } + auto const d_column = column_device_view::create(input.parent(), stream); + auto const d_start = get_scalar_device_view(const_cast&>(start)); auto const d_stop = get_scalar_device_view(const_cast&>(stop)); auto const d_step = get_scalar_device_view(const_cast&>(step)); auto [offsets, chars] = make_strings_children( - substring_fn{*d_column, d_start, d_stop, d_step}, strings.size(), stream, mr); + substring_fn{*d_column, d_start, d_stop, d_step}, input.size(), stream, mr); - return make_strings_column(strings.size(), + return make_strings_column(input.size(), std::move(offsets), chars.release(), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } -std::unique_ptr slice_strings(strings_column_view const& strings, +std::unique_ptr slice_strings(strings_column_view const& input, column_view const& starts_column, column_view const& stops_column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(type_id::STRING); - CUDF_EXPECTS(starts_column.size() == strings_count, + if (input.size() == input.null_count()) { + return std::make_unique(input.parent(), stream, mr); + } + + CUDF_EXPECTS(starts_column.size() == input.size(), "Parameter starts must have the same number of rows as strings."); - CUDF_EXPECTS(stops_column.size() == strings_count, + 
CUDF_EXPECTS(stops_column.size() == input.size(), "Parameter stops must have the same number of rows as strings."); CUDF_EXPECTS(cudf::have_same_types(starts_column, stops_column), "Parameters starts and stops must be of the same type.", @@ -242,17 +343,16 @@ std::unique_ptr slice_strings(strings_column_view const& strings, "Positions values must be fixed width type.", cudf::data_type_error); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto starts_iter = cudf::detail::indexalator_factory::make_input_iterator(starts_column); - auto stops_iter = cudf::detail::indexalator_factory::make_input_iterator(stops_column); - return compute_substrings_from_fn(*strings_column, starts_iter, stops_iter, stream, mr); + auto starts_iter = cudf::detail::indexalator_factory::make_input_iterator(starts_column); + auto stops_iter = cudf::detail::indexalator_factory::make_input_iterator(stops_column); + return compute_substrings_from_fn(input, starts_iter, stops_iter, stream, mr); } } // namespace detail // external API -std::unique_ptr slice_strings(strings_column_view const& strings, +std::unique_ptr slice_strings(strings_column_view const& input, numeric_scalar const& start, numeric_scalar const& stop, numeric_scalar const& step, @@ -260,17 +360,17 @@ std::unique_ptr slice_strings(strings_column_view const& strings, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings(strings, start, stop, step, stream, mr); + return detail::slice_strings(input, start, stop, step, stream, mr); } -std::unique_ptr slice_strings(strings_column_view const& strings, +std::unique_ptr slice_strings(strings_column_view const& input, column_view const& starts_column, column_view const& stops_column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings(strings, starts_column, stops_column, stream, mr); + return detail::slice_strings(input, starts_column, stops_column, stream, 
mr); } } // namespace strings From 0cc059fb2b81adbdc9593052292838995dc78b10 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 5 Sep 2024 15:07:29 -0700 Subject: [PATCH 04/19] Upgrade to nvcomp 4.0.1 (#16076) This PR bumps nvcomp to 4.0.1. Depends on: - https://github.com/conda-forge/nvcomp-feedstock/pull/15 - https://github.com/rapidsai/rapids-cmake/pull/633 - https://github.com/rapidsai/kvikio/pull/449 Authors: - Vukasin Milovanovic (https://github.com/vuule) - Robert Maynard (https://github.com/robertmaynard) - Peixin (https://github.com/pxLi) - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/16076 --- ci/build_wheel_cudf.sh | 2 -- ci/build_wheel_pylibcudf.sh | 2 -- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/libcudf/conda_build_config.yaml | 2 +- dependencies.yaml | 2 +- java/pom.xml | 4 +--- java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java | 3 --- java/src/main/native/CMakeLists.txt | 5 ++--- python/libcudf/CMakeLists.txt | 3 +-- 10 files changed, 8 insertions(+), 19 deletions(-) diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index e5565c4b53c..fb93b06dbe2 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -23,8 +23,6 @@ export PIP_CONSTRAINT="/tmp/constraints.txt" python -m auditwheel repair \ --exclude libcudf.so \ --exclude libnvcomp.so \ - --exclude libnvcomp_bitcomp.so \ - --exclude libnvcomp_gdeflate.so \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index 0e4745bda28..5e9f7f8a0c4 100755 --- a/ci/build_wheel_pylibcudf.sh +++ b/ci/build_wheel_pylibcudf.sh @@ -21,8 +21,6 @@ export PIP_CONSTRAINT="/tmp/constraints.txt" python -m 
auditwheel repair \ --exclude libcudf.so \ --exclude libnvcomp.so \ - --exclude libnvcomp_bitcomp.so \ - --exclude libnvcomp_gdeflate.so \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 7f6967d7287..fa4c77d67b4 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -58,7 +58,7 @@ dependencies: - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 -- nvcomp==3.0.6 +- nvcomp==4.0.1 - nvtx>=0.2.1 - openpyxl - packaging diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index c1315e73f16..9b487347a5e 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -56,7 +56,7 @@ dependencies: - numba>=0.57 - numpy>=1.23,<3.0a0 - numpydoc -- nvcomp==3.0.6 +- nvcomp==4.0.1 - nvtx>=0.2.1 - openpyxl - packaging diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 4b1c4cca828..dae04c08aca 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -35,7 +35,7 @@ spdlog_version: - ">=1.12.0,<1.13" nvcomp_version: - - "=3.0.6" + - "=4.0.1" zlib_version: - ">=1.2.13" diff --git a/dependencies.yaml b/dependencies.yaml index f8b231efd6d..a3f0ffeec82 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -354,7 +354,7 @@ dependencies: - flatbuffers==24.3.25 - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - - nvcomp==3.0.6 + - nvcomp==4.0.1 - spdlog>=1.12.0,<1.13 rapids_build_skbuild: common: diff --git a/java/pom.xml b/java/pom.xml index 9694e741f16..e4f1cdf64e7 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -1,6 +1,6 @@