From aef1b46098bfab74473e2cb7c2f569c67e11628b Mon Sep 17 00:00:00 2001 From: rjzamora Date: Tue, 10 Sep 2024 13:24:53 -0700 Subject: [PATCH] fix cov/corr bug in dask-cudf --- python/dask_cudf/dask_cudf/expr/_collection.py | 18 +++++++++++++++++- python/dask_cudf/dask_cudf/tests/test_core.py | 17 +++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index f60e4ff81ef..97e1dffc65b 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -49,8 +49,24 @@ def to_dask_dataframe(self, **kwargs): return self.to_backend("pandas", **kwargs) + def _prepare_cov_corr(self, min_periods, numeric_only): + # Upstream version of this method sets min_periods + # to 2 by default (which is not supported by cudf) + # TODO: Remove when cudf supports both min_periods + # and numeric_only + # See: https://github.com/rapidsai/cudf/issues/12626 + # See: https://github.com/rapidsai/cudf/issues/9009 + self._meta.cov(min_periods=min_periods) + + frame = self + if numeric_only: + numerics = self._meta._get_numeric_data() + if len(numerics.columns) != len(self.columns): + frame = frame[list(numerics.columns)] + return frame, min_periods + # var can be removed if cudf#15179 is addressed. - # See: https://github.com/rapidsai/cudf/issues/15179 + # See: https://github.com/rapidsai/cudf/issues/14935 def var( self, axis=0, diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 905d8c08135..7aa0f6320f2 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1007,3 +1007,20 @@ def test_to_backend_simplify(): df2 = df.to_backend("cudf")[["y"]].simplify() df3 = df[["y"]].to_backend("cudf").to_backend("cudf").simplify() assert df2._name == df3._name + + +@pytest.mark.parametrize("numeric_only", [True, False]) +@pytest.mark.parametrize("op", ["corr", "cov"]) +def test_cov_corr(op, numeric_only): + df = cudf.DataFrame.from_dict( + { + "x": np.random.randint(0, 5, size=10), + "y": np.random.normal(size=10), + } + ) + ddf = dd.from_pandas(df, npartitions=2) + res = getattr(ddf, op)(numeric_only=numeric_only) + # Use to_pandas until cudf supports numeric_only + # (See: https://github.com/rapidsai/cudf/issues/12626) + expect = getattr(df.to_pandas(), op)(numeric_only=numeric_only) + dd.assert_eq(res, expect)