Prevent converting strings to arrow strings in dask_cudf pytests (rapidsai#14914)

Dask natively converts all object dtypes to arrow[string] types when a suitable pyarrow dependency is detected. This conversion happens inside the `assert_eq` API. Supporting this kind of conversion will require changes in both cudf and upstream dask. I am working on a solution for the 24.04 development cycle, but in the interest of shipping pandas-2.x support, I am confident that it is safe to disable this auto-conversion for now by setting the `dataframe.convert-string` dask config option to `False` in the tests where necessary.
singhmanas1 · Jan 29, 2024 · 9fa9dc5 · 9fa9dc5
1 parent d8df8e4
commit 9fa9dc5
Show file tree

Hide file tree

Showing 7 changed files with 65 additions and 44 deletions.
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py
@@ -226,11 +226,11 @@ def test_read_csv_skiprows_error(csv_begin_bad_lines):
 
 def test_read_csv_skipfooter(csv_end_bad_lines):
     # Repro from Issue#13552
+    with dask.config.set({"dataframe.convert-string": False}):
+        ddf_cpu = dd.read_csv(csv_end_bad_lines, skipfooter=3).compute()
+        ddf_gpu = dask_cudf.read_csv(csv_end_bad_lines, skipfooter=3).compute()
 
-    ddf_cpu = dd.read_csv(csv_end_bad_lines, skipfooter=3).compute()
-    ddf_gpu = dask_cudf.read_csv(csv_end_bad_lines, skipfooter=3).compute()
-
-    dd.assert_eq(ddf_cpu, ddf_gpu, check_dtype=False)
+        dd.assert_eq(ddf_cpu, ddf_gpu, check_dtype=False)
 
 
 def test_read_csv_skipfooter_error(csv_end_bad_lines):

diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import os
 
@@ -80,7 +80,9 @@ def test_read_json_nested(tmp_path):
         }
     )
     kwargs = dict(orient="records", lines=True)
-    with tmp_path / "data.json" as f:
+    with tmp_path / "data.json" as f, dask.config.set(
+        {"dataframe.convert-string": False}
+    ):
         df.to_json(f, **kwargs)
         # Ensure engine='cudf' is tested.
         actual = dask_cudf.read_json(f, engine="cudf", **kwargs)

diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import glob
 import math

diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py
@@ -2,7 +2,9 @@
 
 import numpy as np
 import pandas as pd
+import dask
 import pytest
+
 from pandas.testing import assert_series_equal
 
 from dask import dataframe as dd
@@ -137,30 +139,30 @@ def test_categorical_basic(data):
 4 a
 """
     assert all(x == y for x, y in zip(string.split(), expect_str.split()))
+    with dask.config.set({"dataframe.convert-string": False}):
+        df = DataFrame()
+        df["a"] = ["xyz", "abc", "def"] * 10
 
-    df = DataFrame()
-    df["a"] = ["xyz", "abc", "def"] * 10
-
-    pdf = df.to_pandas()
-    cddf = dgd.from_cudf(df, 1)
-    cddf["b"] = cddf["a"].astype("category")
+        pdf = df.to_pandas()
+        cddf = dgd.from_cudf(df, 1)
+        cddf["b"] = cddf["a"].astype("category")
 
-    ddf = dd.from_pandas(pdf, 1)
-    ddf["b"] = ddf["a"].astype("category")
+        ddf = dd.from_pandas(pdf, 1)
+        ddf["b"] = ddf["a"].astype("category")
 
-    assert_eq(ddf._meta_nonempty["b"], cddf._meta_nonempty["b"])
+        assert_eq(ddf._meta_nonempty["b"], cddf._meta_nonempty["b"])
 
-    with pytest.raises(NotImplementedError):
-        cddf["b"].cat.categories
+        with pytest.raises(NotImplementedError):
+            cddf["b"].cat.categories
 
-    with pytest.raises(NotImplementedError):
-        ddf["b"].cat.categories
+        with pytest.raises(NotImplementedError):
+            ddf["b"].cat.categories
 
-    cddf = cddf.categorize()
-    ddf = ddf.categorize()
+        cddf = cddf.categorize()
+        ddf = ddf.categorize()
 
-    assert_eq(ddf["b"].cat.categories, cddf["b"].cat.categories)
-    assert_eq(ddf["b"].cat.ordered, cddf["b"].cat.ordered)
+        assert_eq(ddf["b"].cat.categories, cddf["b"].cat.categories)
+        assert_eq(ddf["b"].cat.ordered, cddf["b"].cat.ordered)
 
 
 @pytest.mark.parametrize("data", [data_cat_1()])

diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -782,14 +782,16 @@ def test_dataframe_set_index():
     df["str"] = list("abcdefghijklmnopqrstuvwxyz")
     pdf = df.to_pandas()
 
-    ddf = dgd.from_cudf(df, npartitions=4)
-    ddf = ddf.set_index("str")
+    with dask.config.set({"dataframe.convert-string": False}):
+        ddf = dgd.from_cudf(df, npartitions=4)
+        ddf = ddf.set_index("str")
 
-    pddf = dd.from_pandas(pdf, npartitions=4)
-    pddf = pddf.set_index("str")
-    from cudf.testing._utils import assert_eq
+        pddf = dd.from_pandas(pdf, npartitions=4)
+        pddf = pddf.set_index("str")
+
+        from cudf.testing._utils import assert_eq
 
-    assert_eq(ddf.compute(), pddf.compute())
+        assert_eq(ddf.compute(), pddf.compute())
 
 
 def test_series_describe():
@@ -938,3 +940,15 @@ def test_categorical_dtype_round_trip():
     actual = ds.compute()
     expected = pds.compute()
     assert actual.dtype.ordered == expected.dtype.ordered
+
+
+def test_object_to_string_fail(request):
+    request.applymarker(
+        pytest.mark.xfail(
+            reason="https://github.com/rapidsai/cudf/issues/14915",
+        )
+    )
+    s = cudf.Series(["a", "b", "c"] * 10)
+    ds = dgd.from_cudf(s, npartitions=2)
+    pds = dd.from_pandas(s.to_pandas(), npartitions=2)
+    dd.assert_eq(ds.sort_values(), pds.sort_values())
diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -610,7 +610,8 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index):
         if as_index:
             # Groupby columns became the index.
             # Sorting the index should not change anything.
-            dd.assert_eq(gf.index, gf.sort_index().index)
+            with dask.config.set({"dataframe.convert-string": False}):
+                dd.assert_eq(gf.index, gf.sort_index().index)
         else:
             # Groupby columns are did NOT become the index.
             # Sorting by these columns should not change anything.

diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py
@@ -4,6 +4,7 @@
 import pandas as pd
 import pytest
 
+import dask
 from dask import dataframe as dd
 
 import cudf
@@ -69,16 +70,17 @@ def test_rowwise_reductions(data, op):
     gddf = dgd.from_cudf(data, npartitions=10)
     pddf = gddf.to_dask_dataframe()
 
-    if op in ("var", "std"):
-        expected = getattr(pddf, op)(axis=1, numeric_only=True, ddof=0)
-        got = getattr(gddf, op)(axis=1, numeric_only=True, ddof=0)
-    else:
-        expected = getattr(pddf, op)(numeric_only=True, axis=1)
-        got = getattr(pddf, op)(numeric_only=True, axis=1)
-
-    dd.assert_eq(
-        expected,
-        got,
-        check_exact=False,
-        check_dtype=op not in ("var", "std"),
-    )
+    with dask.config.set({"dataframe.convert-string": False}):
+        if op in ("var", "std"):
+            expected = getattr(pddf, op)(axis=1, numeric_only=True, ddof=0)
+            got = getattr(gddf, op)(axis=1, numeric_only=True, ddof=0)
+        else:
+            expected = getattr(pddf, op)(numeric_only=True, axis=1)
+            got = getattr(pddf, op)(numeric_only=True, axis=1)
+
+        dd.assert_eq(
+            expected,
+            got,
+            check_exact=False,
+            check_dtype=op not in ("var", "std"),
+        )