Skip to content

Commit

Permalink
Prevent converting strings to arrow strings in dask_cudf pytests (rapidsai#14914)
Browse files Browse the repository at this point in the history

Dask natively converts all object types to arrow[string] types when a suitable pyarrow dependency is detected, and this conversion also happens inside the `assert_eq` API. Supporting this kind of conversion will require changes in both cudf and upstream dask. I am working on a solution in the 24.04 development cycle, but in the interest of shipping pandas-2.x support, I am confident in disabling this automatic conversion by setting the `dataframe.convert-string` dask config option to `False` where necessary.
  • Loading branch information
galipremsagar authored Jan 29, 2024
1 parent d8df8e4 commit 9fa9dc5
Show file tree
Hide file tree
Showing 7 changed files with 65 additions and 44 deletions.
8 changes: 4 additions & 4 deletions python/dask_cudf/dask_cudf/io/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,11 +226,11 @@ def test_read_csv_skiprows_error(csv_begin_bad_lines):

def test_read_csv_skipfooter(csv_end_bad_lines):
# Repro from Issue#13552
with dask.config.set({"dataframe.convert-string": False}):
ddf_cpu = dd.read_csv(csv_end_bad_lines, skipfooter=3).compute()
ddf_gpu = dask_cudf.read_csv(csv_end_bad_lines, skipfooter=3).compute()

ddf_cpu = dd.read_csv(csv_end_bad_lines, skipfooter=3).compute()
ddf_gpu = dask_cudf.read_csv(csv_end_bad_lines, skipfooter=3).compute()

dd.assert_eq(ddf_cpu, ddf_gpu, check_dtype=False)
dd.assert_eq(ddf_cpu, ddf_gpu, check_dtype=False)


def test_read_csv_skipfooter_error(csv_end_bad_lines):
Expand Down
6 changes: 4 additions & 2 deletions python/dask_cudf/dask_cudf/io/tests/test_json.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.

import os

Expand Down Expand Up @@ -80,7 +80,9 @@ def test_read_json_nested(tmp_path):
}
)
kwargs = dict(orient="records", lines=True)
with tmp_path / "data.json" as f:
with tmp_path / "data.json" as f, dask.config.set(
{"dataframe.convert-string": False}
):
df.to_json(f, **kwargs)
# Ensure engine='cudf' is tested.
actual = dask_cudf.read_json(f, engine="cudf", **kwargs)
Expand Down
2 changes: 1 addition & 1 deletion python/dask_cudf/dask_cudf/io/tests/test_parquet.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.

import glob
import math
Expand Down
36 changes: 19 additions & 17 deletions python/dask_cudf/dask_cudf/tests/test_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

import numpy as np
import pandas as pd
import dask
import pytest

from pandas.testing import assert_series_equal

from dask import dataframe as dd
Expand Down Expand Up @@ -137,30 +139,30 @@ def test_categorical_basic(data):
4 a
"""
assert all(x == y for x, y in zip(string.split(), expect_str.split()))
with dask.config.set({"dataframe.convert-string": False}):
df = DataFrame()
df["a"] = ["xyz", "abc", "def"] * 10

df = DataFrame()
df["a"] = ["xyz", "abc", "def"] * 10

pdf = df.to_pandas()
cddf = dgd.from_cudf(df, 1)
cddf["b"] = cddf["a"].astype("category")
pdf = df.to_pandas()
cddf = dgd.from_cudf(df, 1)
cddf["b"] = cddf["a"].astype("category")

ddf = dd.from_pandas(pdf, 1)
ddf["b"] = ddf["a"].astype("category")
ddf = dd.from_pandas(pdf, 1)
ddf["b"] = ddf["a"].astype("category")

assert_eq(ddf._meta_nonempty["b"], cddf._meta_nonempty["b"])
assert_eq(ddf._meta_nonempty["b"], cddf._meta_nonempty["b"])

with pytest.raises(NotImplementedError):
cddf["b"].cat.categories
with pytest.raises(NotImplementedError):
cddf["b"].cat.categories

with pytest.raises(NotImplementedError):
ddf["b"].cat.categories
with pytest.raises(NotImplementedError):
ddf["b"].cat.categories

cddf = cddf.categorize()
ddf = ddf.categorize()
cddf = cddf.categorize()
ddf = ddf.categorize()

assert_eq(ddf["b"].cat.categories, cddf["b"].cat.categories)
assert_eq(ddf["b"].cat.ordered, cddf["b"].cat.ordered)
assert_eq(ddf["b"].cat.categories, cddf["b"].cat.categories)
assert_eq(ddf["b"].cat.ordered, cddf["b"].cat.ordered)


@pytest.mark.parametrize("data", [data_cat_1()])
Expand Down
26 changes: 20 additions & 6 deletions python/dask_cudf/dask_cudf/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -782,14 +782,16 @@ def test_dataframe_set_index():
df["str"] = list("abcdefghijklmnopqrstuvwxyz")
pdf = df.to_pandas()

ddf = dgd.from_cudf(df, npartitions=4)
ddf = ddf.set_index("str")
with dask.config.set({"dataframe.convert-string": False}):
ddf = dgd.from_cudf(df, npartitions=4)
ddf = ddf.set_index("str")

pddf = dd.from_pandas(pdf, npartitions=4)
pddf = pddf.set_index("str")
from cudf.testing._utils import assert_eq
pddf = dd.from_pandas(pdf, npartitions=4)
pddf = pddf.set_index("str")

from cudf.testing._utils import assert_eq

assert_eq(ddf.compute(), pddf.compute())
assert_eq(ddf.compute(), pddf.compute())


def test_series_describe():
Expand Down Expand Up @@ -938,3 +940,15 @@ def test_categorical_dtype_round_trip():
actual = ds.compute()
expected = pds.compute()
assert actual.dtype.ordered == expected.dtype.ordered


def test_object_to_string_fail(request):
    """Compare sorted string series from dask_cudf and dask; expected to fail.

    The test is marked ``xfail`` at runtime through ``request.applymarker``;
    the marker's reason points at the tracking issue.
    """
    expected_failure = pytest.mark.xfail(
        reason="https://github.com/rapidsai/cudf/issues/14915",
    )
    request.applymarker(expected_failure)

    # Build the same 30-element string data on the GPU and on the host.
    gpu_series = cudf.Series(["a", "b", "c"] * 10)
    gpu_collection = dgd.from_cudf(gpu_series, npartitions=2)
    host_collection = dd.from_pandas(gpu_series.to_pandas(), npartitions=2)

    sorted_gpu = gpu_collection.sort_values()
    sorted_host = host_collection.sort_values()
    dd.assert_eq(sorted_gpu, sorted_host)
3 changes: 2 additions & 1 deletion python/dask_cudf/dask_cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -610,7 +610,8 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index):
if as_index:
# Groupby columns became the index.
# Sorting the index should not change anything.
dd.assert_eq(gf.index, gf.sort_index().index)
with dask.config.set({"dataframe.convert-string": False}):
dd.assert_eq(gf.index, gf.sort_index().index)
else:
# Groupby columns did NOT become the index.
# Sorting by these columns should not change anything.
Expand Down
28 changes: 15 additions & 13 deletions python/dask_cudf/dask_cudf/tests/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pandas as pd
import pytest

import dask
from dask import dataframe as dd

import cudf
Expand Down Expand Up @@ -69,16 +70,17 @@ def test_rowwise_reductions(data, op):
gddf = dgd.from_cudf(data, npartitions=10)
pddf = gddf.to_dask_dataframe()

if op in ("var", "std"):
expected = getattr(pddf, op)(axis=1, numeric_only=True, ddof=0)
got = getattr(gddf, op)(axis=1, numeric_only=True, ddof=0)
else:
expected = getattr(pddf, op)(numeric_only=True, axis=1)
got = getattr(pddf, op)(numeric_only=True, axis=1)

dd.assert_eq(
expected,
got,
check_exact=False,
check_dtype=op not in ("var", "std"),
)
with dask.config.set({"dataframe.convert-string": False}):
if op in ("var", "std"):
expected = getattr(pddf, op)(axis=1, numeric_only=True, ddof=0)
got = getattr(gddf, op)(axis=1, numeric_only=True, ddof=0)
else:
expected = getattr(pddf, op)(numeric_only=True, axis=1)
got = getattr(pddf, op)(numeric_only=True, axis=1)

dd.assert_eq(
expected,
got,
check_exact=False,
check_dtype=op not in ("var", "std"),
)

0 comments on commit 9fa9dc5

Please sign in to comment.