Skip to content

Commit

Permalink
Remove index name overrides in dask-cudf pyarrow table dispatch (#15514)
Browse files Browse the repository at this point in the history
Looks like these overrides should be safe to remove now that #14159 is closed out.

This should unblock the GPU CI failures we're seeing on Dask with 24.06 in dask/dask#11045.

Authors:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #15514
  • Loading branch information
charlesbluca authored Apr 22, 2024
1 parent 13af975 commit 475f5e5
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 39 deletions.
46 changes: 30 additions & 16 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5450,9 +5450,11 @@ def from_arrow(cls, table):
"""
index_col = None
col_index_names = None
physical_column_md = []
if isinstance(table, pa.Table) and isinstance(
table.schema.pandas_metadata, dict
):
physical_column_md = table.schema.pandas_metadata["columns"]
index_col = table.schema.pandas_metadata["index_columns"]
if "column_indexes" in table.schema.pandas_metadata:
col_index_names = []
Expand Down Expand Up @@ -5480,7 +5482,18 @@ def from_arrow(cls, table):
# https://github.com/apache/arrow/issues/15178
out = out.set_index(idx)
else:
out = out.set_index(index_col[0])
out = out.set_index(index_col)

if (
"__index_level_0__" in out.index.names
and len(out.index.names) == 1
):
real_index_name = None
for md in physical_column_md:
if md["field_name"] == "__index_level_0__":
real_index_name = md["name"]
break
out.index.name = real_index_name

return out

Expand Down Expand Up @@ -5530,42 +5543,43 @@ def to_arrow(self, preserve_index=None):
write_index = preserve_index is not False
keep_range_index = write_index and preserve_index is None
index = self.index
index_levels = [self.index]
if write_index:
if isinstance(index, cudf.RangeIndex) and keep_range_index:
descr = {
"kind": "range",
"name": index.name,
"start": index._start,
"stop": index._stop,
"step": 1,
}
index_descr = [
{
"kind": "range",
"name": index.name,
"start": index._start,
"stop": index._stop,
"step": 1,
}
]
else:
if isinstance(index, cudf.RangeIndex):
index = index._as_int_index()
index.name = "__index_level_0__"
if isinstance(index, MultiIndex):
gen_names = tuple(
f"level_{i}" for i, _ in enumerate(index._data.names)
)
index_descr = list(index._data.names)
index_levels = index.levels
else:
gen_names = (
index_descr = (
index.names if index.name is not None else ("index",)
)
for gen_name, col_name in zip(gen_names, index._data.names):
for gen_name, col_name in zip(index_descr, index._data.names):
data._insert(
data.shape[1],
gen_name,
index._data[col_name],
)
descr = gen_names[0]
index_descr.append(descr)

out = super(DataFrame, data).to_arrow()
# import pdb; pdb.set_trace()
metadata = pa.pandas_compat.construct_metadata(
columns_to_convert=[self[col] for col in self._data.names],
df=self,
column_names=out.schema.names,
index_levels=[index],
index_levels=index_levels,
index_descriptors=index_descr,
preserve_index=preserve_index,
types=out.schema.types,
Expand Down
22 changes: 22 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2769,6 +2769,28 @@ def test_arrow_pandas_compat(pdf, gdf, preserve_index):
assert_eq(pdf2, gdf2)


@pytest.mark.parametrize(
"index", [None, cudf.RangeIndex(3, name="a"), "a", "b", ["a", "b"]]
)
@pytest.mark.parametrize("preserve_index", [True, False, None])
def test_arrow_round_trip(preserve_index, index):
data = {"a": [4, 5, 6], "b": ["cat", "dog", "bird"]}
if isinstance(index, (list, str)):
gdf = cudf.DataFrame(data).set_index(index)
else:
gdf = cudf.DataFrame(data, index=index)

table = gdf.to_arrow(preserve_index=preserve_index)
table_pd = pa.Table.from_pandas(
gdf.to_pandas(), preserve_index=preserve_index
)

gdf_out = cudf.DataFrame.from_arrow(table)
pdf_out = table_pd.to_pandas()

assert_eq(gdf_out, pdf_out)


@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"])
def test_cuda_array_interface(dtype):
np_data = np.arange(10).astype(dtype)
Expand Down
22 changes: 1 addition & 21 deletions python/dask_cudf/dask_cudf/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,18 +384,6 @@ def _cudf_to_table(obj, preserve_index=None, **kwargs):
"Ignoring the following arguments to "
f"`to_pyarrow_table_dispatch`: {list(kwargs)}"
)

# TODO: Remove this logic when cudf#14159 is resolved
# (see: https://github.com/rapidsai/cudf/issues/14159)
if preserve_index and isinstance(obj.index, cudf.RangeIndex):
obj = obj.copy()
obj.index.name = (
obj.index.name
if obj.index.name is not None
else "__index_level_0__"
)
obj.index = obj.index._as_int_index()

return obj.to_arrow(preserve_index=preserve_index)


Expand All @@ -408,15 +396,7 @@ def _table_to_cudf(obj, table, self_destruct=None, **kwargs):
f"Ignoring the following arguments to "
f"`from_pyarrow_table_dispatch`: {list(kwargs)}"
)
result = obj.from_arrow(table)

# TODO: Remove this logic when cudf#14159 is resolved
# (see: https://github.com/rapidsai/cudf/issues/14159)
if "__index_level_0__" in result.index.names:
assert len(result.index.names) == 1
result.index.name = None

return result
return obj.from_arrow(table)


@union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex))
Expand Down
11 changes: 9 additions & 2 deletions python/dask_cudf/dask_cudf/tests/test_dispatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,24 @@ def test_is_categorical_dispatch():


@pytest.mark.parametrize("preserve_index", [True, False])
def test_pyarrow_conversion_dispatch(preserve_index):
@pytest.mark.parametrize("index", [None, cudf.RangeIndex(10, name="foo")])
def test_pyarrow_conversion_dispatch(preserve_index, index):
from dask.dataframe.dispatch import (
from_pyarrow_table_dispatch,
to_pyarrow_table_dispatch,
)

df1 = cudf.DataFrame(np.random.randn(10, 3), columns=list("abc"))
df1 = cudf.DataFrame(
np.random.randn(10, 3), columns=list("abc"), index=index
)
df2 = from_pyarrow_table_dispatch(
df1, to_pyarrow_table_dispatch(df1, preserve_index=preserve_index)
)

# preserve_index=False doesn't retain index metadata
if not preserve_index and index is not None:
df1.index.name = None

assert type(df1) == type(df2)
assert_eq(df1, df2)

Expand Down

0 comments on commit 475f5e5

Please sign in to comment.