Skip to content

Commit

Permalink
Fix categorical conversion from chunked arrow arrays (#15886)
Browse files Browse the repository at this point in the history
The current logic for converting arrow dictionary arrays to cudf doesn't properly uniquify categories across chunks of chunked arrays. This PR implements the simplest fix by having arrow combine chunks when this case is encountered.

Resolves #6828

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #15886
  • Loading branch information
vyasr authored May 30, 2024
1 parent 579a167 commit bab0d80
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 0 deletions.
7 changes: 7 additions & 0 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -897,6 +897,13 @@ def from_arrow(cls, data: pa.Table) -> Self:
# so handling indices and dictionary as two different columns.
# This needs to be removed once we have hooked libcudf dictionary32
# with categorical.
if any(
isinstance(x.type, pa.DictionaryType)
and isinstance(x, pa.ChunkedArray)
for x in data
):
data = data.combine_chunks()

dict_indices = {}
dict_dictionaries = {}
dict_ordered = {}
Expand Down
12 changes: 12 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1984,6 +1984,18 @@ def test_from_arrow(nelem, data_type):
np.testing.assert_array_equal(s.to_pandas(), gs.to_numpy())


def test_from_arrow_chunked_categories():
    """Check that categories are deduplicated across arrow chunks."""
    categories = pa.array(["foo", "bar", "baz"])
    codes = pa.array([0, 1, 0, 1, 2, 0, None, 2])
    chunk = pa.DictionaryArray.from_arrays(codes, categories)

    # The same dictionary-encoded chunk appears twice in the column, so the
    # resulting categorical dtype should still list each category only once.
    arrow_table = pa.table({"a": pa.chunked_array([chunk, chunk])})
    result = cudf.DataFrame.from_arrow(arrow_table)

    observed = result["a"].dtype.categories.to_arrow().to_pylist()
    expected = categories.to_pylist()
    assert sorted(observed) == sorted(expected)


@pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000])
@pytest.mark.parametrize("data_type", dtypes)
def test_to_arrow(nelem, data_type):
Expand Down

0 comments on commit bab0d80

Please sign in to comment.