diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 7b561906afb..d60c206ac24 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -897,6 +897,16 @@ def from_arrow(cls, data: pa.Table) -> Self:
         # so handling indices and dictionary as two different columns.
         # This needs be removed once we have hooked libcudf dictionary32
         # with categorical.
+        if any(
+            isinstance(x.type, pa.DictionaryType)
+            and isinstance(x, pa.ChunkedArray)
+            for x in data
+        ):
+            # Merge every column down to a single chunk so that a
+            # dictionary column carries one dictionary rather than a
+            # separate (possibly overlapping) dictionary per chunk.
+            data = data.combine_chunks()
+
         dict_indices = {}
         dict_dictionaries = {}
         dict_ordered = {}
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 8b18e53d320..d76d5eb8065 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -1984,6 +1984,20 @@ def test_from_arrow(nelem, data_type):
     np.testing.assert_array_equal(s.to_pandas(), gs.to_numpy())
 
 
+def test_from_arrow_chunked_categories():
+    # Verify that categories are properly deduplicated across chunked arrays.
+    indices = pa.array([0, 1, 0, 1, 2, 0, None, 2])
+    dictionary = pa.array(["foo", "bar", "baz"])
+    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
+    chunked_array = pa.chunked_array([dict_array, dict_array])
+    table = pa.table({"a": chunked_array})
+    df = cudf.DataFrame.from_arrow(table)
+    final_dictionary = df["a"].dtype.categories.to_arrow().to_pylist()
+    assert sorted(final_dictionary) == sorted(dictionary.to_pylist())
+    # The rows of both chunks (nulls included) must round-trip in order.
+    assert df["a"].to_arrow().to_pylist() == dict_array.to_pylist() * 2
+
+
 @pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000])
 @pytest.mark.parametrize("data_type", dtypes)
 def test_to_arrow(nelem, data_type):