Skip to content

Commit

Permalink
Fix slowdown in CategoricalIndex.__repr__ (rapidsai#16665)
Browse files Browse the repository at this point in the history
Fixes: rapidsai#13297 

This PR fixes a slowdown when computing the repr of a `CategoricalIndex` that has a very large number of unique values. There was no better way to fix this using only public APIs, because all of the public APIs appear to perform categories validation even when `fastpath=True`.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: rapidsai#16665
  • Loading branch information
galipremsagar authored Aug 28, 2024
1 parent c600a65 commit 872e01e
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 1 deletion.
16 changes: 15 additions & 1 deletion python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1443,7 +1443,21 @@ def __repr__(self):
output[:break_idx].replace("'", "") + output[break_idx:]
)
else:
output = repr(preprocess.to_pandas())
# Too many unique categories will cause
# the output to take too long. In this case, we
# split the index into data and categories,
# generate the repr of each separately, and
# merge them.
pd_cats = pd.Categorical(
preprocess.astype(preprocess.categories.dtype).to_pandas()
)
pd_preprocess = pd.CategoricalIndex(pd_cats)
data_repr = repr(pd_preprocess).split("\n")
pd_preprocess.dtype._categories = (
preprocess.categories.to_pandas()
)
cats_repr = repr(pd_preprocess).split("\n")
output = "\n".join(data_repr[:-1] + cats_repr[-1:])

output = output.replace("nan", str(cudf.NA))
elif preprocess._values.nullable:
Expand Down
21 changes: 21 additions & 0 deletions python/cudf/cudf/testing/_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import itertools
import signal
import string
from collections import abc
from contextlib import contextmanager
Expand Down Expand Up @@ -368,3 +369,23 @@ def sv_to_udf_str_testing_lowering(context, builder, sig, args):
return cast_string_view_to_udf_string(
context, builder, sig.args[0], sig.return_type, args[0]
)


class cudf_timeout:
    """
    Context manager to raise a TimeoutError after a specified number of seconds.

    On entry a ``SIGALRM`` handler that raises ``TimeoutError`` is installed
    and an alarm is scheduled with ``signal.alarm``. On exit the pending
    alarm is cancelled and the handler that was active before entry is put
    back, so the context manager does not leave process-wide signal state
    modified behind it.

    Parameters
    ----------
    seconds
        Delay before the alarm fires. The value is converted with ``int()``,
        so fractional values are truncated.
    timeout_message : str, keyword-only
        Message attached to the raised ``TimeoutError``.
    """

    def __init__(self, seconds, *, timeout_message=""):
        self.seconds = int(seconds)
        self.timeout_message = timeout_message
        # Handler that was installed before ``__enter__``; restored on exit.
        self._previous_handler = None

    def _timeout_handler(self, signum, frame):
        """Signal handler: abort the guarded block with a TimeoutError."""
        raise TimeoutError(self.timeout_message)

    def __enter__(self):
        # ``signal.signal`` returns the handler it replaces; keep it so that
        # ``__exit__`` can undo the change.
        self._previous_handler = signal.signal(
            signal.SIGALRM, self._timeout_handler
        )
        signal.alarm(self.seconds)
        return self

    def __exit__(self, type, value, traceback):
        # Cancel any alarm that has not fired yet.
        signal.alarm(0)
        # ``signal.signal`` reports ``None`` when the previous handler was
        # not installed from Python; that value cannot be passed back in,
        # so only restore when there is something restorable.
        if self._previous_handler is not None:
            signal.signal(signal.SIGALRM, self._previous_handler)
            self._previous_handler = None
11 changes: 11 additions & 0 deletions python/cudf/cudf/tests/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1480,3 +1480,14 @@ def test_interval_index_repr():
gi = cudf.from_pandas(pi)

assert repr(pi) == repr(gi)


def test_large_unique_categories_repr():
    """Check that repr of a huge all-unique CategoricalIndex is fast.

    The cudf repr must match the pandas repr and must finish inside the
    2 second window enforced by ``utils.cudf_timeout``.
    """
    # NOTE: building the inputs makes this a long running test (reported as
    # about 1 minute) and there is no known way to reduce that time.
    n_categories = 100_000_000
    pi = pd.CategoricalIndex(range(n_categories))
    gi = cudf.CategoricalIndex(range(n_categories))
    expected_repr = repr(pi)

    # Only the cudf repr is timed; the pandas repr above is not.
    with utils.cudf_timeout(2, timeout_message="Failed to repr fast enough"):
        actual_repr = repr(gi)

    assert expected_repr == actual_repr

0 comments on commit 872e01e

Please sign in to comment.