From 7660af03010e73905e55680a7751d0225925123b Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Tue, 23 May 2023 17:27:08 -0400
Subject: [PATCH] Fix tokenize with non-space delimiter (#13403)

Closes https://github.com/rapidsai/cudf/issues/13399

Authors:
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/13403
---
 python/cudf/cudf/core/column/string.py |  5 ++++-
 python/cudf/cudf/tests/test_text.py    | 29 ++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index a3163f1cebe..9319881669f 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4609,7 +4609,7 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
         )
         if isinstance(self._parent, cudf.Series):
             result.index = self._parent.index.repeat(  # type: ignore
-                self.token_count()
+                self.token_count(delimiter=delimiter)
             )
         return result
 
@@ -5296,6 +5296,9 @@ def minhash(
 
 
 def _massage_string_arg(value, name, allow_col=False):
+    if isinstance(value, cudf.Scalar):
+        return value
+
     if isinstance(value, str):
         return cudf.Scalar(value, dtype="str")
 
diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
index 899248513de..f0e0e52142f 100644
--- a/python/cudf/cudf/tests/test_text.py
+++ b/python/cudf/cudf/tests/test_text.py
@@ -52,6 +52,35 @@ def test_tokenize():
     assert_eq(expected, actual)
 
 
+def test_tokenize_delimiter():
+    strings = cudf.Series(
+        [
+            "the quick fox jumped over the lazy dog",
+            "the siamésé cat jumped under the sofa",
+            None,
+            "",
+        ]
+    )
+
+    expected_values = cudf.Series(
+        [
+            "the quick f",
+            "x jumped ",
+            "ver the lazy d",
+            "g",
+            "the siamésé cat jumped under the s",
+            "fa",
+        ]
+    )
+    expected_index = strings.index.repeat(strings.str.token_count("o"))
+    expected = cudf.Series(expected_values, index=expected_index)
+
+    actual = strings.str.tokenize(delimiter="o")
+
+    assert type(expected) == type(actual)
+    assert_eq(expected, actual)
+
+
 def test_detokenize():
     strings = cudf.Series(
         [