Enforce deprecations in 23.10 (rapidsai#13732)
This PR enforces deprecations made up to and including `23.08` by removing the deprecated code paths in `23.10`. Specifically, it removes support for the `strings_to_categorical` parameter in `read_parquet` (in both cudf and dask_cudf).
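For anyone migrating, here is a minimal sketch of the replacement pattern. The file path and column name are hypothetical, and `astype("category")` is the closest modern equivalent rather than a drop-in replacement: the removed option produced int32 hash-based codes (legacy GDF_CATEGORY), not a true categorical column.

    import cudf

    # Before (deprecated in 23.08, removed in 23.10):
    #   gdf = cudf.read_parquet("data.parquet", strings_to_categorical=True)

    # After: read string columns as-is, then convert explicitly if needed.
    gdf = cudf.read_parquet("data.parquet")
    gdf["b"] = gdf["b"].astype("category")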

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Bradley Dice (https://github.com/bdice)

URL: rapidsai#13732
galipremsagar authored Jul 24, 2023
1 parent 0edea00 commit 2a590db
Showing 7 changed files with 4 additions and 71 deletions.
5 changes: 0 additions & 5 deletions python/cudf/cudf/_lib/cpp/io/parquet.pxd
@@ -19,14 +19,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
cudf_io_types.source_info get_source_info() except +
vector[vector[size_type]] get_row_groups() except +
data_type get_timestamp_type() except +
-bool is_enabled_convert_strings_to_categories() except +
bool is_enabled_use_pandas_metadata() except +

# setter

void set_columns(vector[string] col_names) except +
void set_row_groups(vector[vector[size_type]] row_grp) except +
-void enable_convert_strings_to_categories(bool val) except +
void enable_use_pandas_metadata(bool val) except +
void set_timestamp_type(data_type type) except +

@@ -46,9 +44,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
parquet_reader_options_builder& row_groups(
vector[vector[size_type]] row_grp
) except +
-parquet_reader_options_builder& convert_strings_to_categories(
-bool val
-) except +
parquet_reader_options_builder& use_pandas_metadata(
bool val
) except +
3 changes: 0 additions & 3 deletions python/cudf/cudf/_lib/parquet.pyx
@@ -120,7 +120,6 @@ def _parse_metadata(meta):


cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
-strings_to_categorical=False,
use_pandas_metadata=True):
"""
Cython function to call into libcudf API, see `read_parquet`.
@@ -144,7 +143,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
cdef cudf_io_types.source_info source = make_source_info(
filepaths_or_buffers)

-cdef bool cpp_strings_to_categorical = strings_to_categorical
cdef bool cpp_use_pandas_metadata = use_pandas_metadata

cdef vector[vector[size_type]] cpp_row_groups
@@ -160,7 +158,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
args = move(
parquet_reader_options.builder(source)
.row_groups(cpp_row_groups)
-.convert_strings_to_categories(cpp_strings_to_categorical)
.use_pandas_metadata(cpp_use_pandas_metadata)
.timestamp_type(cpp_timestamp_type)
.build()
10 changes: 0 additions & 10 deletions python/cudf/cudf/io/parquet.py
@@ -437,7 +437,6 @@ def read_parquet(
storage_options=None,
filters=None,
row_groups=None,
-strings_to_categorical=False,
use_pandas_metadata=True,
use_python_file_object=True,
categorical_partitions=True,
@@ -449,12 +448,6 @@
):
"""{docstring}"""

-if strings_to_categorical is not False:
-warnings.warn(
-"`strings_to_categorical` is deprecated and will be removed in "
-"a future version of cudf.",
-FutureWarning,
-)
# Do not allow the user to set file-opening options
# when `use_python_file_object=False` is specified
if use_python_file_object is False:
@@ -578,7 +571,6 @@ def read_parquet(
*args,
columns=columns,
row_groups=row_groups,
-strings_to_categorical=strings_to_categorical,
use_pandas_metadata=use_pandas_metadata,
partition_keys=partition_keys,
partition_categories=partition_categories,
@@ -809,7 +801,6 @@ def _read_parquet(
engine,
columns=None,
row_groups=None,
-strings_to_categorical=None,
use_pandas_metadata=None,
*args,
**kwargs,
@@ -831,7 +822,6 @@
filepaths_or_buffers,
columns=columns,
row_groups=row_groups,
-strings_to_categorical=strings_to_categorical,
use_pandas_metadata=use_pandas_metadata,
)
else:
28 changes: 4 additions & 24 deletions python/cudf/cudf/tests/test_parquet.py
@@ -31,7 +31,6 @@
TIMEDELTA_TYPES,
assert_eq,
assert_exceptions_equal,
-expect_warning_if,
set_random_null_mask_inplace,
)

@@ -298,8 +297,7 @@ def test_parquet_reader_empty_pandas_dataframe(tmpdir, engine):


@pytest.mark.parametrize("has_null", [False, True])
@pytest.mark.parametrize("strings_to_categorical", [False, True, None])
def test_parquet_reader_strings(tmpdir, strings_to_categorical, has_null):
def test_parquet_reader_strings(tmpdir, has_null):
df = pd.DataFrame(
[(1, "aaa", 9.0), (2, "bbb", 8.0), (3, "ccc", 7.0)],
columns=pd.Index(list("abc")),
@@ -310,28 +308,10 @@ def test_parquet_reader_strings(tmpdir, strings_to_categorical, has_null):
df.to_parquet(fname)
assert os.path.exists(fname)

-if strings_to_categorical is not None:
-with expect_warning_if(strings_to_categorical is not False):
-gdf = cudf.read_parquet(
-fname,
-engine="cudf",
-strings_to_categorical=strings_to_categorical,
-)
-else:
-gdf = cudf.read_parquet(fname, engine="cudf")
+gdf = cudf.read_parquet(fname, engine="cudf")

-if strings_to_categorical:
-if has_null:
-hash_ref = [989983842, None, 1169108191]
-else:
-hash_ref = [989983842, 429364346, 1169108191]
-assert gdf["b"].dtype == np.dtype("int32")
-assert_eq(
-gdf["b"], cudf.Series(hash_ref, dtype=np.dtype("int32"), name="b")
-)
-else:
-assert gdf["b"].dtype == np.dtype("object")
-assert_eq(gdf["b"], df["b"])
+assert gdf["b"].dtype == np.dtype("object")
+assert_eq(gdf["b"], df["b"])


@pytest.mark.parametrize("columns", [None, ["b"]])
8 changes: 0 additions & 8 deletions python/cudf/cudf/utils/ioutils.py
@@ -166,14 +166,6 @@
If not None, specifies, for each input file, which row groups to read.
If reading multiple inputs, a list of lists should be passed, one list
for each input.
-strings_to_categorical : boolean, default False
-If True, return string columns as GDF_CATEGORY dtype; if False, return a
-as GDF_STRING dtype.
-.. deprecated:: 23.08
-This parameter is deprecated and will be removed in a future
-version of cudf.
categorical_partitions : boolean, default True
Whether directory-partitioned columns should be interpreted as categorical
or raw dtypes.
15 changes: 0 additions & 15 deletions python/dask_cudf/dask_cudf/io/parquet.py
@@ -50,15 +50,6 @@ def _create_dd_meta(cls, dataset_info, **kwargs):
kwargs.get("schema", None),
)

-# If `strings_to_categorical==True`, convert objects to int32
-strings_to_cats = kwargs.get("strings_to_categorical", False)
-for col in meta_cudf._data.names:
-if (
-isinstance(meta_cudf._data[col], cudf.core.column.StringColumn)
-and strings_to_cats
-):
-meta_cudf._data[col] = meta_cudf._data[col].astype("int32")

return meta_cudf

@classmethod
@@ -75,7 +66,6 @@ def _read_paths(
columns=None,
row_groups=None,
filters=None,
-strings_to_categorical=None,
partitions=None,
partitioning=None,
partition_keys=None,
@@ -124,7 +114,6 @@
engine="cudf",
columns=columns,
row_groups=row_groups if row_groups else None,
-strings_to_categorical=strings_to_categorical,
dataset_kwargs=dataset_kwargs,
categorical_partitions=False,
**kwargs,
@@ -142,7 +131,6 @@
row_groups=row_groups[i]
if row_groups
else None,
-strings_to_categorical=strings_to_categorical,
dataset_kwargs=dataset_kwargs,
categorical_partitions=False,
**kwargs,
@@ -245,7 +233,6 @@ def read_partition(
pieces = [pieces]

# Extract supported kwargs from `kwargs`
-strings_to_cats = kwargs.get("strings_to_categorical", False)
read_kwargs = kwargs.get("read", {})
read_kwargs.update(open_file_options or {})
check_file_size = read_kwargs.pop("check_file_size", None)
@@ -291,7 +278,6 @@
columns=read_columns,
row_groups=rgs if rgs else None,
filters=filters,
-strings_to_categorical=strings_to_cats,
partitions=partitions,
partitioning=partitioning,
partition_keys=last_partition_keys,
@@ -318,7 +304,6 @@
columns=read_columns,
row_groups=rgs if rgs else None,
filters=filters,
-strings_to_categorical=strings_to_cats,
partitions=partitions,
partitioning=partitioning,
partition_keys=last_partition_keys,
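The same keyword disappears from the dask_cudf reader. A minimal sketch of the equivalent post-read conversion, again with a hypothetical path and column name; the `astype` call is lazy and runs per partition under dask's usual semantics:

    import dask_cudf

    # Before (removed in 23.10):
    #   ddf = dask_cudf.read_parquet("data.parquet", strings_to_categorical=True)

    # After: read strings as-is and convert explicitly.
    ddf = dask_cudf.read_parquet("data.parquet")
    ddf["b"] = ddf["b"].astype("category")  # lazy; evaluated at compute()
    gdf = ddf.compute()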
6 changes: 0 additions & 6 deletions python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -159,12 +159,6 @@ def test_strings(tmpdir):
read_df = dask_cudf.read_parquet(fn, index=["a"])
dd.assert_eq(ddf2, read_df.compute().to_pandas())

-read_df_cats = dask_cudf.read_parquet(
-fn, index=["a"], strings_to_categorical=True
-)
-dd.assert_eq(read_df_cats.dtypes, read_df_cats.compute().dtypes)
-dd.assert_eq(read_df_cats.dtypes[0], "int32")


def test_dask_timeseries_from_pandas(tmpdir):

