From 2a590dbb6a06eb59bdfa97976dd5b22635b6c1f9 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Mon, 24 Jul 2023 15:43:45 -0500
Subject: [PATCH] Enforce deprecations in `23.10` (#13732)

This PR enforces, in `23.10`, deprecations introduced up to `23.08`.
Specifically, it removes support for the `strings_to_categorical`
parameter in `read_parquet`.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/13732
---
 python/cudf/cudf/_lib/cpp/io/parquet.pxd   |  5 ----
 python/cudf/cudf/_lib/parquet.pyx          |  3 --
 python/cudf/cudf/io/parquet.py             | 10 -------
 python/cudf/cudf/tests/test_parquet.py     | 28 +++----------------
 python/cudf/cudf/utils/ioutils.py          |  8 ------
 python/dask_cudf/dask_cudf/io/parquet.py   | 15 ----------
 .../dask_cudf/io/tests/test_parquet.py     |  6 ----
 7 files changed, 4 insertions(+), 71 deletions(-)

diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
index e2570eaa7d9..f6fa04b9c29 100644
--- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
@@ -19,14 +19,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         cudf_io_types.source_info get_source_info() except +
         vector[vector[size_type]] get_row_groups() except +
         data_type get_timestamp_type() except +
-        bool is_enabled_convert_strings_to_categories() except +
         bool is_enabled_use_pandas_metadata() except +

         # setter

         void set_columns(vector[string] col_names) except +
         void set_row_groups(vector[vector[size_type]] row_grp) except +
-        void enable_convert_strings_to_categories(bool val) except +
         void enable_use_pandas_metadata(bool val) except +
         void set_timestamp_type(data_type type) except +

@@ -46,9 +44,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         parquet_reader_options_builder& row_groups(
             vector[vector[size_type]] row_grp
         ) except +
-        parquet_reader_options_builder& convert_strings_to_categories(
-            bool val
-        ) except +
         parquet_reader_options_builder& use_pandas_metadata(
             bool val
         ) except +
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 2c7f5df084b..7c861203d6c 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -120,7 +120,6 @@ def _parse_metadata(meta):


 cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
-                   strings_to_categorical=False,
                    use_pandas_metadata=True):
     """
     Cython function to call into libcudf API, see `read_parquet`.
@@ -144,7 +143,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
     cdef cudf_io_types.source_info source = make_source_info(
         filepaths_or_buffers)

-    cdef bool cpp_strings_to_categorical = strings_to_categorical
     cdef bool cpp_use_pandas_metadata = use_pandas_metadata

     cdef vector[vector[size_type]] cpp_row_groups
@@ -160,7 +158,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
     args = move(
         parquet_reader_options.builder(source)
         .row_groups(cpp_row_groups)
-        .convert_strings_to_categories(cpp_strings_to_categorical)
         .use_pandas_metadata(cpp_use_pandas_metadata)
         .timestamp_type(cpp_timestamp_type)
         .build()
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 0dec8e1c67f..d8510cf8e95 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -437,7 +437,6 @@ def read_parquet(
     storage_options=None,
     filters=None,
     row_groups=None,
-    strings_to_categorical=False,
     use_pandas_metadata=True,
     use_python_file_object=True,
     categorical_partitions=True,
@@ -449,12 +448,6 @@ def read_parquet(
 ):
     """{docstring}"""

-    if strings_to_categorical is not False:
-        warnings.warn(
-            "`strings_to_categorical` is deprecated and will be removed in "
-            "a future version of cudf.",
-            FutureWarning,
-        )
     # Do not allow the user to set file-opening options
     # when `use_python_file_object=False` is specified
     if use_python_file_object is False:
@@ -578,7 +571,6 @@ def read_parquet(
         *args,
         columns=columns,
         row_groups=row_groups,
-        strings_to_categorical=strings_to_categorical,
         use_pandas_metadata=use_pandas_metadata,
         partition_keys=partition_keys,
         partition_categories=partition_categories,
@@ -809,7 +801,6 @@ def _read_parquet(
     engine,
     columns=None,
     row_groups=None,
-    strings_to_categorical=None,
     use_pandas_metadata=None,
     *args,
     **kwargs,
@@ -831,7 +822,6 @@ def _read_parquet(
             filepaths_or_buffers,
             columns=columns,
             row_groups=row_groups,
-            strings_to_categorical=strings_to_categorical,
             use_pandas_metadata=use_pandas_metadata,
         )
     else:
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index cdece1397c3..f403c522f58 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -31,7 +31,6 @@
     TIMEDELTA_TYPES,
     assert_eq,
     assert_exceptions_equal,
-    expect_warning_if,
     set_random_null_mask_inplace,
 )

@@ -298,8 +297,7 @@ def test_parquet_reader_empty_pandas_dataframe(tmpdir, engine):


 @pytest.mark.parametrize("has_null", [False, True])
-@pytest.mark.parametrize("strings_to_categorical", [False, True, None])
-def test_parquet_reader_strings(tmpdir, strings_to_categorical, has_null):
+def test_parquet_reader_strings(tmpdir, has_null):
     df = pd.DataFrame(
         [(1, "aaa", 9.0), (2, "bbb", 8.0), (3, "ccc", 7.0)],
         columns=pd.Index(list("abc")),
@@ -310,28 +308,10 @@ def test_parquet_reader_strings(tmpdir, strings_to_categorical, has_null):
     df.to_parquet(fname)
     assert os.path.exists(fname)

-    if strings_to_categorical is not None:
-        with expect_warning_if(strings_to_categorical is not False):
-            gdf = cudf.read_parquet(
-                fname,
-                engine="cudf",
-                strings_to_categorical=strings_to_categorical,
-            )
-    else:
-        gdf = cudf.read_parquet(fname, engine="cudf")
+    gdf = cudf.read_parquet(fname, engine="cudf")

-    if strings_to_categorical:
-        if has_null:
-            hash_ref = [989983842, None, 1169108191]
-        else:
-            hash_ref = [989983842, 429364346, 1169108191]
-        assert gdf["b"].dtype == np.dtype("int32")
-        assert_eq(
-            gdf["b"], cudf.Series(hash_ref, dtype=np.dtype("int32"), name="b")
-        )
-    else:
-        assert gdf["b"].dtype == np.dtype("object")
-        assert_eq(gdf["b"], df["b"])
+    assert gdf["b"].dtype == np.dtype("object")
+    assert_eq(gdf["b"], df["b"])


 @pytest.mark.parametrize("columns", [None, ["b"]])
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index fb8492bbf4f..91925bf3c0c 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -166,14 +166,6 @@
     If not None, specifies, for each input file, which row groups to read.
     If reading multiple inputs, a list of lists should be passed, one list
     for each input.
-strings_to_categorical : boolean, default False
-    If True, return string columns as GDF_CATEGORY dtype; if False, return a
-    as GDF_STRING dtype.
-
-    .. deprecated:: 23.08
-
-        This parameter is deprecated and will be removed in a future
-        version of cudf.
 categorical_partitions : boolean, default True
     Whether directory-partitioned columns should be interpreted as categorical
     or raw dtypes.
diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index 65d9fee3a8a..dd8c3394a2c 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -50,15 +50,6 @@ def _create_dd_meta(cls, dataset_info, **kwargs):
             kwargs.get("schema", None),
         )

-        # If `strings_to_categorical==True`, convert objects to int32
-        strings_to_cats = kwargs.get("strings_to_categorical", False)
-        for col in meta_cudf._data.names:
-            if (
-                isinstance(meta_cudf._data[col], cudf.core.column.StringColumn)
-                and strings_to_cats
-            ):
-                meta_cudf._data[col] = meta_cudf._data[col].astype("int32")
-
         return meta_cudf

     @classmethod
@@ -75,7 +66,6 @@ def _read_paths(
         columns=None,
         row_groups=None,
         filters=None,
-        strings_to_categorical=None,
         partitions=None,
         partitioning=None,
         partition_keys=None,
@@ -124,7 +114,6 @@ def _read_paths(
                 engine="cudf",
                 columns=columns,
                 row_groups=row_groups if row_groups else None,
-                strings_to_categorical=strings_to_categorical,
                 dataset_kwargs=dataset_kwargs,
                 categorical_partitions=False,
                 **kwargs,
@@ -142,7 +131,6 @@ def _read_paths(
                         row_groups=row_groups[i]
                         if row_groups
                         else None,
-                        strings_to_categorical=strings_to_categorical,
                         dataset_kwargs=dataset_kwargs,
                         categorical_partitions=False,
                         **kwargs,
@@ -245,7 +233,6 @@ def read_partition(
             pieces = [pieces]

         # Extract supported kwargs from `kwargs`
-        strings_to_cats = kwargs.get("strings_to_categorical", False)
         read_kwargs = kwargs.get("read", {})
         read_kwargs.update(open_file_options or {})
         check_file_size = read_kwargs.pop("check_file_size", None)
@@ -291,7 +278,6 @@ def read_partition(
                         columns=read_columns,
                         row_groups=rgs if rgs else None,
                         filters=filters,
-                        strings_to_categorical=strings_to_cats,
                         partitions=partitions,
                         partitioning=partitioning,
                         partition_keys=last_partition_keys,
@@ -318,7 +304,6 @@ def read_partition(
                         columns=read_columns,
                         row_groups=rgs if rgs else None,
                         filters=filters,
-                        strings_to_categorical=strings_to_cats,
                         partitions=partitions,
                         partitioning=partitioning,
                         partition_keys=last_partition_keys,
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index 7b9fac665c6..85ec36cf2c5 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -159,12 +159,6 @@ def test_strings(tmpdir):
     read_df = dask_cudf.read_parquet(fn, index=["a"])
     dd.assert_eq(ddf2, read_df.compute().to_pandas())

-    read_df_cats = dask_cudf.read_parquet(
-        fn, index=["a"], strings_to_categorical=True
-    )
-    dd.assert_eq(read_df_cats.dtypes, read_df_cats.compute().dtypes)
-    dd.assert_eq(read_df_cats.dtypes[0], "int32")
-


 def test_dask_timeseries_from_pandas(tmpdir):
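
Migration note: downstream code that passed `strings_to_categorical=True` can
convert string columns explicitly after reading instead. A minimal sketch of
that pattern follows; the file name `data.parquet` and column `b` are
hypothetical, and this is a behavioral replacement rather than an exact one,
since the removed option returned int32 hash values, not a true categorical:

```python
import cudf

# String columns now always read back with string dtype; the reader
# no longer accepts a `strings_to_categorical` argument.
gdf = cudf.read_parquet("data.parquet")

# Cast explicitly if a dictionary-encoded (categorical) column is desired.
gdf["b"] = gdf["b"].astype("category")
```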
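The same explicit cast works through `dask_cudf.read_parquet`, which no longer
forwards the option to the cudf reader; a sketch under the same hypothetical
file and column names:

```python
import dask_cudf

ddf = dask_cudf.read_parquet("data.parquet")

# The cast is applied lazily, one partition at a time; the categories
# remain "unknown" to Dask until computed (or made known via categorize()).
ddf["b"] = ddf["b"].astype("category")
```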