Enforce deprecations in 23.10 (rapidsai#13732)
This PR enforces deprecations made up to and including `23.08` by removing the deprecated code paths in `23.10`. Specifically, it removes support for the `strings_to_categorical` parameter in `read_parquet` (in both cudf and dask_cudf).
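For anyone migrating, here is a minimal sketch of the replacement pattern. The file path and column name are hypothetical, and `astype("category")` is the closest modern equivalent rather than a drop-in replacement: the removed option produced int32 hash-based codes (legacy GDF_CATEGORY), not a true categorical column.

    import cudf

    # Before (deprecated in 23.08, removed in 23.10):
    #   gdf = cudf.read_parquet("data.parquet", strings_to_categorical=True)

    # After: read string columns as-is, then convert explicitly if needed.
    gdf = cudf.read_parquet("data.parquet")
    gdf["b"] = gdf["b"].astype("category")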

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Bradley Dice (https://github.com/bdice)

URL: rapidsai#13732
galipremsagar authored Jul 24, 2023
1 parent 0edea00 commit 2a590db
Showing 7 changed files with 4 additions and 71 deletions.
5 changes: 0 additions & 5 deletions python/cudf/cudf/_lib/cpp/io/parquet.pxd
@@ -19,14 +19,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
cudf_io_types.source_info get_source_info() except +
vector[vector[size_type]] get_row_groups() except +
data_type get_timestamp_type() except +
-bool is_enabled_convert_strings_to_categories() except +
bool is_enabled_use_pandas_metadata() except +

# setter

void set_columns(vector[string] col_names) except +
void set_row_groups(vector[vector[size_type]] row_grp) except +
-void enable_convert_strings_to_categories(bool val) except +
void enable_use_pandas_metadata(bool val) except +
void set_timestamp_type(data_type type) except +

@@ -46,9 +44,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
parquet_reader_options_builder& row_groups(
vector[vector[size_type]] row_grp
) except +
-parquet_reader_options_builder& convert_strings_to_categories(
-bool val
-) except +
parquet_reader_options_builder& use_pandas_metadata(
bool val
) except +
3 changes: 0 additions & 3 deletions python/cudf/cudf/_lib/parquet.pyx
@@ -120,7 +120,6 @@ def _parse_metadata(meta):


cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
-strings_to_categorical=False,
use_pandas_metadata=True):
"""
Cython function to call into libcudf API, see `read_parquet`.
@@ -144,7 +143,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
cdef cudf_io_types.source_info source = make_source_info(
filepaths_or_buffers)

-cdef bool cpp_strings_to_categorical = strings_to_categorical
cdef bool cpp_use_pandas_metadata = use_pandas_metadata

cdef vector[vector[size_type]] cpp_row_groups
@@ -160,7 +158,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
args = move(
parquet_reader_options.builder(source)
.row_groups(cpp_row_groups)
-.convert_strings_to_categories(cpp_strings_to_categorical)
.use_pandas_metadata(cpp_use_pandas_metadata)
.timestamp_type(cpp_timestamp_type)
.build()
10 changes: 0 additions & 10 deletions python/cudf/cudf/io/parquet.py
@@ -437,7 +437,6 @@ def read_parquet(
storage_options=None,
filters=None,
row_groups=None,
-strings_to_categorical=False,
use_pandas_metadata=True,
use_python_file_object=True,
categorical_partitions=True,
@@ -449,12 +448,6 @@
):
"""{docstring}"""

-if strings_to_categorical is not False:
-warnings.warn(
-"`strings_to_categorical` is deprecated and will be removed in "
-"a future version of cudf.",
-FutureWarning,
-)
# Do not allow the user to set file-opening options
# when `use_python_file_object=False` is specified
if use_python_file_object is False:
@@ -578,7 +571,6 @@ def read_parquet(
*args,
columns=columns,
row_groups=row_groups,
-strings_to_categorical=strings_to_categorical,
use_pandas_metadata=use_pandas_metadata,
partition_keys=partition_keys,
partition_categories=partition_categories,
@@ -809,7 +801,6 @@ def _read_parquet(
engine,
columns=None,
row_groups=None,
-strings_to_categorical=None,
use_pandas_metadata=None,
*args,
**kwargs,
@@ -831,7 +822,6 @@
filepaths_or_buffers,
columns=columns,
row_groups=row_groups,
-strings_to_categorical=strings_to_categorical,
use_pandas_metadata=use_pandas_metadata,
)
else:
28 changes: 4 additions & 24 deletions python/cudf/cudf/tests/test_parquet.py
@@ -31,7 +31,6 @@
TIMEDELTA_TYPES,
assert_eq,
assert_exceptions_equal,
-expect_warning_if,
set_random_null_mask_inplace,
)

@@ -298,8 +297,7 @@ def test_parquet_reader_empty_pandas_dataframe(tmpdir, engine):


@pytest.mark.parametrize("has_null", [False, True])
@pytest.mark.parametrize("strings_to_categorical", [False, True, None])
def test_parquet_reader_strings(tmpdir, strings_to_categorical, has_null):
def test_parquet_reader_strings(tmpdir, has_null):
df = pd.DataFrame(
[(1, "aaa", 9.0), (2, "bbb", 8.0), (3, "ccc", 7.0)],
columns=pd.Index(list("abc")),
@@ -310,28 +308,10 @@ def test_parquet_reader_strings(tmpdir, strings_to_categorical, has_null):
df.to_parquet(fname)
assert os.path.exists(fname)

-if strings_to_categorical is not None:
-with expect_warning_if(strings_to_categorical is not False):
-gdf = cudf.read_parquet(
-fname,
-engine="cudf",
-strings_to_categorical=strings_to_categorical,
-)
-else:
-gdf = cudf.read_parquet(fname, engine="cudf")
+gdf = cudf.read_parquet(fname, engine="cudf")

-if strings_to_categorical:
-if has_null:
-hash_ref = [989983842, None, 1169108191]
-else:
-hash_ref = [989983842, 429364346, 1169108191]
-assert gdf["b"].dtype == np.dtype("int32")
-assert_eq(
-gdf["b"], cudf.Series(hash_ref, dtype=np.dtype("int32"), name="b")
-)
-else:
-assert gdf["b"].dtype == np.dtype("object")
-assert_eq(gdf["b"], df["b"])
+assert gdf["b"].dtype == np.dtype("object")
+assert_eq(gdf["b"], df["b"])


@pytest.mark.parametrize("columns", [None, ["b"]])
8 changes: 0 additions & 8 deletions python/cudf/cudf/utils/ioutils.py
@@ -166,14 +166,6 @@
If not None, specifies, for each input file, which row groups to read.
If reading multiple inputs, a list of lists should be passed, one list
for each input.
-strings_to_categorical : boolean, default False
-If True, return string columns as GDF_CATEGORY dtype; if False, return a
-as GDF_STRING dtype.
-.. deprecated:: 23.08
-This parameter is deprecated and will be removed in a future
-version of cudf.
categorical_partitions : boolean, default True
Whether directory-partitioned columns should be interpreted as categorical
or raw dtypes.
15 changes: 0 additions & 15 deletions python/dask_cudf/dask_cudf/io/parquet.py
@@ -50,15 +50,6 @@ def _create_dd_meta(cls, dataset_info, **kwargs):
kwargs.get("schema", None),
)

-# If `strings_to_categorical==True`, convert objects to int32
-strings_to_cats = kwargs.get("strings_to_categorical", False)
-for col in meta_cudf._data.names:
-if (
-isinstance(meta_cudf._data[col], cudf.core.column.StringColumn)
-and strings_to_cats
-):
-meta_cudf._data[col] = meta_cudf._data[col].astype("int32")

return meta_cudf

@classmethod
@@ -75,7 +66,6 @@ def _read_paths(
columns=None,
row_groups=None,
filters=None,
-strings_to_categorical=None,
partitions=None,
partitioning=None,
partition_keys=None,
@@ -124,7 +114,6 @@
engine="cudf",
columns=columns,
row_groups=row_groups if row_groups else None,
-strings_to_categorical=strings_to_categorical,
dataset_kwargs=dataset_kwargs,
categorical_partitions=False,
**kwargs,
@@ -142,7 +131,6 @@
row_groups=row_groups[i]
if row_groups
else None,
-strings_to_categorical=strings_to_categorical,
dataset_kwargs=dataset_kwargs,
categorical_partitions=False,
**kwargs,
@@ -245,7 +233,6 @@ def read_partition(
pieces = [pieces]

# Extract supported kwargs from `kwargs`
-strings_to_cats = kwargs.get("strings_to_categorical", False)
read_kwargs = kwargs.get("read", {})
read_kwargs.update(open_file_options or {})
check_file_size = read_kwargs.pop("check_file_size", None)
@@ -291,7 +278,6 @@
columns=read_columns,
row_groups=rgs if rgs else None,
filters=filters,
-strings_to_categorical=strings_to_cats,
partitions=partitions,
partitioning=partitioning,
partition_keys=last_partition_keys,
@@ -318,7 +304,6 @@
columns=read_columns,
row_groups=rgs if rgs else None,
filters=filters,
-strings_to_categorical=strings_to_cats,
partitions=partitions,
partitioning=partitioning,
partition_keys=last_partition_keys,
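The same keyword disappears from the dask_cudf reader. A minimal sketch of the equivalent post-read conversion, again with a hypothetical path and column name; the `astype` call is lazy and runs per partition under dask's usual semantics:

    import dask_cudf

    # Before (removed in 23.10):
    #   ddf = dask_cudf.read_parquet("data.parquet", strings_to_categorical=True)

    # After: read strings as-is and convert explicitly.
    ddf = dask_cudf.read_parquet("data.parquet")
    ddf["b"] = ddf["b"].astype("category")  # lazy; evaluated at compute()
    gdf = ddf.compute()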
6 changes: 0 additions & 6 deletions python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -159,12 +159,6 @@ def test_strings(tmpdir):
read_df = dask_cudf.read_parquet(fn, index=["a"])
dd.assert_eq(ddf2, read_df.compute().to_pandas())

-read_df_cats = dask_cudf.read_parquet(
-fn, index=["a"], strings_to_categorical=True
-)
-dd.assert_eq(read_df_cats.dtypes, read_df_cats.compute().dtypes)
-dd.assert_eq(read_df_cats.dtypes[0], "int32")


def test_dask_timeseries_from_pandas(tmpdir):

