From f785ed3ddebf8b225b9d7c07aab9d5f32eb39b05 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jan 2024 19:22:50 -1000 Subject: [PATCH] Use instance over is_foo_dtype (#14641) Similar to https://github.com/rapidsai/cudf/pull/14638, use isinstance when we know we are checking a dtype instance Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14641 --- python/cudf/cudf/_lib/column.pyx | 14 +++--- python/cudf/cudf/_lib/groupby.pyx | 74 +++++++++++++++++++----------- python/cudf/cudf/_lib/interop.pyx | 10 ++-- python/cudf/cudf/_lib/io/utils.pyx | 6 +-- python/cudf/cudf/_lib/json.pyx | 21 ++++----- python/cudf/cudf/_lib/orc.pyx | 7 ++- python/cudf/cudf/_lib/parquet.pyx | 21 +++------ python/cudf/cudf/_lib/scalar.pyx | 13 ++---- python/cudf/cudf/_lib/types.pyx | 34 +++++++------- python/cudf/cudf/_lib/utils.pyx | 24 ++++------ 10 files changed, 111 insertions(+), 113 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index acd0ba519dd..45aa1081b8d 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -5,13 +5,13 @@ from typing import Literal import cupy as cp import numpy as np +import pandas as pd import rmm import cudf import cudf._lib as libcudf from cudf._lib import pylibcudf -from cudf.api.types import is_categorical_dtype, is_datetime64tz_dtype from cudf.core.buffer import ( Buffer, ExposureTrackedBuffer, @@ -344,10 +344,10 @@ cdef class Column: ) cdef mutable_column_view mutable_view(self) except *: - if is_categorical_dtype(self.dtype): + if isinstance(self.dtype, cudf.CategoricalDtype): col = self.base_children[0] data_dtype = col.dtype - elif is_datetime64tz_dtype(self.dtype): + elif isinstance(self.dtype, pd.DatetimeTZDtype): col = self data_dtype = _get_base_dtype(col.dtype) else: @@ -407,10 +407,10 @@ cdef class Column: return self._view(c_null_count) cdef column_view _view(self, libcudf_types.size_type null_count) except *: - if is_categorical_dtype(self.dtype): + if isinstance(self.dtype, cudf.CategoricalDtype): col = self.base_children[0] data_dtype = col.dtype - elif is_datetime64tz_dtype(self.dtype): + elif isinstance(self.dtype, pd.DatetimeTZDtype): col = self data_dtype = _get_base_dtype(col.dtype) else: @@ -482,7 +482,7 @@ cdef class Column: # categoricals because cudf supports ordered and unordered categoricals # while libcudf supports only unordered categoricals (see # https://github.com/rapidsai/cudf/pull/8567). - if is_categorical_dtype(self.dtype): + if isinstance(self.dtype, cudf.CategoricalDtype): col = self.base_children[0] else: col = self @@ -648,7 +648,7 @@ cdef class Column: """ column_owner = isinstance(owner, Column) mask_owner = owner - if column_owner and is_categorical_dtype(owner.dtype): + if column_owner and isinstance(owner.dtype, cudf.CategoricalDtype): owner = owner.base_children[0] size = cv.size() diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index f332fead8d1..8848649736b 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,16 +1,17 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from functools import singledispatch from pandas.core.groupby.groupby import DataError -from cudf.api.types import ( - is_categorical_dtype, - is_decimal_dtype, - is_interval_dtype, - is_list_dtype, - is_string_dtype, - is_struct_dtype, -) +from cudf.api.types import is_string_dtype from cudf.core.buffer import acquire_spill_lock +from cudf.core.dtypes import ( + CategoricalDtype, + DecimalDtype, + IntervalDtype, + ListDtype, + StructDtype, +) from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -73,6 +74,43 @@ _DECIMAL_AGGS = { ctypedef const scalar constscalar +@singledispatch +def get_valid_aggregation(dtype): + if is_string_dtype(dtype): + return _STRING_AGGS + return "ALL" + + +@get_valid_aggregation.register +def _(dtype: ListDtype): + return _LIST_AGGS + + +@get_valid_aggregation.register +def _(dtype: CategoricalDtype): + return _CATEGORICAL_AGGS + + +@get_valid_aggregation.register +def _(dtype: ListDtype): + return _LIST_AGGS + + +@get_valid_aggregation.register +def _(dtype: StructDtype): + return _STRUCT_AGGS + + +@get_valid_aggregation.register +def _(dtype: IntervalDtype): + return _INTERVAL_AGGS + + +@get_valid_aggregation.register +def _(dtype: DecimalDtype): + return _DECIMAL_AGGS + + cdef _agg_result_from_columns( vector[libcudf_groupby.aggregation_result]& c_result_columns, set column_included, @@ -187,15 +225,7 @@ cdef class GroupBy: for i, (col, aggs) in enumerate(zip(values, aggregations)): dtype = col.dtype - valid_aggregations = ( - _LIST_AGGS if is_list_dtype(dtype) - else _STRING_AGGS if is_string_dtype(dtype) - else _CATEGORICAL_AGGS if is_categorical_dtype(dtype) - else _STRUCT_AGGS if is_struct_dtype(dtype) - else _INTERVAL_AGGS if is_interval_dtype(dtype) - else _DECIMAL_AGGS if is_decimal_dtype(dtype) - else "ALL" - ) + valid_aggregations = get_valid_aggregation(dtype) included_aggregations_i = [] c_agg_request = move(libcudf_groupby.aggregation_request()) @@ -258,15 +288,7 @@ cdef class GroupBy: for i, (col, aggs) in enumerate(zip(values, aggregations)): dtype = col.dtype - valid_aggregations = ( - _LIST_AGGS if is_list_dtype(dtype) - else _STRING_AGGS if is_string_dtype(dtype) - else _CATEGORICAL_AGGS if is_categorical_dtype(dtype) - else _STRUCT_AGGS if is_struct_dtype(dtype) - else _INTERVAL_AGGS if is_interval_dtype(dtype) - else _DECIMAL_AGGS if is_decimal_dtype(dtype) - else "ALL" - ) + valid_aggregations = get_valid_aggregation(dtype) included_aggregations_i = [] c_agg_request = move(libcudf_groupby.scan_request()) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 8fd2a409d90..13c8ce43ea3 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cpython cimport pycapsule from libcpp.memory cimport shared_ptr, unique_ptr @@ -18,8 +18,8 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns -from cudf.api.types import is_list_dtype, is_struct_dtype from cudf.core.buffer import acquire_spill_lock +from cudf.core.dtypes import ListDtype, StructDtype def from_dlpack(dlpack_capsule): @@ -98,7 +98,7 @@ cdef vector[column_metadata] gather_metadata(object cols_dtypes) except *: if cols_dtypes is not None: for idx, (col_name, col_dtype) in enumerate(cols_dtypes): cpp_metadata.push_back(column_metadata(col_name.encode())) - if is_struct_dtype(col_dtype) or is_list_dtype(col_dtype): + if isinstance(col_dtype, (ListDtype, StructDtype)): _set_col_children_metadata(col_dtype, cpp_metadata[idx]) else: raise TypeError( @@ -113,14 +113,14 @@ cdef _set_col_children_metadata(dtype, cdef column_metadata element_metadata - if is_struct_dtype(dtype): + if isinstance(dtype, StructDtype): for name, value in dtype.fields.items(): element_metadata = column_metadata(name.encode()) _set_col_children_metadata( value, element_metadata ) col_meta.children_meta.push_back(element_metadata) - elif is_list_dtype(dtype): + elif isinstance(dtype, ListDtype): col_meta.children_meta.reserve(2) # Offsets - child 0 col_meta.children_meta.push_back(column_metadata()) diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 9b027a4d275..ae978d18813 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cpython.buffer cimport PyBUF_READ from cpython.memoryview cimport PyMemoryView_FromMemory @@ -23,7 +23,7 @@ import errno import io import os -from cudf.api.types import is_struct_dtype +from cudf.core.dtypes import StructDtype # Converts the Python source input to libcudf IO source_info @@ -172,7 +172,7 @@ cdef Column update_column_struct_field_names( ) col.set_base_children(tuple(children)) - if is_struct_dtype(col): + if isinstance(col.dtype, StructDtype): field_names.reserve(len(col.base_children)) for i in range(info.children.size()): field_names.push_back(info.children[i].name) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 437c3ef6ec4..c361a3f00c4 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # cython: boundscheck = False @@ -17,6 +17,7 @@ from libcpp.utility cimport move from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types +from cudf._lib.column cimport Column from cudf._lib.cpp.io.data_sink cimport data_sink from cudf._lib.cpp.io.json cimport ( json_reader_options, @@ -42,10 +43,6 @@ from cudf._lib.io.utils cimport ( from cudf._lib.types cimport dtype_to_data_type from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table -from cudf.api.types import is_list_dtype, is_struct_dtype - -from cudf._lib.column cimport Column - cpdef read_json(object filepaths_or_buffers, object dtype, @@ -214,13 +211,12 @@ def write_json( cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *: cdef schema_element s_element cdef data_type lib_type - if cudf.api.types.is_categorical_dtype(dtype): + dtype = cudf.dtype(dtype) + if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "CategoricalDtype as dtype is not yet " "supported in JSON reader" ) - - dtype = cudf.dtype(dtype) lib_type = dtype_to_data_type(dtype) s_element.type = lib_type if isinstance(dtype, cudf.StructDtype): @@ -237,19 +233,18 @@ cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *: cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: - if cudf.api.types.is_categorical_dtype(dtype): + dtype = cudf.dtype(dtype) + if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "CategoricalDtype as dtype is not yet " "supported in JSON reader" ) - - dtype = cudf.dtype(dtype) return dtype_to_data_type(dtype) cdef _set_col_children_metadata(Column col, column_name_info& col_meta): cdef column_name_info child_info - if is_struct_dtype(col): + if isinstance(col.dtype, cudf.StructDtype): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): @@ -258,7 +253,7 @@ cdef _set_col_children_metadata(Column col, _set_col_children_metadata( child_col, col_meta.children[i] ) - elif is_list_dtype(col): + elif isinstance(col.dtype, cudf.ListDtype): for i, child_col in enumerate(col.children): col_meta.children.push_back(child_info) _set_col_children_metadata( diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 0ae039b14d2..c64296eb7da 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import cudf from cudf.core.buffer import acquire_spill_lock @@ -59,7 +59,6 @@ from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table from pyarrow.lib import NativeFile from cudf._lib.utils import _index_level_name, generate_pandas_metadata -from cudf.api.types import is_list_dtype, is_struct_dtype cpdef read_raw_orc_statistics(filepath_or_buffer): @@ -474,7 +473,7 @@ cdef class ORCWriter: cdef _set_col_children_metadata(Column col, column_in_metadata& col_meta, list_column_as_map=False): - if is_struct_dtype(col): + if isinstance(col.dtype, cudf.StructDtype): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): @@ -482,7 +481,7 @@ cdef _set_col_children_metadata(Column col, _set_col_children_metadata( child_col, col_meta.child(i), list_column_as_map ) - elif is_list_dtype(col): + elif isinstance(col.dtype, cudf.ListDtype): if list_column_as_map: col_meta.set_list_column_as_map() _set_col_children_metadata( diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 4acb1ce10b1..27efc5e1ecd 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # cython: boundscheck = False @@ -18,12 +18,7 @@ import numpy as np from cython.operator cimport dereference -from cudf.api.types import ( - is_decimal_dtype, - is_list_dtype, - is_list_like, - is_struct_dtype, -) +from cudf.api.types import is_list_like from cudf._lib.utils cimport data_from_unique_ptr @@ -220,7 +215,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, # update the decimal precision of each column for col in names: - if is_decimal_dtype(df._data[col].dtype): + if isinstance(df._data[col].dtype, cudf.core.dtypes.DecimalDtype): df._data[col].dtype.precision = ( meta_data_per_column[col]["metadata"]["precision"] ) @@ -703,7 +698,7 @@ cdef _set_col_metadata( # is true. col_meta.set_nullability(True) - if is_struct_dtype(col): + if isinstance(col.dtype, cudf.StructDtype): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): @@ -713,13 +708,11 @@ cdef _set_col_metadata( col_meta.child(i), force_nullable_schema ) - elif is_list_dtype(col): + elif isinstance(col.dtype, cudf.ListDtype): _set_col_metadata( col.children[1], col_meta.child(1), force_nullable_schema ) - else: - if is_decimal_dtype(col): - col_meta.set_decimal_precision(col.dtype.precision) - return + elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype): + col_meta.set_decimal_precision(col.dtype.precision) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 27fb9e994f0..37708a4e3ba 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import copy @@ -14,12 +14,7 @@ from libcpp.utility cimport move import cudf from cudf._lib import pylibcudf from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf.core.dtypes import ( - ListDtype, - StructDtype, - is_list_dtype, - is_struct_dtype, -) +from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT cimport cudf._lib.cpp.types as libcudf_types @@ -79,9 +74,9 @@ def gather_metadata(dtypes): out = [] for name, dtype in dtypes.items(): v = pylibcudf.interop.ColumnMetadata(name) - if is_struct_dtype(dtype): + if isinstance(dtype, cudf.StructDtype): v.children_meta = gather_metadata(dtype.fields) - elif is_list_dtype(dtype): + elif isinstance(dtype, cudf.ListDtype): # Offsets column is unnamed and has no children v.children_meta.append(pylibcudf.interop.ColumnMetadata("")) v.children_meta.extend( diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index d87104bf168..1b4f4617e97 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from enum import IntEnum @@ -238,15 +238,15 @@ cdef dtype_from_column_view(column_view cv): cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: cdef libcudf_types.type_id tid - if cudf.api.types.is_list_dtype(dtype): + if isinstance(dtype, cudf.ListDtype): tid = libcudf_types.type_id.LIST - elif cudf.api.types.is_struct_dtype(dtype): + elif isinstance(dtype, cudf.StructDtype): tid = libcudf_types.type_id.STRUCT - elif cudf.api.types.is_decimal128_dtype(dtype): + elif isinstance(dtype, cudf.Decimal128Dtype): tid = libcudf_types.type_id.DECIMAL128 - elif cudf.api.types.is_decimal64_dtype(dtype): + elif isinstance(dtype, cudf.Decimal64Dtype): tid = libcudf_types.type_id.DECIMAL64 - elif cudf.api.types.is_decimal32_dtype(dtype): + elif isinstance(dtype, cudf.Decimal32Dtype): tid = libcudf_types.type_id.DECIMAL32 else: tid = ( @@ -259,21 +259,21 @@ cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: return libcudf_types.data_type(tid) cpdef dtype_to_pylibcudf_type(dtype): - if cudf.api.types.is_list_dtype(dtype): + if isinstance(dtype, cudf.ListDtype): return pylibcudf.DataType(pylibcudf.TypeId.LIST) - elif cudf.api.types.is_struct_dtype(dtype): + elif isinstance(dtype, cudf.StructDtype): return pylibcudf.DataType(pylibcudf.TypeId.STRUCT) - elif cudf.api.types.is_decimal_dtype(dtype): - if cudf.api.types.is_decimal128_dtype(dtype): - tid = pylibcudf.TypeId.DECIMAL128 - elif cudf.api.types.is_decimal64_dtype(dtype): - tid = pylibcudf.TypeId.DECIMAL64 - else: - tid = pylibcudf.TypeId.DECIMAL32 + elif isinstance(dtype, cudf.Decimal128Dtype): + tid = pylibcudf.TypeId.DECIMAL128 + return pylibcudf.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal64Dtype): + tid = pylibcudf.TypeId.DECIMAL64 + return pylibcudf.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal32Dtype): + tid = pylibcudf.TypeId.DECIMAL32 return pylibcudf.DataType(tid, -dtype.scale) - # libcudf types don't support localization so convert to the base type - if isinstance(dtype, pd.DatetimeTZDtype): + elif isinstance(dtype, pd.DatetimeTZDtype): dtype = np.dtype(f"